#!/usr/bin/env python3
"""Rebuild a file from the chunk list stored in a caibx index.

Chunk data can come from a local file (``FileChunkReader``) or from a store of
lzma-compressed ``.cacnk`` files (``RemoteChunkReader``).  Every chunk is
checked for length and SHA512/256 digest before it is written to the output.
"""
import io
import lzma
import os
import struct
import sys
import time
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from collections.abc import Callable

import requests
from Crypto.Hash import SHA512

CA_FORMAT_INDEX = 0x96824d9c7b129ff9
CA_FORMAT_TABLE = 0xe75b9e112f17417d
CA_FORMAT_TABLE_TAIL_MARKER = 0xe75b9e112f17417
FLAGS = 0xb000000000000000

CA_HEADER_LEN = 48
CA_TABLE_HEADER_LEN = 16
CA_TABLE_ENTRY_LEN = 40
CA_TABLE_MIN_LEN = CA_TABLE_HEADER_LEN + CA_TABLE_ENTRY_LEN

CHUNK_DOWNLOAD_TIMEOUT = 60
CHUNK_DOWNLOAD_RETRIES = 3

CAIBX_DOWNLOAD_TIMEOUT = 120

# sha: digest bytes identifying the chunk; offset/length: byte range it covers.
Chunk = namedtuple('Chunk', ['sha', 'offset', 'length'])
ChunkDict = dict[bytes, Chunk]


class ChunkReader(ABC):
    """Interface for anything that can return the raw bytes of a chunk."""

    @abstractmethod
    def read(self, chunk: Chunk) -> bytes:
        ...


class FileChunkReader(ChunkReader):
    """Reads chunks from a local file"""

    def __init__(self, fn: str) -> None:
        super().__init__()
        self.f = open(fn, 'rb')

    def close(self) -> None:
        """Close the underlying file. Safe to call more than once."""
        # `f` is missing when open() raised inside __init__; __del__ still runs
        # on such a half-built instance, so look the attribute up defensively.
        f = getattr(self, 'f', None)
        if f is not None:
            f.close()

    def __del__(self):
        self.close()

    def read(self, chunk: Chunk) -> bytes:
        """Return ``chunk.length`` bytes starting at ``chunk.offset``."""
        self.f.seek(chunk.offset)
        return self.f.read(chunk.length)


class RemoteChunkReader(ChunkReader):
    """Reads lzma compressed chunks from a remote store"""

    def __init__(self, url: str) -> None:
        super().__init__()
        self.url = url
        self.session = requests.Session()

    def read(self, chunk: Chunk) -> bytes:
        """Fetch the chunk named by ``chunk.sha`` and return it decompressed.

        The chunk is looked up at ``<url>/<first 4 hex chars>/<hex sha>.cacnk``.
        If that path is a local file it is read from disk, otherwise it is
        downloaded with up to CHUNK_DOWNLOAD_RETRIES attempts.

        Raises whatever the last download attempt raised, or an HTTP error
        from ``raise_for_status`` (HTTP error statuses are not retried).
        """
        sha_hex = chunk.sha.hex()
        url = os.path.join(self.url, sha_hex[:4], sha_hex + ".cacnk")

        if os.path.isfile(url):
            with open(url, 'rb') as f:
                contents = f.read()
        else:
            for i in range(CHUNK_DOWNLOAD_RETRIES):
                try:
                    resp = self.session.get(url, timeout=CHUNK_DOWNLOAD_TIMEOUT)
                    break
                except Exception:
                    # Re-raise on the final attempt, otherwise wait and retry.
                    if i == CHUNK_DOWNLOAD_RETRIES - 1:
                        raise
                    time.sleep(CHUNK_DOWNLOAD_TIMEOUT)

            resp.raise_for_status()
            contents = resp.content

        decompressor = lzma.LZMADecompressor(format=lzma.FORMAT_AUTO)
        return decompressor.decompress(contents)


def parse_caibx(caibx_path: str) -> list[Chunk]:
    """Parse the chunk table of a caibx file.

    ``caibx_path`` may be a local path or a URL; anything that is not an
    existing local file is downloaded with ``requests``.

    Returns the chunks in file order, each with its hash, offset and length.
    """
    caibx: io.BufferedIOBase
    if os.path.isfile(caibx_path):
        caibx = open(caibx_path, 'rb')
    else:
        resp = requests.get(caibx_path, timeout=CAIBX_DOWNLOAD_TIMEOUT)
        resp.raise_for_status()
        caibx = io.BytesIO(resp.content)

    # `with` guarantees the handle is closed even when parsing fails part-way.
    with caibx:
        caibx.seek(0, os.SEEK_END)
        caibx_len = caibx.tell()
        caibx.seek(0, os.SEEK_SET)

        # NOTE(review): the statements from the header unpack down to the
        # min_size check were damaged in the copy of this file that was
        # reviewed.  They are reconstructed from the CA_* constants above
        # (48-byte header of six u64, 16-byte table header, 40-byte entries of
        # u64 end-offset + 32-byte digest) -- confirm against the canonical
        # source.  `flags` is intentionally not validated here.

        # Parse header
        length, magic, flags, min_size, _, max_size = struct.unpack("<QQQQQQ", caibx.read(CA_HEADER_LEN))
        assert length == CA_HEADER_LEN
        assert magic == CA_FORMAT_INDEX

        # Parse table header
        length, magic = struct.unpack("<QQ", caibx.read(CA_TABLE_HEADER_LEN))
        assert magic == CA_FORMAT_TABLE

        # Parse chunks
        num_chunks = (caibx_len - CA_HEADER_LEN - CA_TABLE_MIN_LEN) // CA_TABLE_ENTRY_LEN
        chunks = []

        offset = 0
        for i in range(num_chunks):
            # Each entry stores the END offset of its chunk.
            new_offset = struct.unpack("<Q", caibx.read(8))[0]

            sha = caibx.read(32)
            length = new_offset - offset

            assert length <= max_size

            # Last chunk can be smaller
            if i < num_chunks - 1:
                assert length >= min_size

            chunks.append(Chunk(sha, offset, length))
            offset = new_offset

    return chunks


def build_chunk_dict(chunks: list[Chunk]) -> ChunkDict:
    """Turn a list of chunks into a dict for faster lookups based on hash.
    Keep first chunk since it's more likely to be already downloaded."""
    r: ChunkDict = {}
    for c in chunks:
        r.setdefault(c.sha, c)
    return r


def extract(target: list[Chunk],
            sources: list[tuple[str, ChunkReader, ChunkDict]],
            out_path: str,
            progress: Callable[[int], None] | None = None) -> dict[str, int]:
    """Write every chunk of ``target`` to ``out_path`` using ``sources``.

    ``sources`` is a list of ``(name, reader, chunks)`` tuples tried in order
    for each chunk.  A source is skipped for a chunk when it does not know the
    hash, or when the bytes it returns have the wrong length or digest.

    ``progress``, if given, is called after each written chunk with the total
    number of bytes written so far.

    Returns a mapping of source name to the number of bytes taken from it.
    Raises RuntimeError when no source yields a valid copy of some chunk.
    """
    stats: dict[str, int] = defaultdict(int)
    written = 0

    # Open an existing output without truncating it; create it otherwise.
    mode = 'rb+' if os.path.exists(out_path) else 'wb'
    with open(out_path, mode) as out:
        for cur_chunk in target:

            # Find source for desired chunk
            for name, chunk_reader, store_chunks in sources:
                if cur_chunk.sha in store_chunks:
                    bts = chunk_reader.read(store_chunks[cur_chunk.sha])

                    # Check length
                    if len(bts) != cur_chunk.length:
                        continue

                    # Check hash
                    if SHA512.new(bts, truncate="256").digest() != cur_chunk.sha:
                        continue

                    # Write to output
                    out.seek(cur_chunk.offset)
                    out.write(bts)

                    stats[name] += cur_chunk.length
                    written += cur_chunk.length

                    if progress is not None:
                        progress(written)

                    break
            else:
                raise RuntimeError("Desired chunk not found in provided stores")

    return stats


def print_stats(stats: dict[str, int]):
    """Print the total extracted size and the share taken from each source."""
    total_bytes = sum(stats.values())
    print(f"Total size: {total_bytes / 1024 / 1024:.2f} MB")
    for name, total in stats.items():
        # Guard against a zero total (only zero-length chunks were written).
        pct = total / total_bytes * 100 if total_bytes else 0.0
        print(f" {name}: {total / 1024 / 1024:.2f} MB ({pct:.1f}%)")


def extract_simple(caibx_path, out_path, store_path):
    """Extract ``caibx_path`` to ``out_path`` using the file ``store_path``.

    ``store_path`` is read with the offsets of the target index itself.
    Returns the stats mapping produced by ``extract``.
    """
    target = parse_caibx(caibx_path)
    reader = FileChunkReader(store_path)
    try:
        # (name, callback, chunks)
        sources = [
            # (store_path, RemoteChunkReader(store_path), build_chunk_dict(target)),
            (store_path, reader, build_chunk_dict(target)),
        ]
        return extract(target, sources, out_path)
    finally:
        # Release the store file deterministically instead of relying on __del__.
        reader.close()


def main() -> None:
    """Command-line entry point: ``<caibx> <out> <store>``."""
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <caibx> <out> <store>")

    caibx = sys.argv[1]
    out = sys.argv[2]
    store = sys.argv[3]

    stats = extract_simple(caibx, out, store)
    print_stats(stats)


if __name__ == "__main__":
    main()