casync: manifest compare script (#25129)
	
		
	
				
					
				
			* casync compare script * typo * cleanup outputpull/25136/head
							parent
							
								
									f0b5ff5c1a
								
							
						
					
					
						commit
						205f6f7414
					
				
				 1 changed files with 74 additions and 0 deletions
			
			
		| @ -0,0 +1,74 @@ | ||||
| #!/usr/bin/env python3 | ||||
| import argparse | ||||
| import collections | ||||
| import multiprocessing | ||||
| import os | ||||
| from typing import Dict, List | ||||
| 
 | ||||
| import requests | ||||
| from tqdm import tqdm | ||||
| 
 | ||||
| import system.hardware.tici.casync as casync | ||||
| 
 | ||||
| 
 | ||||
| def get_chunk_download_size(chunk): | ||||
|   sha = chunk.sha.hex() | ||||
|   path = os.path.join(remote_url, sha[:4], sha + ".cacnk") | ||||
|   if os.path.isfile(path): | ||||
|     return os.path.getsize(path) | ||||
|   else: | ||||
|     r = requests.head(path) | ||||
|     r.raise_for_status() | ||||
|     return int(r.headers['content-length']) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
| 
 | ||||
|   parser = argparse.ArgumentParser(description='Compute overlap between two casync manifests') | ||||
|   parser.add_argument('frm') | ||||
|   parser.add_argument('to') | ||||
|   args = parser.parse_args() | ||||
| 
 | ||||
|   frm = casync.parse_caibx(args.frm) | ||||
|   to = casync.parse_caibx(args.to) | ||||
|   remote_url = args.to.replace('.caibx', '') | ||||
| 
 | ||||
|   most_common = collections.Counter(t.sha for t in to).most_common(1)[0][0] | ||||
| 
 | ||||
|   frm_dict = casync.build_chunk_dict(frm) | ||||
| 
 | ||||
|   # Get content-length for each chunk | ||||
|   with multiprocessing.Pool() as pool: | ||||
|     szs = list(tqdm(pool.imap(get_chunk_download_size, to), total=len(to))) | ||||
|   chunk_sizes = {t.sha: sz for (t, sz) in zip(to, szs)} | ||||
| 
 | ||||
|   sources: Dict[str, List[int]] = { | ||||
|     'seed': [], | ||||
|     'remote_uncompressed': [], | ||||
|     'remote_compressed': [], | ||||
|   } | ||||
| 
 | ||||
|   for chunk in to: | ||||
|     # Assume most common chunk is the zero chunk | ||||
|     if chunk.sha == most_common: | ||||
|       continue | ||||
| 
 | ||||
|     if chunk.sha in frm_dict: | ||||
|       sources['seed'].append(chunk.length) | ||||
|     else: | ||||
|       sources['remote_uncompressed'].append(chunk.length) | ||||
|       sources['remote_compressed'].append(chunk_sizes[chunk.sha]) | ||||
| 
 | ||||
|   print() | ||||
|   print("Update statistics (excluding zeros)") | ||||
|   print() | ||||
|   print("Download only with no seed:") | ||||
|   print(f"  Remote (uncompressed)\t\t{sum(sources['seed'] + sources['remote_uncompressed']) / 1000 / 1000:.2f} MB\tn = {len(to)}") | ||||
|   print(f"  Remote (compressed download)\t{sum(chunk_sizes.values()) / 1000 / 1000:.2f} MB\tn = {len(to)}") | ||||
|   print() | ||||
|   print("Upgrade with seed partition:") | ||||
|   print(f"  Seed   (uncompressed)\t\t{sum(sources['seed']) / 1000 / 1000:.2f} MB\t\t\t\tn = {len(sources['seed'])}") | ||||
|   sz, n = sum(sources['remote_uncompressed']), len(sources['remote_uncompressed']) | ||||
|   print(f"  Remote (uncompressed)\t\t{sz / 1000 / 1000:.2f} MB\t(avg {sz / 1000 / 1000 / n:4f} MB)\tn = {n}") | ||||
|   sz, n = sum(sources['remote_compressed']), len(sources['remote_compressed']) | ||||
|   print(f"  Remote (compressed download)\t{sz / 1000 / 1000:.2f} MB\t(avg {sz / 1000 / 1000 / n:4f} MB)\tn = {n}") | ||||
					Loading…
					
					
				
		Reference in new issue