You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
			
				
					57 lines
				
				3.5 KiB
			
		
		
			
		
	
	
					57 lines
				
				3.5 KiB
			| 
											4 days ago
#!/usr/bin/env bash
# adapted from https://github.com/mlcommons/training/blob/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/stable_diffusion/README.md

# setup dirs

# Abort on the first failed command, on use of an unset variable, and on a
# failure in any stage of a pipeline, so that a failed setup step is not
# followed by the (about 1 TB) dataset download.
set -euo pipefail

# Root directory for the datasets.
DATA=/raid/datasets/stable_diffusion

# Per-dataset download directories: laion-400m moments and coco2014 files.
LAION="${DATA}/laion-400m/webdataset-moments-filtered"
COCO="${DATA}/coco2014"
mkdir -p -- "$LAION" "$COCO"

# Root directory for the model checkpoints, one subdirectory per model.
CKPT=/raid/weights/stable_diffusion
mkdir -p -- "${CKPT}/clip" "${CKPT}/sd" "${CKPT}/inception"

# download data

# if rclone isn't installed system-wide / in your PATH, put the executable path in quotes below
#RCLONE=""
RCLONE="rclone"

# The destination directories are expected to be set earlier in this script;
# stop here rather than let rclone copy into an empty path.
: "${LAION:?LAION (laion-400m download directory) must be set}"
: "${COCO:?COCO (coco2014 download directory) must be set}"

# Fail early with a clear message if the rclone executable cannot be found.
if ! command -v "$RCLONE" >/dev/null; then
  echo "rclone executable not found: ${RCLONE}" >&2
  exit 1
fi

# rclone remote name, and the dataset prefix inside the bucket.
MLC_REMOTE="mlc-training"
MLC_DATASETS="${MLC_REMOTE}:mlcommons-training-wg-public/stable_diffusion/datasets"

# Register the S3-compatible (Cloudflare) remote. One registration serves every
# copy below, so it is done once instead of before each download.
# NOTE(review): the key pair is hard-coded and passed on the command line;
# presumably these are the public read-only credentials from the upstream
# README -- confirm before reusing this pattern with private credentials.
"$RCLONE" config create "$MLC_REMOTE" s3 provider=Cloudflare \
  access_key_id=76ea42eadb867e854061a1806220ee1e \
  secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 \
  endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com \
  || { echo "Failed to configure rclone remote ${MLC_REMOTE}" >&2; exit 1; }

#######################################
# Copy one object or prefix from the dataset bucket into a local directory,
# showing progress (-P).
# Globals:   RCLONE, MLC_DATASETS (read)
# Arguments: $1 - path below the dataset prefix
#            $2 - local destination directory
#            $3... - extra arguments passed through to "rclone copy"
# Exits the script with status 1 if the transfer fails.
#######################################
mlc_copy() {
  local src=$1 dest=$2
  shift 2
  "$RCLONE" copy "${MLC_DATASETS}/${src}" "$dest" "$@" -P \
    || { echo "Download failed: ${src}" >&2; exit 1; }
}

## VAE-encoded image latents, from 6.1M image subset of laion-400m
## about 1 TB for whole download
mlc_copy laion-400m/moments-webdataset-filtered/ "$LAION" --include="*.tar"
mlc_copy laion-400m/moments-webdataset-filtered/sha512sums.txt "$LAION"

# Verify every .tar listed in sha512sums.txt. The check runs in a subshell so
# the working directory of the rest of the script is left unchanged, and an
# if/else is used so the failure branch runs only when verification fails.
if (cd "$LAION" && grep -E '\.tar$' sha512sums.txt | sha512sum -c --quiet -); then
  echo "All .tar files verified"
else
  echo "Checksum failure when validating downloaded Laion moments" >&2
  exit 1
fi

## prompts and FID statistics from 30k image subset of coco2014
## 33 MB
mlc_copy coco2014/val2014_30k.tsv "$COCO"
mlc_copy coco2014/val2014_30k_stats.npz "$COCO"

# download checkpoints

# The checkpoint root is expected to be set earlier in this script.
: "${CKPT:?CKPT (checkpoint root directory) must be set}"

#######################################
# Download a file with wget and verify it against an expected checksum.
# Arguments: $1 - URL to download; its last path component is the local name
#            $2 - directory to download into
#            $3 - checksum program to run with -c (sha256sum, sha1sum, ...)
#            $4 - expected hex digest
# Outputs:   wget progress and the checksum program's result line
# Exits the script with status 1 if the download or the verification fails,
# so a corrupt or missing checkpoint can no longer pass unnoticed.
#######################################
fetch_verified() {
  local url=$1 dest_dir=$2 sum_tool=$3 expected=$4
  local target="${dest_dir}/${url##*/}"

  # -N: timestamping, skip the transfer when the local copy is up to date.
  wget -N -P "$dest_dir" "$url" \
    || { echo "Download failed: ${url}" >&2; exit 1; }
  # Checksum tools expect "<digest><two spaces><path>" on stdin for -c.
  printf '%s  %s\n' "$expected" "$target" | "$sum_tool" -c \
    || { echo "Checksum mismatch for ${target}" >&2; exit 1; }
}

## clip (needed for text and vision encoders for validation)
CLIP_WEIGHTS_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin"
CLIP_WEIGHTS_SHA256="9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
CLIP_CONFIG_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/raw/main/open_clip_config.json"
fetch_verified "$CLIP_WEIGHTS_URL" "${CKPT}/clip" sha256sum "$CLIP_WEIGHTS_SHA256"
# No checksum is recorded for the config file, so it is only downloaded.
wget -N -P "${CKPT}/clip" "$CLIP_CONFIG_URL" \
  || { echo "Download failed: ${CLIP_CONFIG_URL}" >&2; exit 1; }

## sd (needed for latent->image decoder for validation, also has clip text encoder for training)
SD_WEIGHTS_URL='https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt'
SD_WEIGHTS_SHA256="d635794c1fedfdfa261e065370bea59c651fc9bfa65dc6d67ad29e11869a1824"
fetch_verified "$SD_WEIGHTS_URL" "${CKPT}/sd" sha256sum "$SD_WEIGHTS_SHA256"

## inception (needed for validation)
FID_WEIGHTS_URL='https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
FID_WEIGHTS_SHA1="bd836944fd6db519dfd8d924aa457f5b3c8357ff"
fetch_verified "$FID_WEIGHTS_URL" "${CKPT}/inception" sha1sum "$FID_WEIGHTS_SHA1"