import sys
import json
import numpy as np
from PIL import Image
from pathlib import Path
import boto3, botocore
from tinygrad import Tensor, dtypes
from tinygrad.helpers import fetch, tqdm, getenv
import pandas as pd
import concurrent.futures
BASEDIR = Path(__file__).parent / "open-images-v6-mlperf"
BUCKET_NAME = "open-images-dataset"
TRAIN_BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv"
VALIDATION_BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv"
MAP_CLASSES_URL = "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv"
MLPERF_CLASSES = ['Airplane', 'Antelope', 'Apple', 'Backpack', 'Balloon', 'Banana',
  'Barrel', 'Baseball bat', 'Baseball glove', 'Bee', 'Beer', 'Bench', 'Bicycle',
  'Bicycle helmet', 'Bicycle wheel', 'Billboard', 'Book', 'Bookcase', 'Boot',
  'Bottle', 'Bowl', 'Bowling equipment', 'Box', 'Boy', 'Brassiere', 'Bread',
  'Broccoli', 'Bronze sculpture', 'Bull', 'Bus', 'Bust', 'Butterfly', 'Cabinetry',
  'Cake', 'Camel', 'Camera', 'Candle', 'Candy', 'Cannon', 'Canoe', 'Carrot', 'Cart',
  'Castle', 'Cat', 'Cattle', 'Cello', 'Chair', 'Cheese', 'Chest of drawers', 'Chicken',
  'Christmas tree', 'Coat', 'Cocktail', 'Coffee', 'Coffee cup', 'Coffee table', 'Coin',
  'Common sunflower', 'Computer keyboard', 'Computer monitor', 'Convenience store',
  'Cookie', 'Countertop', 'Cowboy hat', 'Crab', 'Crocodile', 'Cucumber', 'Cupboard',
  'Curtain', 'Deer', 'Desk', 'Dinosaur', 'Dog', 'Doll', 'Dolphin', 'Door', 'Dragonfly',
  'Drawer', 'Dress', 'Drum', 'Duck', 'Eagle', 'Earrings', 'Egg (Food)', 'Elephant',
  'Falcon', 'Fedora', 'Flag', 'Flowerpot', 'Football', 'Football helmet', 'Fork',
  'Fountain', 'French fries', 'French horn', 'Frog', 'Giraffe', 'Girl', 'Glasses',
  'Goat', 'Goggles', 'Goldfish', 'Gondola', 'Goose', 'Grape', 'Grapefruit', 'Guitar',
  'Hamburger', 'Handbag', 'Harbor seal', 'Headphones', 'Helicopter', 'High heels',
  'Hiking equipment', 'Horse', 'House', 'Houseplant', 'Human arm', 'Human beard',
  'Human body', 'Human ear', 'Human eye', 'Human face', 'Human foot', 'Human hair',
  'Human hand', 'Human head', 'Human leg', 'Human mouth', 'Human nose', 'Ice cream',
  'Jacket', 'Jeans', 'Jellyfish', 'Juice', 'Kitchen & dining room table', 'Kite',
  'Lamp', 'Lantern', 'Laptop', 'Lavender (Plant)', 'Lemon', 'Light bulb', 'Lighthouse',
  'Lily', 'Lion', 'Lipstick', 'Lizard', 'Man', 'Maple', 'Microphone', 'Mirror',
  'Mixing bowl', 'Mobile phone', 'Monkey', 'Motorcycle', 'Muffin', 'Mug', 'Mule',
  'Mushroom', 'Musical keyboard', 'Necklace', 'Nightstand', 'Office building',
  'Orange', 'Owl', 'Oyster', 'Paddle', 'Palm tree', 'Parachute', 'Parrot', 'Pen',
  'Penguin', 'Personal flotation device', 'Piano', 'Picture frame', 'Pig', 'Pillow',
  'Pizza', 'Plate', 'Platter', 'Porch', 'Poster', 'Pumpkin', 'Rabbit', 'Rifle',
  'Roller skates', 'Rose', 'Salad', 'Sandal', 'Saucer', 'Saxophone', 'Scarf', 'Sea lion',
  'Sea turtle', 'Sheep', 'Shelf', 'Shirt', 'Shorts', 'Shrimp', 'Sink', 'Skateboard',
  'Ski', 'Skull', 'Skyscraper', 'Snake', 'Sock', 'Sofa bed', 'Sparrow', 'Spider', 'Spoon',
  'Sports uniform', 'Squirrel', 'Stairs', 'Stool', 'Strawberry', 'Street light',
  'Studio couch', 'Suit', 'Sun hat', 'Sunglasses', 'Surfboard', 'Sushi', 'Swan',
  'Swimming pool', 'Swimwear', 'Tank', 'Tap', 'Taxi', 'Tea', 'Teddy bear', 'Television',
  'Tent', 'Tie', 'Tiger', 'Tin can', 'Tire', 'Toilet', 'Tomato', 'Tortoise', 'Tower',
  'Traffic light', 'Train', 'Tripod', 'Truck', 'Trumpet', 'Umbrella', 'Van', 'Vase',
  'Vehicle registration plate', 'Violin', 'Wall clock', 'Waste container', 'Watch',
  'Whale', 'Wheel', 'Wheelchair', 'Whiteboard', 'Window', 'Wine', 'Wine glass', 'Woman',
  'Zebra', 'Zucchini',
]
def openimages(base_dir: Path, subset: str, ann_file: Path):
  valid_subsets = ['train', 'validation']
  if subset not in valid_subsets:
    raise ValueError(f"{subset=} must be one of {valid_subsets}")
  fetch_openimages(ann_file, base_dir, subset)

# this slows down the conversion a lot!
# maybe use https://raw.githubusercontent.com/scardine/image_size/master/get_image_size.py
def extract_dims(path): return Image.open(path).size[::-1]
def export_to_coco(class_map, annotations, image_list, dataset_path, output_path, subset, classes=MLPERF_CLASSES):
  output_path.parent.mkdir(parents=True, exist_ok=True)
  cats = [{"id": i, "name": c, "supercategory": None} for i, c in enumerate(classes)]
  categories_map = pd.DataFrame([(i, c) for i, c in enumerate(classes)], columns=["category_id", "category_name"])
  class_map = class_map.merge(categories_map, left_on="DisplayName", right_on="category_name", how="inner")
  annotations = annotations[annotations["ImageID"].isin(image_list)]
  annotations = annotations.merge(class_map, on="LabelName", how="inner")
  annotations["image_id"] = pd.factorize(annotations["ImageID"].tolist())[0]
  annotations[["height", "width"]] = annotations.apply(lambda x: extract_dims(dataset_path / f"{x['ImageID']}.jpg"), axis=1, result_type="expand")

  # Images
  imgs = [{"id": int(id + 1), "file_name": f"{image_id}.jpg", "height": row["height"], "width": row["width"], "subset": subset, "license": None, "coco_url": None}
    for (id, image_id), row in (annotations.groupby(["image_id", "ImageID"]).first().iterrows())
  ]

  # Annotations
  annots = []
  for i, row in annotations.iterrows():
    xmin, ymin, xmax, ymax, img_w, img_h = [row[k] for k in ["XMin", "YMin", "XMax", "YMax", "width", "height"]]
    x, y, w, h = xmin * img_w, ymin * img_h, (xmax - xmin) * img_w, (ymax - ymin) * img_h
    coco_annot = {"id": int(i) + 1, "image_id": int(row["image_id"] + 1), "category_id": int(row["category_id"]), "bbox": [x, y, w, h], "area": w * h}
    coco_annot.update({k: row[k] for k in ["IsOccluded", "IsInside", "IsDepiction", "IsTruncated", "IsGroupOf"]})
    coco_annot["iscrowd"] = int(row["IsGroupOf"])
    annots.append(coco_annot)

  info = {"dataset": "openimages_mlperf", "version": "v6"}
  coco_annotations = {"info": info, "licenses": [], "categories": cats, "images": imgs, "annotations": annots}
  with open(output_path, "w") as fp:
    json.dump(coco_annotations, fp)
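# Rough sketch of one exported annotation (illustrative values only, not from the dataset):
# Open Images boxes are normalized [XMin, YMin, XMax, YMax], so for a 640x480 image a box of
# (0.1, 0.2, 0.5, 0.6) would come out roughly as the COCO-style entry below; the exact ids
# depend on the factorize/enumerate order above.
#   {"id": 1, "image_id": 1, "category_id": 42, "bbox": [64.0, 96.0, 256.0, 192.0], "area": 49152.0,
#    "IsOccluded": 0, "IsInside": 0, "IsDepiction": 0, "IsTruncated": 0, "IsGroupOf": 0, "iscrowd": 0}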
def get_image_list(class_map, annotations, classes=MLPERF_CLASSES):
  labels = class_map[class_map["DisplayName"].isin(classes)]["LabelName"]
  image_ids = annotations[annotations["LabelName"].isin(labels)]["ImageID"].unique()
  return image_ids
def download_image(bucket, subset, image_id, data_dir):
  try:
    bucket.download_file(f"{subset}/{image_id}.jpg", f"{data_dir}/{image_id}.jpg")
  except botocore.exceptions.ClientError as exception:
    sys.exit(f"ERROR when downloading image `{subset}/{image_id}`: {str(exception)}")
def fetch_openimages(output_fn: str, base_dir: Path, subset: str):
  bucket = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)).Bucket(BUCKET_NAME)

  annotations_dir, data_dir = base_dir / "annotations", base_dir / f"{subset}/data"
  annotations_dir.mkdir(parents=True, exist_ok=True)
  data_dir.mkdir(parents=True, exist_ok=True)

  if subset == "train":
    annotations_fn = annotations_dir / TRAIN_BBOX_ANNOTATIONS_URL.split('/')[-1]
    fetch(TRAIN_BBOX_ANNOTATIONS_URL, annotations_fn)
  else:  # subset == "validation"
    annotations_fn = annotations_dir / VALIDATION_BBOX_ANNOTATIONS_URL.split('/')[-1]
    fetch(VALIDATION_BBOX_ANNOTATIONS_URL, annotations_fn)
  annotations = pd.read_csv(annotations_fn)

  classmap_fn = annotations_dir / MAP_CLASSES_URL.split('/')[-1]
  fetch(MAP_CLASSES_URL, classmap_fn)
  class_map = pd.read_csv(classmap_fn, names=["LabelName", "DisplayName"])

  image_list = get_image_list(class_map, annotations)

  with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(download_image, bucket, subset, image_id, data_dir) for image_id in image_list]
    for future in (t := tqdm(concurrent.futures.as_completed(futures), total=len(image_list))):
      t.set_description("Downloading images")
      future.result()

  print("Converting annotations to COCO format...")
  export_to_coco(class_map, annotations, image_list, data_dir, output_fn, subset)
def image_load(base_dir, subset, fn):
  img_folder = base_dir / f"{subset}/data"
  return Image.open(img_folder / fn).convert('RGB')
def prepare_target(annotations, img_id, img_size):
  boxes = [annot["bbox"] for annot in annotations]
  boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
  # convert COCO [x, y, w, h] boxes to [x1, y1, x2, y2], clip to the image, and drop empty boxes
  boxes[:, 2:] += boxes[:, :2]
  boxes[:, 0::2] = boxes[:, 0::2].clip(0, img_size[1])
  boxes[:, 1::2] = boxes[:, 1::2].clip(0, img_size[0])
  keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
  boxes = boxes[keep]
  classes = [annot["category_id"] for annot in annotations]
  classes = np.array(classes, dtype=np.int64)
  classes = classes[keep]
  return {"boxes": boxes, "labels": classes, "image_id": img_id, "image_size": img_size}
def iterate(coco, base_dir, bs=8):
  image_ids = sorted(coco.imgs.keys())
  for i in range(0, len(image_ids), bs):
    X, targets = [], []
    for img_id in image_ids[i:i + bs]:
      img_dict = coco.loadImgs(img_id)[0]
      x, original_size = resize(image_load(base_dir, img_dict['subset'], img_dict["file_name"]))
      X.append(x)
      annotations = coco.loadAnns(coco.getAnnIds(img_id))
      targets.append(prepare_target(annotations, img_id, original_size))
    yield np.array(X), targets
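# Minimal usage sketch (assumes pycocotools is installed; download_dataset below prepares the files):
#   from pycocotools.coco import COCO
#   ann_file = download_dataset(BASEDIR, "validation")
#   coco = COCO(ann_file)
#   for X, targets in iterate(coco, BASEDIR, bs=8):
#     print(X.shape, len(targets))  # e.g. (8, 800, 800, 3) plus a list of per-image target dicts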
def download_dataset(base_dir: Path, subset: str) -> Path:
  if (ann_file := base_dir / f"{subset}/labels/openimages-mlperf.json").is_file(): print(f"{subset} dataset is already available")
  else:
    print(f"Downloading {subset} dataset...")
    openimages(base_dir, subset, ann_file)
    print("Done")
  return ann_file
def random_horizontal_flip(img, tgt, prob=0.5):
  import torch
  import torchvision.transforms.functional as F
  if torch.rand(1) < prob:
    w = img.size[0]
    img = F.hflip(img)
    tgt["boxes"][:, [0, 2]] = w - tgt["boxes"][:, [2, 0]]
  return img, tgt
def resize(img: Image, tgt: dict[str, np.ndarray | tuple] | None = None, size: tuple[int, int] = (800, 800)) -> tuple[np.ndarray, dict, tuple] | tuple[np.ndarray, tuple]:
  import torchvision.transforms.functional as F
  img_size = img.size[::-1]
  img = F.resize(img, size=size)
  img = np.array(img)

  if tgt is not None:
    # scale the boxes by the per-axis resize ratios
    ratios = [s / s_orig for s, s_orig in zip(size, img_size)]
    ratio_h, ratio_w = ratios
    x_min, y_min, x_max, y_max = [tgt["boxes"][:, i] for i in range(tgt["boxes"].shape[-1])]
    x_min = x_min * ratio_w
    x_max = x_max * ratio_w
    y_min = y_min * ratio_h
    y_max = y_max * ratio_h
    tgt["boxes"] = np.stack([x_min, y_min, x_max, y_max], axis=1)
    return img, tgt, img_size
  return img, img_size
if __name__ == "__main__":
  download_dataset(base_dir := getenv("BASE_DIR", BASEDIR), "train")
  download_dataset(base_dir, "validation")