```python
# s02b: download every photo URL, score with CLIP,
# write parquet shards to ./shared. 6K batches.
from burla import remote_parallel_map
import open_clip


def score_batch(args):
    model, _, prep = open_clip.create_model_and_transforms(
        "ViT-B-32",
        pretrained="laion2b_s34b_b79k",
        cache_dir="./shared/clip_weights",
    )
    # download -> encode -> cosine vs PROMPTS -> parquet
    return {"shard": shard, "n_ok": n_ok}


remote_parallel_map(
    score_batch,
    batch_args,
    func_cpu=2,
    func_ram=8,
    max_parallelism=1000,  # 1k concurrent at peak
    grow=True,
)
```
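The pipeline body inside `score_batch` is elided above (`download -> encode -> cosine vs PROMPTS -> parquet`), and `shard` / `n_ok` come from that elided code. Below is a minimal sketch of what that step could look like, assuming a hypothetical `PROMPTS` list, plain `requests` downloads, and one image per forward pass; the helper name, prompt set, and output layout are illustrative, not the original implementation.

```python
import io

import open_clip
import pandas as pd
import requests
import torch
from PIL import Image

PROMPTS = ["a photo of a pet", "a photo of a room"]  # hypothetical prompt set


def clip_scores(model, prep, urls):
    tokenizer = open_clip.get_tokenizer("ViT-B-32")
    records = []
    with torch.no_grad():
        # Embed the text prompts once, L2-normalized so dot products
        # below are cosine similarities.
        text = model.encode_text(tokenizer(PROMPTS))
        text /= text.norm(dim=-1, keepdim=True)
        for url in urls:
            raw = requests.get(url, timeout=30).content
            img = Image.open(io.BytesIO(raw)).convert("RGB")
            vec = model.encode_image(prep(img).unsqueeze(0))
            vec /= vec.norm(dim=-1, keepdim=True)
            sims = (vec @ text.T).squeeze(0)  # cosine vs each prompt
            records.append({"url": url, "best_score": sims.max().item()})
    return pd.DataFrame(records)


# Inside score_batch, something like (paths/fields are assumptions):
#   df = clip_scores(model, prep, args.urls)
#   shard = f"./shared/scores/{args.shard_id}.parquet"
#   df.to_parquet(shard)
#   n_ok = len(df)
```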
```python
# s04 tier 2: embed top 200K reviews with SBERT,
# one parquet shard per worker on ./shared.
from burla import remote_parallel_map
from sentence_transformers import SentenceTransformer


def embed_batch(args):
    model = SentenceTransformer(
        "all-MiniLM-L6-v2",
        cache_folder="./shared/sbert",
    )
    rows = read_slice(
        args.input_path,
        args.row_start,
        args.row_end,
    )
    vecs = model.encode(
        rows["comments"].tolist(),
        batch_size=128,
    )
    write_shard(args.output_root, rows, vecs)
    return {"n_ok": len(rows)}


remote_parallel_map(
    embed_batch,
    embed_args,
    func_cpu=2,
    func_ram=8,
    max_parallelism=200,
    grow=True,
)
```
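`read_slice` and `write_shard` aren't defined in this snippet. A minimal sketch of what they might do, assuming pandas over a parquet input on the shared drive; a real `read_slice` would likely read only the relevant row groups with pyarrow instead of having every worker load the whole file.

```python
import os
import uuid

import pandas as pd


def read_slice(input_path, row_start, row_end):
    # Simplest possible version: load the file and slice the row range.
    # Fine for one shared input; wasteful if every worker re-reads 200K rows.
    return pd.read_parquet(input_path).iloc[row_start:row_end]


def write_shard(output_root, rows, vecs):
    # One shard per worker, named uniquely so concurrent writers never collide.
    out = rows.copy()
    out["embedding"] = [v.tolist() for v in vecs]
    os.makedirs(output_root, exist_ok=True)
    out.to_parquet(os.path.join(output_root, f"{uuid.uuid4().hex}.parquet"))
```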
```python
# s05c: Haiku Vision double-checks the CLIP
# shortlists. Rate-limited at 64 workers.
from burla import remote_parallel_map
import anthropic
import json


def validate_pet(args):
    client = anthropic.Anthropic()
    rows = []
    for url, listing_id in args.batch:
        msg = client.messages.create(
            model="claude-haiku-4-5",
            max_tokens=200,
            messages=pet_prompt(fetch(url)),
        )
        verdict = json.loads(msg.content[0].text)
        rows.append({"listing_id": listing_id, **verdict})
    write_shard(args.output_path, rows)
    return {"n_ok": len(rows)}


remote_parallel_map(
    validate_pet,
    pet_batches,
    func_cpu=2,
    func_ram=8,
    max_parallelism=64,
    grow=True,
)
```
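`fetch` and `pet_prompt` are likewise assumed here (`write_shard` would be the same helper as in s04). Below is a sketch using the Anthropic Messages API's base64 image blocks; the JPEG media type, the verdict schema, and the exact prompt wording are assumptions, not the original code.

```python
import base64

import requests


def fetch(url):
    # Download raw image bytes; a real run would retry transient failures.
    return requests.get(url, timeout=30).content


def pet_prompt(image_bytes):
    # An Anthropic vision message: one base64 image block plus a
    # JSON-only instruction, so msg.content[0].text parses cleanly.
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",  # assumes JPEG input
                        "data": base64.b64encode(image_bytes).decode(),
                    },
                },
                {
                    "type": "text",
                    "text": (
                        "Does this photo show a pet? Reply with JSON only: "
                        '{"is_pet": true|false, "animal": "<type or none>"}'
                    ),
                },
            ],
        }
    ]
```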