{ "schemaKind": "protocol", "protocolVersion": "1.0", "meta": { "archetype": "ml_train", "eligibility": "eligible", "repo": { "owner": "trevin-creator", "name": "autoresearch-mlx", "defaultBranch": "main", "cloneUrl": "https://github.com/trevin-creator/autoresearch-mlx" }, "purposeStatement": "Improve the Apple Silicon MLX training recipe to minimize validation bits-per-byte under a fixed local training budget.", "createdAt": "2026-05-03T15:04:16Z", "updatedAt": "2026-05-03T15:04:16Z", "protocolBundleId": "trevin-creator-autoresearch-mlx-ba6ebf6-20260503" }, "environment": { "osHints": [ "darwin-arm64", "Apple Silicon Mac" ], "packageManagers": [ "uv" ], "setupCommands": [ "uv sync", "uv run prepare.py" ], "assetPrep": [ "Prepare local data and tokenizer cache at ~/.cache/autoresearch/ via uv run prepare.py." ], "constraints": { "noNewDependencies": true, "networkPolicy": "full" } }, "mutableSurface": { "allowedGlobs": [ "train.py" ], "forbiddenGlobs": [ "prepare.py", "pyproject.toml", "uv.lock" ], "allowedKinds": [ "code_edit" ] }, "immutableHarness": { "paths": [ "prepare.py" ], "rationale": "prepare.py defines the data preparation, tokenizer, dataloader, fixed constants, and evaluate_bpb metric; modifying it would invalidate comparisons." }, "execution": { "command": "uv run train.py", "cwd": ".", "stopCondition": { "type": "wall_clock", "trainingSecondsBudget": 300, "excludeCompilationFromBudget": true, "notes": "The script runs a fixed 5-minute training budget, with startup/compilation and final eval overhead outside the training timer." }, "hardTimeoutSeconds": 900, "determinism": { "seedPolicy": "optional", "notes": "Compare the fresh baseline and all trials on the same Apple Silicon hardware and the same prepared data snapshot." 
} }, "measurement": { "primaryMetric": { "name": "val_bpb", "direction": "minimize", "extract": { "kind": "regex", "pattern": "^val_bpb:\\s+([0-9]+(?:\\.[0-9]+)?)", "exampleStdout": "---\nval_bpb: 2.534000\ntraining_seconds: 312.4\ntotal_seconds: 405.7\npeak_vram_mb: 27528.9\nmfu_percent: 0.00\ntotal_tokens_M: 39.8\nnum_steps: 46\nnum_params_M: 50.3\ndepth: 8" } }, "secondaryMetrics": [ { "name": "peak_vram_mb", "direction": "minimize", "extract": { "kind": "regex", "pattern": "^peak_vram_mb:\\s+([0-9]+(?:\\.[0-9]+)?)", "exampleStdout": "peak_vram_mb: 27528.9" } }, { "name": "num_steps", "direction": "maximize", "extract": { "kind": "regex", "pattern": "^num_steps:\\s+([0-9]+)", "exampleStdout": "num_steps: 46" } } ], "baselinePolicy": { "establishOnHardware": true, "sameDataSnapshot": true, "baselineNotes": "Baseline was established on this hardware after running uv run prepare.py, using the local ~/.cache/autoresearch/ data/tokenizer snapshot. FINAL_EVAL_BATCH_SIZE was reduced from 256 to 64 in train.py to fit this machine's Metal buffer limit while preserving the same total eval token budget." 
} }, "provenance": { "resultsLog": { "format": "tsv", "path": "results.tsv", "columns": [ "commit", "val_bpb", "memory_gb", "status", "description" ] }, "gitWorkflow": { "branchPattern": "autoresearch/", "commitScope": "Only stage experiment changes and result log updates for this repo.", "stagingExample": "git add train.py results.tsv" }, "baselineArtifactPath": ".autoresearch/create/baseline_run.log" }, "safety": { "oomPolicy": "reduce_batch", "crashStatus": "discard" }, "agentRules": { "simplicityCriterion": true, "autonomy": { "noAskHumanToContinue": true }, "experimentTimeoutNotes": "Treat runs exceeding 15 minutes as failed or hung.", "logRedirectExample": "uv run train.py > run.log 2>&1" }, "archetypeExtensions": { "ml_train": { "dataSnapshot": "Local ~/.cache/autoresearch/ data and tokenizer prepared by uv run prepare.py.", "hardwareNotes": "Apple Silicon MLX results are hardware-sensitive; compare against a fresh baseline on the same machine or clearly compatible hardware class." } } }