| { |
| "title": "Xperience-10M Foundation Model Plan", |
| "status": "planning_artifact", |
| "current_boundary": "A first held-out multi-episode Qwen3-Omni diagnostic pilot is verified in this repo, but it is not a strong model result. The current foundation-model work should treat it as the baseline train/eval/package loop before validation-aware Qwen reruns, Cosmos-style world modeling, or policy/VLA branches.", |
| "backbone_registry": { |
| "config_dir": "configs/omni_backbones", |
| "validator": "scripts/omni/backbone_registry.py --validate --json", |
| "extension_contract": "OMNI_MODEL_EXTENSION_CONTRACT.md", |
| "implemented_backbone": "qwen3_omni_lora", |
| "planned_backbones": [ |
| "cosmos_world_model", |
| "policy_vla_branch" |
| ] |
| }, |
| "decision": { |
| "immediate_trainable_backbone": "Qwen3-Omni", |
| "first_world_model_branch": "Cosmos 3", |
| "first_policy_branch_candidates": [ |
| "OpenVLA / OpenVLA-OFT", |
| "openpi pi0/pi0.5", |
| "NVIDIA GR00T" |
| ], |
| "external_reasoning_reference": "Gemini Robotics", |
| "long_term_native_pretraining_goal": "Xperience Embodied Foundation Model" |
| }, |
| "future_pretraining_goal": { |
| "name": "Xperience Embodied Foundation Model", |
| "status": "future_planning_goal", |
| "role": "Domain-specific embodied foundation model pretrained on the full Xperience-10M corpus if full-corpus data, storage, and compute become available.", |
| "not_current_result": true, |
| "document": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md", |
| "entry_conditions": [ |
| "Selected multi-episode Qwen3-Omni pilot trains and evaluates cleanly.", |
| "Scaling from 128 episodes to thousands of episodes shows measurable value.", |
| "Full-corpus storage, derived-shard storage, and fast active-cache capacity are available.", |
| "Distributed training, checkpoint/restart, and provenance tracking are reliable.", |
| "Evaluation covers held-out episodes, sessions, activities, objects, and missing-modality robustness." |
| ], |
| "target_modules": [ |
| "multi-view video encoder", |
| "audio encoder", |
| "depth and geometry encoder", |
| "pose/SLAM encoder", |
| "hand/body mocap encoder", |
| "IMU encoder", |
| "language encoder/decoder", |
| "temporal fusion transformer", |
| "task heads and decoders" |
| ], |
| "pretraining_objectives": [ |
| "masked multimodal modeling", |
| "cross-modal contrastive alignment", |
| "future-state prediction", |
| "ego-motion and hand-motion forecasting", |
| "action and procedure prediction", |
| "language grounding and captioning", |
| "contact and affordance prediction", |
| "optional policy-style targets after action conversion" |
| ], |
| "hardware_ranges": [ |
| { |
| "goal": "0.3B-1B pilot", |
| "compute": "8-32 modern 80GB-class data-center GPUs", |
| "use": "prove objectives and data loaders" |
| }, |
| { |
| "goal": "1B-3B domain model", |
| "compute": "32-128 GPUs", |
| "use": "research-scale Xperience representation learning" |
| }, |
| { |
| "goal": "3B-7B full-corpus domain model", |
| "compute": "128-512 GPUs", |
| "use": "first realistic full Xperience-native foundation model" |
| }, |
| { |
| "goal": "30B-class omni model from scratch", |
| "compute": "512-2000+ GPUs", |
| "use": "lab-scale project after scaling curves justify cost" |
| } |
| ] |
| }, |
| "model_families": [ |
| { |
| "priority": 1, |
| "family": "Qwen3-Omni", |
| "category": "omni_instruction_model", |
| "openness": "open_weights_available_from_official_hf_repo", |
| "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.", |
| "xperience10m_fit": [ |
| "RGB/fisheye video, embedded audio, and language prompts can enter directly.", |
| "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.", |
| "Matches current task outputs: labels, structured JSON, captions, and short decisions." |
| ], |
| "current_decision": "keep_as_first_pilot", |
| "entry_condition": "Selected episodes are prepared with a held-out episode split.", |
| "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" |
| }, |
| { |
| "priority": 2, |
| "family": "Cosmos 3", |
| "category": "world_foundation_model", |
| "openness": "track_official_nvidia_release_and_available_weights", |
| "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.", |
| "xperience10m_fit": [ |
| "Uses video streams as visual state.", |
| "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.", |
| "Better aligned with prediction/generation objectives than simple label classification." |
| ], |
| "current_decision": "add_as_first_world_model_branch_after_data_gate", |
| "entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.", |
| "public_source": "https://www.nvidia.com/en-us/ai/cosmos/" |
| }, |
| { |
| "priority": 3, |
| "family": "NVIDIA GR00T", |
| "category": "humanoid_policy_foundation_model", |
| "openness": "track_official_nvidia_release_and_tooling", |
| "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.", |
| "xperience10m_fit": [ |
| "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.", |
| "Egocentric video plus human motion can support affordance and interaction tasks." |
| ], |
| "current_decision": "track_as_humanoid_policy_branch", |
| "entry_condition": "Retargeting artifact and action-space definition exist.", |
| "public_source": "https://developer.nvidia.com/isaac/gr00t" |
| }, |
| { |
| "priority": 4, |
| "family": "OpenVLA / OpenVLA-OFT", |
| "category": "vision_language_action_policy", |
| "openness": "open_project_and_weights", |
| "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.", |
| "xperience10m_fit": [ |
| "Good candidate when each window is expressed as visual observation, instruction/context, and action token.", |
| "Requires an explicit action target; current human egocentric labels are not robot controls by default." |
| ], |
| "current_decision": "candidate_after_action_space_design", |
| "entry_condition": "Window-to-action-token conversion is implemented and checked.", |
| "public_source": "https://openvla.github.io/" |
| }, |
| { |
| "priority": 5, |
| "family": "openpi pi0/pi0.5", |
| "category": "robot_policy_model", |
| "openness": "open_source_policy_training_stack", |
| "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.", |
| "xperience10m_fit": [ |
| "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.", |
| "Better suited to the policy branch than to the current structured-task JSON outputs." |
| ], |
| "current_decision": "candidate_policy_branch", |
| "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.", |
| "public_source": "https://github.com/Physical-Intelligence/openpi" |
| }, |
| { |
| "priority": 6, |
| "family": "Gemini Robotics", |
| "category": "closed_embodied_reasoning_reference", |
| "openness": "closed_or_limited_access", |
| "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.", |
| "xperience10m_fit": [ |
| "Can help reason over egocentric scenes and task descriptions.", |
| "Not a local fine-tune target for this repo." |
| ], |
| "current_decision": "external_reference_only", |
| "entry_condition": "API access exists and outputs are logged separately from trainable-model metrics.", |
| "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" |
| }, |
| { |
| "priority": 7, |
| "family": "Octo / SmolVLA-style lightweight policies", |
| "category": "lightweight_robot_policy_baselines", |
| "openness": "open_projects", |
| "best_role": "Cheaper policy baselines for observation-to-action experiments.", |
| "xperience10m_fit": [ |
| "Useful after action target design.", |
| "Less directly omni-modal than Qwen3-Omni or Cosmos 3." |
| ], |
| "current_decision": "optional_baseline_after_data_staging", |
| "entry_condition": "Action labels and baseline protocol exist.", |
| "public_source": "https://github.com/huggingface/lerobot" |
| }, |
| { |
| "priority": 8, |
| "family": "Xperience Embodied Foundation Model", |
| "category": "xperience_native_pretraining_goal", |
| "openness": "future_project_specific_model_pending_full_corpus_access_and_compute", |
| "best_role": "Domain model over synchronized embodied experience.", |
| "xperience10m_fit": [ |
| "Uses the full aligned modality stack rather than treating sensors as auxiliary metadata.", |
| "Targets temporal embodied representation learning across perception, motion, geometry, audio, and language.", |
| "Can become the shared pretraining backbone for Qwen-style instruction tasks, Cosmos-style world modeling, and policy/action branches." |
| ], |
| "current_decision": "future_goal_after_scaling_evidence", |
| "entry_condition": "Full-corpus data path, petabyte-scale storage, multi-node compute, and positive smaller-run scaling evidence are all in place.", |
| "public_source": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md" |
| } |
| ], |
| "execution_order": [ |
| { |
| "step": 1, |
| "name": "Data gate", |
| "action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split." |
| }, |
| { |
| "step": 2, |
| "name": "First held-out baseline", |
| "action": "Rerun Qwen3-Omni LoRA in a validation-aware configuration to improve on the verified diagnostic baseline." |
| }, |
| { |
| "step": 3, |
| "name": "Model-selection dry run", |
| "action": "Run 3-8 episode dry runs for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate." |
| }, |
| { |
| "step": 4, |
| "name": "World-model branch", |
| "action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits within available storage and compute." |
| }, |
| { |
| "step": 5, |
| "name": "Policy branch", |
| "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable." |
| }, |
| { |
| "step": 6, |
| "name": "Publishing threshold", |
| "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples." |
| }, |
| { |
| "step": 7, |
| "name": "Xperience-native pretraining", |
| "action": "Start a from-scratch Xperience Embodied Foundation Model only after smaller scaling stages, full-corpus storage, multi-node compute, and held-out evaluation protocols are in place." |
| } |
| ], |
| "evaluation_additions": [ |
| { |
| "target": "structured_task_prediction", |
| "metrics": [ |
| "JSON validity", |
| "macro-F1", |
| "accuracy", |
| "micro-F1" |
| ], |
| "model_families": [ |
| "Qwen3-Omni", |
| "Gemini Robotics reference" |
| ] |
| }, |
| { |
| "target": "future_state_prediction", |
| "metrics": [ |
| "retrieval rank", |
| "temporal consistency", |
| "feature reconstruction", |
| "qualitative visual inspection" |
| ], |
| "model_families": [ |
| "Cosmos 3" |
| ] |
| }, |
| { |
| "target": "action_conditioned_dynamics", |
| "metrics": [ |
| "transition accuracy", |
| "contact accuracy", |
| "next-action accuracy" |
| ], |
| "model_families": [ |
| "Cosmos 3", |
| "OpenVLA", |
| "openpi", |
| "GR00T" |
| ] |
| }, |
| { |
| "target": "cross_episode_generalization", |
| "metrics": [ |
| "held-out episode metrics", |
| "held-out session metrics", |
| "leakage checks" |
| ], |
| "model_families": [ |
| "all trainable branches" |
| ] |
| } |
| ], |
| "source_links": [ |
| { |
| "label": "Qwen3-Omni official HF model", |
| "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct" |
| }, |
| { |
| "label": "NVIDIA Cosmos", |
| "url": "https://www.nvidia.com/en-us/ai/cosmos/" |
| }, |
| { |
| "label": "NVIDIA Isaac GR00T", |
| "url": "https://developer.nvidia.com/isaac/gr00t" |
| }, |
| { |
| "label": "OpenVLA", |
| "url": "https://openvla.github.io/" |
| }, |
| { |
| "label": "openpi", |
| "url": "https://github.com/Physical-Intelligence/openpi" |
| }, |
| { |
| "label": "Gemini Robotics", |
| "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/" |
| }, |
| { |
| "label": "Octo", |
| "url": "https://octo-models.github.io/" |
| }, |
| { |
| "label": "LeRobot / SmolVLA", |
| "url": "https://github.com/huggingface/lerobot" |
| }, |
| { |
| "label": "Xperience Embodied Foundation Model pretraining plan", |
| "url": "XPERIENCE_EMBODIED_FOUNDATION_MODEL_PRETRAINING.md" |
| } |
| ] |
| } |
|
|