{ "title": "Verified Qwen3-Omni LoRA Validation-Aware Held-Out Pilot", "status": "verified_validation_aware_diagnostic_pilot", "status_date": "2026-06-06", "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "adapter": "Qwen3-Omni LoRA", "dataset": "Ropedia Xperience-10M selected 128-episode pilot", "split_policy": { "unit": "episode", "selected_episode_counts": { "train": 96, "val": 16, "test": 16 }, "exported_window_counts": { "train": 2848, "val": 512, "test": 448 }, "exported_episode_counts": { "train": 89, "val": 16, "test": 14 }, "skipped_selected_episodes": 9, "leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation." }, "training": { "num_processes": 8, "epochs": 1, "lora_rank": 16, "lora_alpha": 32, "lora_dropout": 0.05, "num_train_samples": 2848, "num_val_samples": 512, "history": [ { "epoch": 1, "train_loss": 0.41304643672440994, "val_loss": 0.0330660454928875, "global_step": 356 } ], "loss": "answer-token cross entropy over supervised JSON tokens", "note": "This validation-aware run uses the selected validation split during training and preserves the held-out test split for final evaluation." }, "evaluation": { "split": "test", "num_samples": 448, "held_out_episode_count": 14, "json_validity_rate": 0.875, "action_macro_f1": 0.0026621494447581404, "subtask_accuracy": 0.006696428571428571, "transition_accuracy": 0.8504464285714286, "next_action_accuracy": 0.024553571428571428, "contact_accuracy": 0.6450892857142857, "object_micro_f1": 0.22299431459254582, "quality_target": { "json_validity_rate": 0.98, "status": "not_met" }, "previous_diagnostic_json_validity_rate": 0.8526785714285714 }, "interpretation": "This is a real held-out multi-episode validation-aware diagnostic pilot proving the export, LoRA training with validation monitoring, evaluation, validation, and public-safe packaging loop. JSON validity improved over the earlier no-validation diagnostic run, but task-quality metrics remain weak, so it should be used as a baseline and error-analysis starting point rather than a strong Xperience-10M model.", "public_package": { "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval", "audit_status": "pass", "contains_raw_xperience10m_data": false, "contains_qwen_base_weights": false, "contains_lora_weights": false, "error_analysis": { "status": "pass", "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/error_analysis_summary.json", "markdown_report": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/ERROR_ANALYSIS.md", "groupings": [ "episode", "action_family", "train_seen_status", "required_modality_state", "object_category" ], "key_readouts": { "parsed_prediction_rate": 0.8772321428571429, "weakest_action_family": "locomotion", "weakest_action_family_samples": 23, "weakest_action_family_parsed_prediction_rate": 0.2608695652173913, "seen_action_exact_rate": 0.04580152671755725, "unseen_action_exact_rate": 0.015772870662460567, "required_modality_state": "rrd_missing_only_required_modalities_present" } } }, "required_next_steps": [ "Improve JSON-format reliability through prompt, decoding, constrained parsing, or target formatting changes.", "Use the published held-out error analysis to prioritize JSON constraints, action/subtask formatting, object vocabulary handling, and missing-modality robustness.", "Run a second validation-aware Qwen3-Omni pass only after the JSON/output contract is tightened.", "Keep the same verified package contract for Cosmos-style world-model and VLA/policy branches." ] }