{
  "title": "Ropedia Xperience-10M Research Takeaways",
  "status": "pass",
  "generated_at_utc": "2026-06-06T13:49:32+00:00",
  "source_files": [
    "docs/data/summary_metrics.json",
    "results/episode_task_suite/summary_report.json",
    "results/episode_task_suite/neural_mlp/*/metrics.json",
    "docs/data/audio_ablation_summary.json",
    "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md"
  ],
  "scope": {
    "validated_episode_count": 1,
    "num_frames": 5821,
    "num_windows": 1161,
    "feature_dim": 8546,
    "audio_featurized": true,
    "raw_data_redistributed": false
  },
  "takeaways": [
    {
      "id": "episode_to_benchmark",
      "title": "One episode can become a real benchmark contract",
      "readout": "The public sample is converted into 5,821 frames, 1,161 aligned 20-frame windows, and an 8,546-dimensional feature contract.",
      "evidence": [
        {
          "label": "frames",
          "value": 5821
        },
        {
          "label": "windows",
          "value": 1161
        },
        {
          "label": "feature_dim",
          "value": 8546
        }
      ],
      "source": "docs/data/summary_metrics.json",
      "current_scope": "This benchmark defines the task contract; cross-episode generalization is evaluated in the multi-episode stage."
    },
    {
      "id": "chronological_split_exposes_class_shift",
      "title": "Chronological splits expose action-class shift",
      "readout": "Earlier all-feature action classifiers reach high macro-F1 on their local split, but the 12-task chronological action/subtask heads are much harder because later held-out windows include unseen labels.",
      "evidence": [
        {
          "label": "all_feature_action_macro_f1",
          "value": 0.9828810433408773
        },
        {
          "label": "suite_action_macro_f1",
          "value": 0.05
        },
        {
          "label": "suite_subtask_macro_f1",
          "value": 0.05056355513846935
        },
        {
          "label": "unseen_action_test_classes",
          "value": 4
        }
      ],
      "source": "results/episode_task_suite/summary_report.json",
      "current_scope": "This split is useful for studying label shift; broad action-recognition conclusions need held-out episodes."
    },
    {
      "id": "neural_heads_help_dynamics",
      "title": "Small neural heads help dynamic and temporal probes",
      "readout": "The MLP heads substantially improve hand trajectory forecasting, temporal-order verification, and motion/visual synchronization.",
      "evidence": [
        {
          "label": "hand_mpjpe_minimal",
          "value": 0.8646570444107056
        },
        {
          "label": "hand_mpjpe_neural",
          "value": 0.10785018652677536
        },
        {
          "label": "hand_mpjpe_relative_improvement",
          "value": 0.8752682497367739
        },
        {
          "label": "temporal_order_f1_minimal",
          "value": 0.5399515738498789
        },
        {
          "label": "temporal_order_f1_neural",
          "value": 0.8520179372197308
        },
        {
          "label": "misalignment_f1_minimal",
          "value": 0.5051698670605613
        },
        {
          "label": "misalignment_f1_neural",
          "value": 0.7152682255845944
        }
      ],
      "source": "results/episode_task_suite/neural_mlp/*/metrics.json",
      "current_scope": "These gains are measured within one episode and are candidates for held-out-episode testing."
    },
    {
      "id": "retrieval_and_reconstruction_remain_open",
      "title": "Retrieval and reconstruction remain the harder multimodal problems",
      "readout": "Ridge/cosine retrieval remains stronger than the neural projection on this sample, and cross-modal reconstruction still has negative R2.",
      "evidence": [
        {
          "label": "retrieval_mrr_minimal",
          "value": 0.26925966892956127
        },
        {
          "label": "retrieval_mrr_neural",
          "value": 0.1299971898648288
        },
        {
          "label": "retrieval_top5_minimal",
          "value": 0.367816091954023
        },
        {
          "label": "reconstruction_r2_minimal",
          "value": -0.015271898913936655
        },
        {
          "label": "reconstruction_r2_neural",
          "value": -0.010171410134180991
        }
      ],
      "source": "results/episode_task_suite/cross_modal_retrieval/metrics.json",
      "current_scope": "The current reconstruction task predicts feature vectors; depth, mesh, NeRF, and Gaussian-splatting outputs are future task variants."
    },
    {
      "id": "audio_contribution_is_task_specific",
      "title": "Audio helps some tasks and hurts others on the public sample",
      "readout": "Audio improves the primary metric on 6 of 12 tasks, while raw log-mel replacement improves over the current handcrafted block on 6 of 12 tasks. The largest current-audio gain appears in feature reconstruction, not in action classification.",
      "evidence": [
        {
          "label": "tasks_where_current_audio_improves",
          "value": 6
        },
        {
          "label": "mean_current_audio_delta",
          "value": 0.041849794979543296
        },
        {
          "label": "tasks_where_raw_replacement_improves",
          "value": 6
        },
        {
          "label": "mean_raw_replacement_delta_vs_current",
          "value": 0.09362598132150173
        },
        {
          "label": "reconstruction_current_audio_delta",
          "value": 0.6524486541748047
        },
        {
          "label": "object_relevance_current_audio_delta",
          "value": 0.010206249894598368
        }
      ],
      "source": "results/audio_ablation/audio_ablation_summary.json",
      "current_scope": "This is a single-episode ablation over fixed ridge heads. It validates that audio is wired into the task suite and shows where it changes metrics; it does not prove cross-episode audio generalization."
    },
    {
      "id": "scale_requires_episodes",
      "title": "The next scientific unit is held-out episodes, not more adjacent windows",
      "readout": "The selected Qwen3-Omni path now has a verified validation-aware held-out diagnostic pilot. It proves the cross-episode train/validation/eval loop, but the weak metrics show that structured-output reliability and task-quality error analysis are the next modeling problems.",
      "evidence": [
        {
          "label": "selected_episodes",
          "value": 128
        },
        {
          "label": "held_out_test_windows",
          "value": 448
        },
        {
          "label": "json_validity_rate",
          "value": 0.875
        },
        {
          "label": "action_macro_f1",
          "value": 0.0026621494447581404
        }
      ],
      "source": "docs/data/omni_finetune_verified_result.json",
      "current_scope": "The selected-episode Qwen3-Omni validation-aware diagnostic pilot is verified, but held-out quality is still weak and JSON validity remains below the 98% target."
    }
  ]
}