{ "description": "Measured audio contribution variants over the single public Xperience-10M sample episode.", "scope": "single public sample episode; chronological split; ridge heads over fixed feature contracts", "raw_audio_metadata": { "source": "local_public_sample/fisheye_cam0.mp4", "exists": true, "has_audio": true, "sample_rate": 16000, "fps": 20.00137419266181, "num_samples": 4656994, "num_windows": 1161, "feature_dim": 588, "mel_bands": 64, "fft_size": 512, "hop_length": 160, "feature_description": "Per-window raw waveform STFT log-mel statistics plus delta and waveform envelope statistics." }, "num_tasks": 12, "variants": { "all_handcrafted_audio": "All Current Features", "all_except_audio": "All Except Audio", "handcrafted_audio_only": "Audio Only", "raw_logmel_audio_only": "Raw Log-Mel Audio Only", "replace_handcrafted_with_raw": "Audio Representation Replacement", "all_plus_raw_logmel": "All Current Features + Raw Log-Mel" }, "task_summaries": [ { "task": "timeline_action", "task_display": "Current Action Recognition", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.00905456968081885, "all_except_audio": 0.008771929824561405, "handcrafted_audio_delta": 0.0002826398562574446, "raw_logmel_audio_only": 0.0, "replace_handcrafted_with_raw": 0.0013495276653171392, "raw_replacement_delta_vs_no_audio": -0.007422402159244265, "raw_replacement_delta_vs_handcrafted": -0.00770504201550171, "all_plus_raw_logmel": 0.002734107997265892, "all_plus_raw_delta_vs_handcrafted": -0.006320461683552957 }, { "task": "timeline_subtask", "task_display": "Current Subtask Recognition", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.011256354393609296, "all_except_audio": 0.0111731843575419, "handcrafted_audio_delta": 8.317003606739606e-05, "raw_logmel_audio_only": 0.0016722408026755855, "replace_handcrafted_with_raw": 0.0008257638315441783, "raw_replacement_delta_vs_no_audio": -0.01034742052599772, "raw_replacement_delta_vs_handcrafted": -0.010430590562065117, "all_plus_raw_logmel": 0.0017889087656529517, "all_plus_raw_delta_vs_handcrafted": -0.009467445627956345 }, { "task": "transition_detection", "task_display": "Action Transition Detection", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.46213292117465227, "all_except_audio": 0.46870229007633585, "handcrafted_audio_delta": -0.006569368901683581, "raw_logmel_audio_only": 0.4637904468412942, "replace_handcrafted_with_raw": 0.4792100707180375, "raw_replacement_delta_vs_no_audio": 0.010507780641701658, "raw_replacement_delta_vs_handcrafted": 0.01707714954338524, "all_plus_raw_logmel": 0.4816233470132239, "all_plus_raw_delta_vs_handcrafted": 0.019490425838571634 }, { "task": "next_action", "task_display": "Next-Action Prediction", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.01058201058201058, "all_except_audio": 0.010709504685408301, "handcrafted_audio_delta": -0.0001274941033977215, "raw_logmel_audio_only": 0.0017301038062283738, "replace_handcrafted_with_raw": 0.006006006006006006, "raw_replacement_delta_vs_no_audio": -0.004703498679402295, "raw_replacement_delta_vs_handcrafted": -0.004576004576004574, "all_plus_raw_logmel": 0.0058479532163742695, "all_plus_raw_delta_vs_handcrafted": -0.00473405736563631 }, { "task": "hand_trajectory_forecast", "task_display": "Future Hand Motion Forecasting", "primary_metric": "mae", "higher_is_better": false, "all_handcrafted_audio": 4.466395378112793, "all_except_audio": 4.303755283355713, "handcrafted_audio_delta": -0.16264009475708008, "raw_logmel_audio_only": 3.1172122955322266, "replace_handcrafted_with_raw": 4.305870532989502, "raw_replacement_delta_vs_no_audio": -0.0021152496337890625, "raw_replacement_delta_vs_handcrafted": 0.16052484512329102, "all_plus_raw_logmel": 4.1367621421813965, "all_plus_raw_delta_vs_handcrafted": 0.3296332359313965 }, { "task": "contact_prediction", "task_display": "Contact State Prediction", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 1.0, "all_except_audio": 1.0, "handcrafted_audio_delta": 0.0, "raw_logmel_audio_only": 1.0, "replace_handcrafted_with_raw": 1.0, "raw_replacement_delta_vs_no_audio": 0.0, "raw_replacement_delta_vs_handcrafted": 0.0, "all_plus_raw_logmel": 1.0, "all_plus_raw_delta_vs_handcrafted": 0.0 }, { "task": "object_relevance", "task_display": "Relevant Object Prediction", "primary_metric": "micro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.15813953488372093, "all_except_audio": 0.14793328498912256, "handcrafted_audio_delta": 0.010206249894598368, "raw_logmel_audio_only": 0.15894868585732164, "replace_handcrafted_with_raw": 0.17871759890859482, "raw_replacement_delta_vs_no_audio": 0.030784313919472256, "raw_replacement_delta_vs_handcrafted": 0.020578064024873888, "all_plus_raw_logmel": 0.18262653898768813, "all_plus_raw_delta_vs_handcrafted": 0.024487004103967203 }, { "task": "caption_grounding", "task_display": "Language-to-Time Grounding", "primary_metric": "mrr", "higher_is_better": true, "all_handcrafted_audio": 0.03208567947149277, "all_except_audio": 0.027228528633713722, "handcrafted_audio_delta": 0.004857150837779045, "raw_logmel_audio_only": 0.014815197326242924, "replace_handcrafted_with_raw": 0.02484782598912716, "raw_replacement_delta_vs_no_audio": -0.002380702644586563, "raw_replacement_delta_vs_handcrafted": -0.007237853482365608, "all_plus_raw_logmel": 0.02719014883041382, "all_plus_raw_delta_vs_handcrafted": -0.004895530641078949 }, { "task": "cross_modal_retrieval", "task_display": "Cross-Modal Window Retrieval", "primary_metric": "mrr", "higher_is_better": true, "all_handcrafted_audio": 0.3751238286495209, "all_except_audio": 0.38921058177948, "handcrafted_audio_delta": -0.014086753129959106, "raw_logmel_audio_only": 0.01806792803108692, "replace_handcrafted_with_raw": 0.32749155163764954, "raw_replacement_delta_vs_no_audio": -0.061719030141830444, "raw_replacement_delta_vs_handcrafted": -0.04763227701187134, "all_plus_raw_logmel": 0.31795138120651245, "all_plus_raw_delta_vs_handcrafted": -0.05717244744300842 }, { "task": "modality_reconstruction", "task_display": "Sensor-to-Visual Reconstruction", "primary_metric": "mae", "higher_is_better": false, "all_handcrafted_audio": 9.79421329498291, "all_except_audio": 10.446661949157715, "handcrafted_audio_delta": 0.6524486541748047, "raw_logmel_audio_only": 2.6225292682647705, "replace_handcrafted_with_raw": 8.830678939819336, "raw_replacement_delta_vs_no_audio": 1.615983009338379, "raw_replacement_delta_vs_handcrafted": 0.9635343551635742, "all_plus_raw_logmel": 8.392388343811035, "all_plus_raw_delta_vs_handcrafted": 1.401824951171875 }, { "task": "temporal_order", "task_display": "Temporal Order Verification", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.5172413793103449, "all_except_audio": 0.4942528735632184, "handcrafted_audio_delta": 0.022988505747126464, "raw_logmel_audio_only": 0.5028735632183908, "replace_handcrafted_with_raw": 0.5301714439065678, "raw_replacement_delta_vs_no_audio": 0.03591857034334939, "raw_replacement_delta_vs_handcrafted": 0.012930064596222923, "all_plus_raw_logmel": 0.5330450130569861, "all_plus_raw_delta_vs_handcrafted": 0.015803633746641288 }, { "task": "misalignment_detection", "task_display": "Cross-Modal Misalignment Detection", "primary_metric": "macro_f1", "higher_is_better": true, "all_handcrafted_audio": 0.41734045375379186, "all_except_audio": 0.42258557365378524, "handcrafted_audio_delta": -0.005245119899993378, "raw_logmel_audio_only": 0.47823544277887897, "replace_handcrafted_with_raw": 0.44378951880827355, "raw_replacement_delta_vs_no_audio": 0.021203945154488313, "raw_replacement_delta_vs_handcrafted": 0.02644906505448169, "all_plus_raw_logmel": 0.4373795761078998, "all_plus_raw_delta_vs_handcrafted": 0.02003912235410793 } ], "aggregate": { "mean_handcrafted_audio_delta": 0.041849794979543296, "tasks_where_handcrafted_audio_improves": 6, "mean_raw_replacement_delta_vs_handcrafted": 0.09362598132150173, "tasks_where_raw_replacement_improves_over_handcrafted": 6 }, "provenance": { "suite_dir": "results/episode_task_suite", "shared_windows": "results/episode_task_suite/shared_windows.npz", "feature_manifest": "results/episode_task_suite/feature_manifest.json", "audio_source": "local_public_sample/fisheye_cam0.mp4", "annotation_source": "local_public_sample/annotation.hdf5", "homie_toolkit_available": true } }