---
{
    "name": "E-CelebV-HQ",
    "aliases": [],
    "year": 2025,
    "modalities": [
        "Vision"
    ],
    "sensors": [
        "V2E"
    ],
    "other_sensors": [],
    "category": "Human-centric Recordings",
    "tags": [],
    "description": "Synthetic event dataset for facial keypoint alignment, generated from CelebV-HQ videos with the v2e simulator",
    "dataset_properties": {
        "available_online": false,
        "has_real_data": false,
        "has_simulated_data": true,
        "has_ground_truth": true,
        "has_frames": true,
        "has_biases": false,
        "distribution_methods": [],
        "file_formats": [],
        "availability_comment": "",
        "dataset_links": [],
        "size_gb": 0.0,
        "size_type": "Compressed"
    },
    "paper": {
        "title": "Event-based Facial Keypoint Alignment via Cross-Modal Fusion Attention and Self-Supervised Multi-Event Representation Learning",
        "doi": "10.48550/arXiv.2509.24968",
        "authors": [
            "Donghwa Kang",
            "Junho Kim",
            "Dongwoo Kang"
        ],
        "abstract": "Event cameras offer unique advantages for facial keypoint alignment under challenging conditions, such as low light and rapid motion, due to their high temporal resolution and robustness to varying illumination. However, existing RGB facial keypoint alignment methods do not perform well on event data, and training solely on event data often leads to suboptimal performance because of its limited spatial information. Moreover, the lack of comprehensive labeled event datasets further hinders progress in this area. To address these issues, we propose a novel framework based on cross-modal fusion attention (CMFA) and self-supervised multi-event representation learning (SSMER) for event-based facial keypoint alignment. Our framework employs CMFA to integrate corresponding RGB data, guiding the model to extract robust facial features from event input images. In parallel, SSMER enables effective feature learning from unlabeled event data, overcoming spatial limitations. Extensive experiments on our real-event E-SIE dataset and a synthetic-event version of the public WFLW-V benchmark show that our approach consistently surpasses state-of-the-art methods across multiple evaluation metrics.",
        "open_access": false
    },
    "citation_counts": [],
    "links": [
        {
            "type": "preprint",
            "url": "https://arxiv.org/abs/2509.24968"
        }
    ],
    "full_name": "",
    "additional_metadata": {
        "num_recordings": "35664",
        "source_dataset": "CelebV-HQ"
    },
    "bibtex": {
        "copyright": "Creative Commons Attribution 4.0 International",
        "year": 2025,
        "publisher": "arXiv",
        "title": "Event-based Facial Keypoint Alignment via Cross-Modal Fusion Attention and Self-Supervised Multi-Event Representation Learning",
        "keywords": "Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences",
        "author": "Kang, Donghwa and Kim, Junho and Kang, Dongwoo",
        "url": "https://arxiv.org/abs/2509.24968",
        "doi": "10.48550/ARXIV.2509.24968",
        "type": "misc",
        "key": "https://doi.org/10.48550/arxiv.2509.24968"
    }
}
---

# Dataset Description

The synthetic dataset E-CelebV-HQ was constructed to serve as the primary large-scale dataset for training and ablation studies. Events were generated with the v2e simulator using frame interpolation and an event threshold of 0.2. A total of 35,664 event streams were produced from the CelebV-HQ videos, segmented at 25 fps, and converted into three event representations: frame, voxel, and time surface. Segments with minimal motion were observed to produce very few events; to mitigate this and ensure data quality, the single frame with the highest event count was selected from each video. The corresponding RGB frames were then annotated with 98 facial keypoints predicted by SLPT, which served as pseudo ground-truth labels. The curated data were divided into 28,531 samples for training, 713 for validation, and 6,420 for testing. To obtain a high-confidence evaluation subset, the pseudo labels of the test split were manually verified, yielding a final set of 1,554 retained images.
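
The per-video curation step (keeping only the segment with the highest event count) can be illustrated with a minimal sketch. This is not the authors' released code; it assumes each event stream is available as an `(N, 4)` NumPy array of `(t, x, y, p)` rows with timestamps in seconds, and the function names are hypothetical.

```python
import numpy as np


def segment_event_counts(events: np.ndarray, fps: float = 25.0) -> np.ndarray:
    """Count events per fixed-length segment of 1/fps seconds.

    `events` is assumed to be an (N, 4) array of (t, x, y, p) rows with
    timestamps `t` in seconds; the 25 fps default mirrors the segmentation
    described above.
    """
    seg_len = 1.0 / fps
    t = events[:, 0]
    seg_idx = np.floor((t - t.min()) / seg_len).astype(int)
    return np.bincount(seg_idx)


def select_densest_segment(events: np.ndarray, fps: float = 25.0) -> int:
    """Return the index of the segment with the most events, mirroring the
    per-video selection used to discard low-motion clips."""
    return int(np.argmax(segment_event_counts(events, fps)))


if __name__ == "__main__":
    # Synthetic example: a 2 s stream in which most events fall in one burst.
    rng = np.random.default_rng(0)
    t = np.sort(np.concatenate([rng.uniform(0.0, 2.0, 500),
                                rng.uniform(1.20, 1.24, 2000)]))
    xyp = rng.integers(0, 256, size=(t.size, 3))
    events = np.column_stack([t, xyp]).astype(np.float64)
    print("densest segment index:", select_densest_segment(events))
```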