Detecting critical moments, such as emotional outbursts or changes in decisions during conversations, is crucial for understanding shifts in human behavior and their consequences. Our work introduces a novel problem setting that focuses on these moments as "turning points (TPs)," accompanied by a meticulously curated, high-consensus, human-annotated multi-modal dataset. We provide precise timestamps, descriptions, and visual-textual evidence highlighting changes in emotions, behaviors, perspectives, and decisions at these turning points. Additionally, we propose a framework, TPMaven, which utilizes state-of-the-art vision-language models to construct a narrative from the videos and large language models to classify and detect turning points in our multi-modal dataset. Evaluation results show that TPMaven achieves an F1-score of 0.88 in classification and 0.61 in detection, with additional explanations aligning with human expectations.
Statistic | Value |
---|---|
Total number of conversation videos | 340 |
Total duration (h) | 13.3 |
Total number of utterance-level videos | 12,351 |
Total number of words in all transcripts | 81,909 |
Average length of conversation transcripts (words) | 241.5 |
Maximum length of conversation transcripts (words) | 460 |
Average length of conversation videos (m) | 1.9 |
Maximum length of conversation videos (m) | 2.5 |
Total number of TP videos | 214 |
Here is the Data Usage Agreement. Please sign it and send the signed document back to this email: dinhhogiabao@gmail.com with the subject line: "Request Access for the MTP Dataset." We will send the full dataset link to your email upon receiving the signed document.
The full dataset comprises 340 conversations, totaling approximately 13.3 hours of video content. The dataset will include additional utterance-level videos, transcripts, speaker IDs, and annotation files for turning points. Currently, we have provided some sample files in this link to enhance the reviewing process.
% NOTE(review): a television series is not a journal article; CBS is the
% broadcasting network, not a journal. @misc with howpublished describes
% the medium correctly. Names use the unambiguous "Last, First" form.
@misc{bigbangtheory,
  title        = {The {Big Bang Theory}},
  author       = {Lorre, Chuck and Prady, Bill},
  year         = {2007},
  howpublished = {Television series, CBS},
  url          = {https://www.cbs.com/shows/big_bang_theory/},
}
% ACL Anthology export; "high-lighting" (a PDF line-break hyphenation
% artifact) fixed in the abstract, and the Anthology DOI added so styles
% can prefer it over the bare URL.
@inproceedings{ho-etal-2024-mtp,
    title = "{MTP}: A Dataset for Multi-Modal Turning Points in Casual Conversations",
    author = "Ho, Gia-Bao and Tan, Chang and Darban, Zahra and Salehi, Mahsa and Haf, Reza and Buntine, Wray",
    editor = "Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-short.30",
    doi = "10.18653/v1/2024.acl-short.30",
    pages = "314--326",
    abstract = "Detecting critical moments, such as emotional outbursts or changes in decisions during conversations, is crucial for understanding shifts in human behavior and their consequences. Our work introduces a novel problem setting focusing on these moments as turning points (TPs), accompanied by a meticulously curated, high-consensus, human-annotated multi-modal dataset. We provide precise timestamps, descriptions, and visual-textual evidence highlighting changes in emotions, behaviors, perspectives, and decisions at these turning points. We also propose a framework, TPMaven, utilizing state-of-the-art vision-language models to construct a narrative from the videos and large language models to classify and detect turning points in our multi-modal dataset. Evaluation results show that TPMaven achieves an F1-score of 0.88 in classification and 0.61 in detection, with additional explanations aligning with human expectations.",
}
% NOTE(review): "journal = {arXiv preprint arXiv:...}" is an auto-export
% anti-pattern; the arXiv identifier belongs in eprint/archiveprefix fields
% and the entry should be @misc (no journal exists). The {MTP} acronym is
% braced so sentence-casing styles cannot lowercase it.
@misc{ho2024mtp,
  title         = {{MTP}: A Dataset for Multi-Modal Turning Points in Casual Conversations},
  author        = {Ho, Gia-Bao Dinh and Tan, Chang Wei and Darban, Zahra Zamanzadeh and Salehi, Mahsa and Haffari, Gholamreza and Buntine, Wray},
  year          = {2024},
  eprint        = {2409.14801},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2409.14801},
}