2025
|
Soltandoost, Elahe; Plesh, Richard; Schuckers, Stephanie; Peer, Peter; Štruc, Vitomir Extracting Local Information from Global Representations for Interpretable Deepfake Detection Proceedings Article In: Proceedings of IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025, pp. 1-11, Tucson, USA, 2025. @inproceedings{Elahe_WACV2025,
title = {Extracting Local Information from Global Representations for Interpretable Deepfake Detection},
author = {Elahe Soltandoost and Richard Plesh and Stephanie Schuckers and Peter Peer and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2025/01/ElahePaperF.pdf},
year = {2025},
date = {2025-03-01},
booktitle = {Proceedings of IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025},
pages = {1--11},
address = {Tucson, USA},
abstract = {The detection of deepfakes has become increasingly challenging due to the sophistication of manipulation techniques that produce highly convincing fake videos. Traditional detection methods often lack transparency and provide limited insight into their decision-making processes. To address these challenges, we propose in this paper a Locally-Explainable Self-Blended (LESB) DeepFake detector that in addition to the final fake-vs-real classification decision also provides information on which local facial region (i.e., eyes, mouth or nose) contributed the most to the decision process. At the heart of the detector is a novel Local Feature Discovery (LFD) technique that can be applied to the embedding space of pretrained DeepFake detectors and allows identifying embedding space directions that encode variations in the appearance of local facial features. We demonstrate the merits of the proposed LFD technique and LESB detector in comprehensive experiments on four popular datasets, i.e., Celeb-DF, DeepFake Detection Challenge, Face Forensics in the Wild and FaceForensics++, and show that the proposed detector is not only competitive in comparison to strong baselines, but also exhibits enhanced transparency in the decision-making process by providing insights on the contribution of local face parts in the final detection decision.},
keywords = {CNN, deepfake DAD, deepfakes, faceforensics++, media forensics, xai},
pubstate = {published},
tppubtype = {inproceedings}
}
The detection of deepfakes has become increasingly challenging due to the sophistication of manipulation techniques that produce highly convincing fake videos. Traditional detection methods often lack transparency and provide limited insight into their decision-making processes. To address these challenges, we propose in this paper a Locally-Explainable Self-Blended (LESB) DeepFake detector that in addition to the final fake-vs-real classification decision also provides information on which local facial region (i.e., eyes, mouth or nose) contributed the most to the decision process. At the heart of the detector is a novel Local Feature Discovery (LFD) technique that can be applied to the embedding space of pretrained DeepFake detectors and allows identifying embedding space directions that encode variations in the appearance of local facial features. We demonstrate the merits of the proposed LFD technique and LESB detector in comprehensive experiments on four popular datasets, i.e., Celeb-DF, DeepFake Detection Challenge, Face Forensics in the Wild and FaceForensics++, and show that the proposed detector is not only competitive in comparison to strong baselines, but also exhibits enhanced transparency in the decision-making process by providing insights on the contribution of local face parts in the final detection decision. |
2024
|
Dragar, Luka; Rot, Peter; Peer, Peter; Štruc, Vitomir; Batagelj, Borut W-TDL: Window-Based Temporal Deepfake Localization Proceedings Article In: Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing (MRAC ’24), Proceedings of the 32nd ACM International Conference on Multimedia (MM’24), ACM, 2024. @inproceedings{MRAC2024,
  title     = {W-TDL: Window-Based Temporal Deepfake Localization},
  author    = {Luka Dragar and Peter Rot and Peter Peer and Vitomir Štruc and Borut Batagelj},
  url       = {https://lmi.fe.uni-lj.si/wp-content/uploads/2024/09/ACM_1M_DeepFakes.pdf},
  year      = {2024},
  date      = {2024-11-01},
  booktitle = {Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing (MRAC ’24), Proceedings of the 32nd ACM International Conference on Multimedia (MM’24)},
  publisher = {ACM},
  abstract  = {The quality of synthetic data has advanced to such a degree of realism that distinguishing it from genuine data samples is increasingly challenging. Deepfake content, including images, videos, and audio, is often used maliciously, necessitating effective detection methods. While numerous competitions have propelled the development of deepfake detectors, a significant gap remains in accurately pinpointing the temporal boundaries of manipulations. Addressing this, we propose an approach for temporal deepfake localization (TDL) utilizing a window-based method for audio (W-TDL) and a complementary visual frame-based model. Our contributions include an effective method for detecting and localizing fake video and audio segments and addressing unbalanced training labels in spoofed audio datasets. Our approach leverages the EVA visual transformer for frame-level analysis and a modified TDL method for audio, achieving competitive results in the 1M-DeepFakes Detection Challenge. Comprehensive experiments on the AV-Deepfake1M dataset demonstrate the effectiveness of our method, providing an effective solution to detect and localize deepfake manipulations.},
  keywords  = {CNN, deepfake DAD, deepfakes, deeplearning, detection, localization},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
The quality of synthetic data has advanced to such a degree of realism that distinguishing it from genuine data samples is increasingly challenging. Deepfake content, including images, videos, and audio, is often used maliciously, necessitating effective detection methods. While numerous competitions have propelled the development of deepfake detectors, a significant gap remains in accurately pinpointing the temporal boundaries of manipulations. Addressing this, we propose an approach for temporal deepfake localization (TDL) utilizing a window-based method for audio (W-TDL) and a complementary visual frame-based model. Our contributions include an effective method for detecting and localizing fake video and audio segments and addressing unbalanced training labels in spoofed audio datasets. Our approach leverages the EVA visual transformer for frame-level analysis and a modified TDL method for audio, achieving competitive results in the 1M-DeepFakes Detection Challenge. Comprehensive experiments on the AV-Deepfake1M dataset demonstrate the effectiveness of our method, providing an effective solution to detect and localize deepfake manipulations. |
Brodarič, Marko; Peer, Peter; Štruc, Vitomir Towards Improving Backbones for Deepfake Detection Proceedings Article In: Proceedings of ERK 2024, pp. 1-4, 2024. @inproceedings{ERK_2024_Deepfakes,
title = {Towards Improving Backbones for Deepfake Detection},
author = {Marko Brodarič and Peter Peer and Vitomir Štruc},
year = {2024},
date = {2024-09-25},
booktitle = {Proceedings of ERK 2024},
pages = {1--4},
keywords = {CNN, deep learning, deepfake detection, deepfakes, media forensics, transformer},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Rot, Peter; Terhorst, Philipp; Peer, Peter; Štruc, Vitomir ASPECD: Adaptable Soft-Biometric Privacy-Enhancement Using Centroid Decoding for Face Verification Proceedings Article In: Proceedings of the IEEE International Conference on Automatic Face and Gesture Recognition (FG), pp. 1-9, 2024. @inproceedings{Rot_FG2024,
title = {ASPECD: Adaptable Soft-Biometric Privacy-Enhancement Using Centroid Decoding for Face Verification},
author = {Peter Rot and Philipp Terhorst and Peter Peer and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2024/03/PeterRot_FG2024.pdf},
year = {2024},
date = {2024-05-28},
booktitle = {Proceedings of the IEEE International Conference on Automatic Face and Gesture Recognition (FG)},
pages = {1--9},
abstract = {State-of-the-art face recognition models commonly extract information-rich biometric templates from the input images that are then used for comparison purposes and identity inference. While these templates encode identity information in a highly discriminative manner, they typically also capture other potentially sensitive facial attributes, such as age, gender or ethnicity. To address this issue, Soft-Biometric Privacy-Enhancing Techniques (SB-PETs) were proposed in the literature that aim to suppress such attribute information, and, in turn, alleviate the privacy risks associated with the extracted biometric templates. While various SB-PETs were presented so far, existing approaches do not provide dedicated mechanisms to determine which soft-biometrics to exclude and which to retain. In this paper, we address this gap and introduce ASPECD, a modular framework designed to selectively suppress binary and categorical soft-biometrics based on users' privacy preferences. ASPECD consists of multiple sequentially connected components, each dedicated for privacy-enhancement of an individual soft-biometric attribute. The proposed framework suppresses attribute information using a Moment-based Disentanglement process coupled with a centroid decoding procedure, ensuring that the privacy-enhanced templates are directly comparable to the templates in the original embedding space, regardless of the soft-biometric modality being suppressed.
To validate the performance of ASPECD, we conduct experiments on a large-scale face dataset and with five state-of-the-art face recognition models, demonstrating the effectiveness of the proposed approach in suppressing single and multiple soft-biometric attributes. Our approach achieves a competitive privacy-utility trade-off compared to the state-of-the-art methods in scenarios that involve enhancing privacy w.r.t. gender and ethnicity attributes. Source code will be made publicly available.},
keywords = {deepfake, deepfakes, face, face analysis, face deidentification, face image processing, face images, face synthesis, face verification, privacy, privacy enhancement, privacy protection, privacy-enhancing techniques, soft biometric privacy, soft biometrics},
pubstate = {published},
tppubtype = {inproceedings}
}
State-of-the-art face recognition models commonly extract information-rich biometric templates from the input images that are then used for comparison purposes and identity inference. While these templates encode identity information in a highly discriminative manner, they typically also capture other potentially sensitive facial attributes, such as age, gender or ethnicity. To address this issue, Soft-Biometric Privacy-Enhancing Techniques (SB-PETs) were proposed in the literature that aim to suppress such attribute information, and, in turn, alleviate the privacy risks associated with the extracted biometric templates. While various SB-PETs were presented so far, existing approaches do not provide dedicated mechanisms to determine which soft-biometrics to exclude and which to retain. In this paper, we address this gap and introduce ASPECD, a modular framework designed to selectively suppress binary and categorical soft-biometrics based on users' privacy preferences. ASPECD consists of multiple sequentially connected components, each dedicated for privacy-enhancement of an individual soft-biometric attribute. The proposed framework suppresses attribute information using a Moment-based Disentanglement process coupled with a centroid decoding procedure, ensuring that the privacy-enhanced templates are directly comparable to the templates in the original embedding space, regardless of the soft-biometric modality being suppressed.
To validate the performance of ASPECD, we conduct experiments on a large-scale face dataset and with five state-of-the-art face recognition models, demonstrating the effectiveness of the proposed approach in suppressing single and multiple soft-biometric attributes. Our approach achieves a competitive privacy-utility trade-off compared to the state-of-the-art methods in scenarios that involve enhancing privacy w.r.t. gender and ethnicity attributes. Source code will be made publicly available. |
Brodarič, Marko; Peer, Peter; Štruc, Vitomir Cross-Dataset Deepfake Detection: Evaluating the Generalization Capabilities of Modern DeepFake Detectors Proceedings Article In: Proceedings of the 27th Computer Vision Winter Workshop (CVWW), pp. 1-10, 2024. @inproceedings{MarkoCVWW,
title = {Cross-Dataset Deepfake Detection: Evaluating the Generalization Capabilities of Modern DeepFake Detectors},
author = {Marko Brodarič and Peter Peer and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2024/01/MarkoCVWW24_compressed.pdf},
year = {2024},
date = {2024-01-31},
booktitle = {Proceedings of the 27th Computer Vision Winter Workshop (CVWW)},
pages = {1--10},
abstract = {Due to the recent advances in generative deep learning, numerous techniques have been proposed in the literature that allow for the creation of so-called deepfakes, i.e., forged facial images commonly used for malicious purposes. These developments have triggered a need for effective deepfake detectors, capable of identifying forged and manipulated imagery as robustly as possible. While a considerable number of detection techniques has been proposed over the years, generalization across a wide spectrum of deepfake-generation techniques still remains an open problem. In this paper, we study a representative set of deepfake generation methods and analyze their performance in a cross-dataset setting with the goal of better understanding the reasons behind the observed generalization performance. To this end, we conduct a comprehensive analysis on the FaceForensics++ dataset and adopt Gradient-weighted Class Activation Mappings (Grad-CAM) to provide insights into the behavior of the evaluated detectors. Since a new class of deepfake generation techniques based on diffusion models recently appeared in the literature, we introduce a new subset of the FaceForensics++ dataset with diffusion-based deepfake and include it in our analysis. The results of our experiments show that most detectors overfit to the specific image artifacts induced by a given deepfake-generation model and mostly focus on local image areas where such artifacts can be expected. Conversely, good generalization appears to be correlated with class activations that cover a broad spatial area and hence capture different image artifacts that appear in various parts of the facial region.},
keywords = {data integrity, deepfake, deepfake detection, deepfakes, diffusion, face, faceforensics++, media forensics},
pubstate = {published},
tppubtype = {inproceedings}
}
Due to the recent advances in generative deep learning, numerous techniques have been proposed in the literature that allow for the creation of so-called deepfakes, i.e., forged facial images commonly used for malicious purposes. These developments have triggered a need for effective deepfake detectors, capable of identifying forged and manipulated imagery as robustly as possible. While a considerable number of detection techniques has been proposed over the years, generalization across a wide spectrum of deepfake-generation techniques still remains an open problem. In this paper, we study a representative set of deepfake generation methods and analyze their performance in a cross-dataset setting with the goal of better understanding the reasons behind the observed generalization performance. To this end, we conduct a comprehensive analysis on the FaceForensics++ dataset and adopt Gradient-weighted Class Activation Mappings (Grad-CAM) to provide insights into the behavior of the evaluated detectors. Since a new class of deepfake generation techniques based on diffusion models recently appeared in the literature, we introduce a new subset of the FaceForensics++ dataset with diffusion-based deepfake and include it in our analysis. The results of our experiments show that most detectors overfit to the specific image artifacts induced by a given deepfake-generation model and mostly focus on local image areas where such artifacts can be expected. Conversely, good generalization appears to be correlated with class activations that cover a broad spatial area and hence capture different image artifacts that appear in various parts of the facial region. |
2023
|
Larue, Nicolas; Vu, Ngoc-Son; Štruc, Vitomir; Peer, Peter; Christophides, Vassilis SeeABLE: Soft Discrepancies and Bounded Contrastive Learning for Exposing Deepfakes Proceedings Article In: Proceedings of the International Conference on Computer Vision (ICCV), pp. 21011-21021, IEEE 2023. @inproceedings{NicolasCCV,
title = {SeeABLE: Soft Discrepancies and Bounded Contrastive Learning for Exposing Deepfakes},
author = {Nicolas Larue and Ngoc-Son Vu and Vitomir Štruc and Peter Peer and Vassilis Christophides},
url = {https://openaccess.thecvf.com/content/ICCV2023/papers/Larue_SeeABLE_Soft_Discrepancies_and_Bounded_Contrastive_Learning_for_Exposing_Deepfakes_ICCV_2023_paper.pdf
https://lmi.fe.uni-lj.si/wp-content/uploads/2024/01/SeeABLE_compressed.pdf
https://lmi.fe.uni-lj.si/wp-content/uploads/2024/01/SeeABLE_supplementary_compressed.pdf},
year = {2023},
date = {2023-10-01},
urldate = {2023-10-01},
booktitle = {Proceedings of the International Conference on Computer Vision (ICCV)},
pages = {21011--21021},
organization = {IEEE},
abstract = {Modern deepfake detectors have achieved encouraging results, when training and test images are drawn from the same data collection. However, when these detectors are applied to images produced with unknown deepfake-generation techniques, considerable performance degradations are commonly observed. In this paper, we propose a novel deepfake detector, called SeeABLE, that formalizes the detection problem as a (one-class) out-of-distribution detection task and generalizes better to unseen deepfakes. Specifically, SeeABLE first generates local image perturbations (referred to as soft-discrepancies) and then pushes the perturbed faces towards predefined prototypes using a novel regression-based bounded contrastive loss. To strengthen the generalization performance of SeeABLE to unknown deepfake types, we generate a rich set of soft discrepancies and train the detector: (i) to localize, which part of the face was modified, and (ii) to identify the alteration type. To demonstrate the capabilities of SeeABLE, we perform rigorous experiments on several widely-used deepfake datasets and show that our model convincingly outperforms competing state-of-the-art detectors, while exhibiting highly encouraging generalization capabilities. The source code for SeeABLE is available from: https://github.com/anonymous-author-sub/seeable.},
keywords = {CNN, deepfake detection, deepfakes, face, media forensics, one-class learning, representation learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Modern deepfake detectors have achieved encouraging results, when training and test images are drawn from the same data collection. However, when these detectors are applied to images produced with unknown deepfake-generation techniques, considerable performance degradations are commonly observed. In this paper, we propose a novel deepfake detector, called SeeABLE, that formalizes the detection problem as a (one-class) out-of-distribution detection task and generalizes better to unseen deepfakes. Specifically, SeeABLE first generates local image perturbations (referred to as soft-discrepancies) and then pushes the perturbed faces towards predefined prototypes using a novel regression-based bounded contrastive loss. To strengthen the generalization performance of SeeABLE to unknown deepfake types, we generate a rich set of soft discrepancies and train the detector: (i) to localize, which part of the face was modified, and (ii) to identify the alteration type. To demonstrate the capabilities of SeeABLE, we perform rigorous experiments on several widely-used deepfake datasets and show that our model convincingly outperforms competing state-of-the-art detectors, while exhibiting highly encouraging generalization capabilities. The source code for SeeABLE is available from: https://github.com/anonymous-author-sub/seeable.
|
Peng, Bo; Sun, Xianyun; Wang, Caiyong; Wang, Wei; Dong, Jing; Sun, Zhenan; Zhang, Rongyu; Cong, Heng; Fu, Lingzhi; Wang, Hao; Zhang, Yusheng; Zhang, HanYuan; Zhang, Xin; Liu, Boyuan; Ling, Hefei; Dragar, Luka; Batagelj, Borut; Peer, Peter; Štruc, Vitomir; Zhou, Xinghui; Liu, Kunlin; Feng, Weitao; Zhang, Weiming; Wang, Haitao; Diao, Wenxiu DFGC-VRA: DeepFake Game Competition on Visual Realism Assessment Proceedings Article In: IEEE International Joint Conference on Biometrics (IJCB 2023), pp. 1-9, Ljubljana, Slovenia, 2023. @inproceedings{Deepfake_comp2023,
title = {DFGC-VRA: DeepFake Game Competition on Visual Realism Assessment},
author = {Bo Peng and Xianyun Sun and Caiyong Wang and Wei Wang and Jing Dong and Zhenan Sun and Rongyu Zhang and Heng Cong and Lingzhi Fu and Hao Wang and Yusheng Zhang and HanYuan Zhang and Xin Zhang and Boyuan Liu and Hefei Ling and Luka Dragar and Borut Batagelj and Peter Peer and Vitomir Štruc and Xinghui Zhou and Kunlin Liu and Weitao Feng and Weiming Zhang and Haitao Wang and Wenxiu Diao},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/CameraReady-225.pdf},
year = {2023},
date = {2023-09-01},
booktitle = {IEEE International Joint Conference on Biometrics (IJCB 2023)},
pages = {1--9},
address = {Ljubljana, Slovenia},
abstract = {This paper presents the summary report on the DeepFake Game Competition on Visual Realism Assessment (DFGC-VRA). Deep-learning based face-swap videos, also known as deepfakes, are becoming more and more realistic and deceiving. The malicious usage of these face-swap videos has caused wide concerns. There is an ongoing deepfake game between its creators and detectors, with the human in the loop. The research community has been focusing on the automatic detection of these fake videos, but the assessment of their visual realism, as perceived by human eyes, is still an unexplored dimension. Visual realism assessment, or VRA, is essential for assessing the potential impact that may be brought by a specific face-swap video, and it is also useful as a quality metric to compare different face-swap methods. This is the third edition of DFGC competitions, which focuses on the new visual realism assessment topic, different from previous ones that compete creators versus detectors. With this competition, we conduct a comprehensive study of the SOTA performance on the new task. We also release our MindSpore codes to further facilitate research in this field (https://github.com/bomb2peng/DFGC-VRA-benckmark). Jing Dong (jdong@nlpr.ia.ac.cn) is the corresponding author.},
keywords = {competition IJCB, deepfake detection, deepfakes, face, realism assessment},
pubstate = {published},
tppubtype = {inproceedings}
}
This paper presents the summary report on the DeepFake Game Competition on Visual Realism Assessment (DFGC-VRA). Deep-learning based face-swap videos, also known as deepfakes, are becoming more and more realistic and deceiving. The malicious usage of these face-swap videos has caused wide concerns. There is an ongoing deepfake game between its creators and detectors, with the human in the loop. The research community has been focusing on the automatic detection of these fake videos, but the assessment of their visual realism, as perceived by human eyes, is still an unexplored dimension. Visual realism assessment, or VRA, is essential for assessing the potential impact that may be brought by a specific face-swap video, and it is also useful as a quality metric to compare different face-swap methods. This is the third edition of DFGC competitions, which focuses on the new visual realism assessment topic, different from previous ones that compete creators versus detectors. With this competition, we conduct a comprehensive study of the SOTA performance on the new task. We also release our MindSpore codes to further facilitate research in this field (https://github.com/bomb2peng/DFGC-VRA-benckmark). Jing Dong (jdong@nlpr.ia.ac.cn) is the corresponding author. |
2021
|
Ivanovska, Marija; Štruc, Vitomir A Comparative Study on Discriminative and One-Class Learning Models for Deepfake Detection Proceedings Article In: Proceedings of ERK 2021, pp. 1–4, 2021. @inproceedings{ERK_Marija_2021,
title = {A Comparative Study on Discriminative and One-Class Learning Models for Deepfake Detection},
author = {Marija Ivanovska and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2021/10/ERK_2021__A_Comparative_Study_of_Discriminative_and_One__Class_Learning_Models_for_Deepfake_Detection.pdf},
year = {2021},
date = {2021-09-20},
booktitle = {Proceedings of ERK 2021},
pages = {1--4},
abstract = {Deepfakes or manipulated face images, where a donor's face is swapped with the face of a target person, have gained enormous popularity among the general public recently. With the advancements in artificial intelligence and generative modeling such images can nowadays be easily generated and used to spread misinformation and harm individuals, businesses or society. As the tools for generating deepfakes are rapidly improving, it is critical for deepfake detection models to be able to recognize advanced, sophisticated data manipulations, including those that have not been seen during training. In this paper, we explore the use of one-class learning models as an alternative to discriminative methods for the detection of deepfakes. We conduct a comparative study with three popular deepfake datasets and investigate the performance of selected (discriminative and one-class) detection models in matched- and cross-dataset experiments. Our results show that discriminative models significantly outperform one-class models when training and testing data come from the same dataset, but degrade considerably when the characteristics of the testing data deviate from the training setting. In such cases, one-class models tend to generalize much better.},
keywords = {biometrics, comparative study, computer vision, deepfake detection, deepfakes, detection, face, one-class learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Deepfakes or manipulated face images, where a donor's face is swapped with the face of a target person, have gained enormous popularity among the general public recently. With the advancements in artificial intelligence and generative modeling such images can nowadays be easily generated and used to spread misinformation and harm individuals, businesses or society. As the tools for generating deepfakes are rapidly improving, it is critical for deepfake detection models to be able to recognize advanced, sophisticated data manipulations, including those that have not been seen during training. In this paper, we explore the use of one-class learning models as an alternative to discriminative methods for the detection of deepfakes. We conduct a comparative study with three popular deepfake datasets and investigate the performance of selected (discriminative and one-class) detection models in matched- and cross-dataset experiments. Our results show that discriminative models significantly outperform one-class models when training and testing data come from the same dataset, but degrade considerably when the characteristics of the testing data deviate from the training setting. In such cases, one-class models tend to generalize much better. |