2025
Manojlovska, Anastasija; Ramachandra, Raghavendra; Spathoulas, Georgios; Struc, Vitomir; Grm, Klemen Interpreting Face Recognition Templates using Natural Language Descriptions Proceedings Article In: Proceedings of the IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025, pp. 1-10, Tucson, USA, 2025. @inproceedings{Anastasija_WACV25,
title = {Interpreting Face Recognition Templates using Natural Language Descriptions},
author = {Anastasija Manojlovska and Raghavendra Ramachandra and Georgios Spathoulas and Vitomir Struc and Klemen Grm},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2025/01/WACV_2025_RWS_workshop_clanek.pdf},
year = {2025},
date = {2025-03-01},
urldate = {2025-03-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025},
pages = {1-10},
address = {Tucson, USA},
abstract = {Explainable artificial intelligence (XAI) aims to ensure an AI system's decisions are transparent and understandable by humans, which is particularly important in potentially sensitive application scenarios in surveillance, security and law enforcement. In these and related areas, understanding the internal mechanisms governing the decision-making process of AI-based systems can increase trust and, consequently, user acceptance. While various methods have been developed to provide insights into the behavior of AI-based models, solutions capable of explaining different aspects of these models using natural language are still limited in the literature. In this paper, we therefore propose a novel approach for interpreting the information content encoded in face templates produced by state-of-the-art (SOTA) face recognition models. Specifically, we utilize the text encoder from the Contrastive Language-Image Pretraining (CLIP) model and generate natural language descriptions of various face attributes present in the face templates. We implement two versions of our approach: one with the off-the-shelf CLIP text encoder and one with a text encoder fine-tuned on the VGGFace2 and MAADFace datasets. Our experimental results indicate that fine-tuning the text encoder under the contrastive training paradigm increases the attribute-based explainability of face recognition templates, while both models provide valuable human-understandable insights into modern face recognition models.},
keywords = {CLIP, explainability, face recognition, natural language, symbolic representations, xai},
pubstate = {published},
tppubtype = {inproceedings}
}
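The abstract above describes scoring face recognition templates against natural language attribute descriptions via the CLIP text encoder. The snippet below is a minimal sketch of that idea, not the authors' released code: the attribute prompts, the 512-D face template, and the template_to_clip projection that maps a template into CLIP's text-embedding space are all illustrative placeholders (the paper learns this alignment contrastively).

import torch
import clip  # OpenAI CLIP: pip install git+https://github.com/openai/CLIP.git

device = "cpu"
model, _ = clip.load("ViT-B/32", device=device)

# Hypothetical attribute prompts; the paper derives such descriptions from MAADFace labels.
prompts = ["a photo of a young person",
           "a photo of a male person",
           "a photo of a person wearing eyeglasses"]

# Hypothetical linear projection of a 512-D face recognition template into CLIP's text space.
template_to_clip = torch.nn.Linear(512, 512)
face_template = torch.randn(1, 512)          # stand-in for a real FR-model template

with torch.no_grad():
    text_emb = model.encode_text(clip.tokenize(prompts).to(device))
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
    z = template_to_clip(face_template)
    z = z / z.norm(dim=-1, keepdim=True)
    scores = (z @ text_emb.T).squeeze(0)     # cosine similarity: template vs. each prompt

for p, s in zip(prompts, scores.tolist()):
    print(f"{s:+.3f}  {p}")

Higher similarity for a prompt suggests the corresponding attribute is encoded in the template; the paper turns such scores into attribute-level natural language explanations.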
Soltandoost, Elahe; Plesh, Richard; Schuckers, Stephanie; Peer, Peter; Struc, Vitomir Extracting Local Information from Global Representations for Interpretable Deepfake Detection Proceedings Article In: Proceedings of the IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025, pp. 1-11, Tucson, USA, 2025. @inproceedings{Elahe_WACV2025,
title = {Extracting Local Information from Global Representations for Interpretable Deepfake Detection},
author = {Elahe Soltandoost and Richard Plesh and Stephanie Schuckers and Peter Peer and Vitomir Struc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2025/01/ElahePaperF.pdf},
year = {2025},
date = {2025-03-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications in Computer Vision - Workshops (WACV-W) 2025},
pages = {1-11},
address = {Tucson, USA},
abstract = {The detection of deepfakes has become increasingly challenging due to the sophistication of manipulation techniques that produce highly convincing fake videos. Traditional detection methods often lack transparency and provide limited insight into their decision-making processes. To address these challenges, we propose in this paper a Locally-Explainable Self-Blended (LESB) DeepFake detector that, in addition to the final fake-vs-real classification decision, also indicates which local facial region (i.e., eyes, mouth or nose) contributed the most to the decision. At the heart of the detector is a novel Local Feature Discovery (LFD) technique that can be applied to the embedding space of pretrained DeepFake detectors and allows identifying embedding-space directions that encode variations in the appearance of local facial features. We demonstrate the merits of the proposed LFD technique and LESB detector in comprehensive experiments on four popular datasets, i.e., Celeb-DF, DeepFake Detection Challenge, Face Forensics in the Wild and FaceForensics++, and show that the proposed detector is not only competitive with strong baselines, but also exhibits enhanced transparency in the decision-making process by providing insights into the contribution of local face parts to the final detection decision.},
keywords = {CNN, deepfake DAD, deepfakes, faceforensics++, media forensics, xai},
pubstate = {published},
tppubtype = {inproceedings}
}
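The LESB detector described above attributes a fake-vs-real decision to local facial regions via directions in the detector's embedding space. The sketch below illustrates only that attribution step, under stated assumptions: the per-region unit directions and the embeddings are random stand-ins, whereas the paper's Local Feature Discovery technique estimates such directions from a pretrained deepfake detector.

import numpy as np

rng = np.random.default_rng(0)

# Hypothetical unit directions in the detector's embedding space, one per facial region;
# the LFD technique discovers such directions, e.g. from region-wise image manipulations.
directions = {name: rng.normal(size=512) for name in ("eyes", "nose", "mouth")}
directions = {k: v / np.linalg.norm(v) for k, v in directions.items()}

def region_contributions(test_emb, reference_emb):
    """Project the embedding shift of a test sample (relative to a 'real' reference)
    onto each region direction and return the magnitude per region."""
    shift = test_emb - reference_emb
    return {k: float(abs(shift @ v)) for k, v in directions.items()}

test_emb = rng.normal(size=512)       # stand-in for the detector's embedding of a test face
reference_emb = rng.normal(size=512)  # stand-in for a mean embedding of pristine faces
contrib = region_contributions(test_emb, reference_emb)
print("most influential region:", max(contrib, key=contrib.get), contrib)

The region with the largest projection can then be reported alongside the fake-vs-real decision, mirroring the transparency mechanism described in the abstract.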
2024
Manojlovska, Anastasija; Štruc, Vitomir; Grm, Klemen Interpretacija mehanizmov obraznih biometričnih modelov s kontrastnim multimodalnim učenjem Proceedings Article In: Proceedings of ERK 2024, pp. 1-4, Portorož, Slovenia, 2024. @inproceedings{Anastasija_ERK24,
title = {Interpretacija mehanizmov obraznih biometričnih modelov s kontrastnim multimodalnim učenjem},
author = {Anastasija Manojlovska and Vitomir Štruc and Klemen Grm},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2024/10/ERK2024_Copy.pdf},
year = {2024},
date = {2024-09-26},
booktitle = {Proceedings of ERK 2024},
pages = {1-4},
address = {Portorož, Slovenia},
abstract = {Explainable artificial intelligence (XAI) increases the transparency of artificial intelligence systems. This study uses OpenAI's CLIP (Contrastive Language-Image Pretraining) model to recognize facial attributes in the VGGFace2 dataset, using attribute annotations from the MAADFace dataset. By aligning images with natural language descriptions, we recognize attributes such as age, gender and hairstyle, and generate explanations in natural language. We also investigate integrating pretrained face recognition models and adding classification layers to improve attribute classification. The pretrained CLIP model performed best on the Male and Black attributes, reaching AUC values of 0.9891 and 0.9829, respectively.},
keywords = {CNN, deep learning, face recognition, xai},
pubstate = {published},
tppubtype = {inproceedings}
}
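The study above adds classification layers on top of frozen CLIP features and reports per-attribute AUC values. The snippet below is a generic sketch of that evaluation recipe with randomly generated stand-in features and labels; it is not the study's code, and with random data the AUC will hover around 0.5.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Stand-ins for CLIP image features of face crops and a binary attribute label
# (e.g. "Male" from the MAADFace annotations used in the study).
rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(1000, 512)), rng.integers(0, 2, size=1000)
X_test, y_test = rng.normal(size=(200, 512)), rng.integers(0, 2, size=200)

# Linear classification head on frozen features, evaluated with ROC AUC per attribute.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print(f"AUC (Male): {auc:.4f}")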
Plesh, Richard; Križaj, Janez; Bahmani, Keivan; Banavar, Mahesh; Struc, Vitomir; Schuckers, Stephanie Discovering Interpretable Feature Directions in the Embedding Space of Face Recognition Models Proceedings Article In: International Joint Conference on Biometrics (IJCB 2024), pp. 1-10, 2024. @inproceedings{Krizaj,
title = {Discovering Interpretable Feature Directions in the Embedding Space of Face Recognition Models},
author = {Richard Plesh and Janez Križaj and Keivan Bahmani and Mahesh Banavar and Vitomir Struc and Stephanie Schuckers},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2024/08/107.pdf
https://lmi.fe.uni-lj.si/wp-content/uploads/2024/08/107-supp.pdf},
year = {2024},
date = {2024-09-15},
booktitle = {International Joint Conference on Biometrics (IJCB 2024)},
pages = {1-10},
abstract = {Modern face recognition (FR) models, particularly their convolutional neural network-based implementations, often raise concerns regarding privacy and ethics due to their “black-box” nature. To enhance the explainability of FR models and the interpretability of their embedding space, we introduce in this paper three novel techniques for discovering semantically meaningful feature directions (or axes). The first technique uses a dedicated facial-region blending procedure together with principal component analysis to discover embedding-space directions that correspond to spatially isolated semantic face areas, providing a new perspective on facial feature interpretation. The other two proposed techniques exploit attribute labels to discern feature directions that correspond to intra-identity variations, such as pose, illumination angle, and expression, but do so either through a cluster analysis or a dedicated regression procedure. To validate the capabilities of the developed techniques, we utilize a powerful template decoder that inverts the image embedding back into the pixel space. Using the decoder, we visualize linear movements along the discovered directions, enabling a clearer understanding of the internal representations within face recognition models. The source code will be made publicly available.},
keywords = {biometrics, CNN, deep learning, face recognition, feature space understanding, xai},
pubstate = {published},
tppubtype = {inproceedings}
}
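Two of the three direction-discovery techniques summarized above lend themselves to a compact illustration: principal component analysis over embedding shifts caused by a controlled manipulation, and a regression fit whose weight vector serves as an attribute direction. The data, dimensions, and the yaw attribute below are illustrative assumptions, not the authors' setup; in the paper the shifts come from facial-region blending rather than random placeholders.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
emb = rng.normal(size=(500, 512))     # stand-in for FR embeddings of images of one identity
yaw = rng.uniform(-45, 45, size=500)  # stand-in for a per-image intra-identity label (pose)

# Regression-based direction: the weight vector of a linear fit embedding -> attribute
# points along the embedding axis most correlated with that attribute.
reg = LinearRegression().fit(emb, yaw)
pose_direction = reg.coef_ / np.linalg.norm(reg.coef_)

# PCA-based direction: dominant axis of the embedding shifts produced by a manipulation
# (in the paper, blending a single facial region; here, random placeholder shifts).
shifts = rng.normal(size=(500, 512))
region_direction = PCA(n_components=1).fit(shifts).components_[0]

# Stepping along a discovered direction; the paper renders such steps with a template decoder.
edited = emb[0] + 2.0 * pose_direction
print(pose_direction.shape, region_direction.shape, edited.shape)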
Križaj, Janez; Plesh, Richard O.; Banavar, Mahesh; Schuckers, Stephanie; Štruc, Vitomir Deep Face Decoder: Towards understanding the embedding space of convolutional networks through visual reconstruction of deep face templates Journal Article In: Engineering Applications of Artificial Intelligence, vol. 132, iss. 107941, pp. 1-20, 2024. @article{KrizajEAAI2024,
title = {Deep Face Decoder: Towards understanding the embedding space of convolutional networks through visual reconstruction of deep face templates},
author = {Janez Križaj and Richard O. Plesh and Mahesh Banavar and Stephanie Schuckers and Vitomir Štruc},
url = {https://www.sciencedirect.com/science/article/abs/pii/S095219762400099X
https://lmi.fe.uni-lj.si/wp-content/uploads/2025/02/Deep_Face_Decoder__Elsevier_template_.pdf},
doi = {https://doi.org/10.1016/j.engappai.2024.107941},
year = {2024},
date = {2024-01-30},
urldate = {2024-01-30},
journal = {Engineering Applications of Artificial Intelligence},
volume = {132},
issue = {107941},
pages = {1-20},
abstract = {Advances in deep learning and convolutional neural networks (ConvNets) have driven remarkable face recognition (FR) progress recently. However, the black-box nature of modern ConvNet-based face recognition models makes it challenging to interpret their decision-making process, to understand the reasoning behind specific success and failure cases, or to predict their responses to unseen data characteristics. It is, therefore, critical to design mechanisms that explain the inner workings of contemporary FR models and offer insight into their behavior. To address this challenge, we present in this paper a novel template-inversion approach capable of reconstructing high-fidelity face images from the embeddings (templates, feature-space representations) produced by modern FR techniques. Our approach is based on a novel Deep Face Decoder (DFD) trained in a regression setting to visualize the information encoded in the embedding space with the goal of fostering explainability. We utilize the developed DFD model in comprehensive experiments on multiple unconstrained face datasets, namely Visual Geometry Group Face dataset 2 (VGGFace2), Labeled Faces in the Wild (LFW), and Celebrity Faces Attributes Dataset High Quality (CelebA-HQ). Our analysis focuses on the embedding spaces of two distinct face recognition models with backbones based on the Visual Geometry Group 16-layer model (VGG-16) and the 50-layer Residual Network (ResNet-50). The results reveal how information is encoded in the two considered models and how perturbations in image appearance due to rotations, translations, scaling, occlusion, or adversarial attacks are propagated into the embedding space. Our study offers researchers a deeper comprehension of the underlying mechanisms of ConvNet-based FR models, ultimately promoting advancements in model design and explainability.},
keywords = {CNN, embedding space, face, face images, face recognition, face synthesis, template reconstruction, xai},
pubstate = {published},
tppubtype = {article}
}
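The Deep Face Decoder described above is trained in a regression setting to map face templates back to images. The sketch below only shows the shape of such a setup, with a deliberately tiny decoder, random stand-in templates, 32x32 targets, and an L1 reconstruction loss; the actual DFD architecture, resolution, and training details are those of the paper, not this toy.

import torch
import torch.nn as nn

class ToyFaceDecoder(nn.Module):
    """Tiny stand-in for the Deep Face Decoder: maps a 512-D face template to an RGB image."""
    def __init__(self, dim=512):
        super().__init__()
        self.fc = nn.Linear(dim, 256 * 4 * 4)
        self.deconv = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), nn.ReLU(),  # 4x4 -> 8x8
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),   # 8x8 -> 16x16
            nn.ConvTranspose2d(64, 3, 4, stride=2, padding=1), nn.Sigmoid(),  # 16x16 -> 32x32
        )

    def forward(self, z):
        return self.deconv(self.fc(z).view(-1, 256, 4, 4))

decoder = ToyFaceDecoder()
templates = torch.randn(8, 512)     # stand-in for embeddings from a frozen FR backbone
targets = torch.rand(8, 3, 32, 32)  # stand-in for the corresponding aligned face crops
optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)

# One regression-style training step: reconstruct the input image from its template.
loss = nn.functional.l1_loss(decoder(templates), targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print("reconstruction L1 loss:", float(loss))

Once trained on real template-image pairs, such a decoder can visualize how embedding-space perturbations translate back into image appearance, which is how the paper studies the effect of rotations, occlusions, and adversarial attacks.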