2026
Ivanovska, Marija; Todorov, Leon; Peer, Peter; Štruc, Vitomir: SelfMAD++: Self-Supervised Foundation Model with Local Feature Enhancement for Generalized Morphing Attack Detection. Journal Article. In: Information Fusion, vol. 127, Part C, no. 103921, pp. 1-16, 2026.
Tags: anomaly detection, biometrics, CLIP, computer vision, face morphing detection, face recognition, foundation models
Abstract: Face morphing attacks pose a growing threat to biometric systems, exacerbated by the rapid emergence of powerful generative techniques that enable realistic and seamless facial image manipulations. To address this challenge, we introduce SelfMAD++, a robust and generalized single-image morphing attack detection (S-MAD) framework. Unlike our previous work SelfMAD, which introduced a data augmentation technique to train off-the-shelf classifiers for attack detection, SelfMAD++ advances this paradigm by integrating the artifact-driven augmentation with foundation models and fine-grained spatial reasoning. At its core, SelfMAD++ builds on CLIP, a vision-language foundation model, adapted via Low-Rank Adaptation (LoRA) to align image representations with task-specific text prompts. To enhance sensitivity to spatially subtle and fine-grained artifacts, we integrate a parallel convolutional branch specialized in dense, multi-scale feature extraction. This branch is guided by an auxiliary segmentation module, which acts as a regularizer by disentangling bona fide facial regions from potentially manipulated ones. The dual-branch features are adaptively fused through a gated attention mechanism, capturing both semantic context and fine-grained spatial cues indicative of morphing. SelfMAD++ is trained end-to-end using a multi-objective loss that balances semantic alignment, segmentation consistency, and classification accuracy. Extensive experiments across nine standard benchmark datasets demonstrate that SelfMAD++ achieves state-of-the-art performance, with an average Equal Error Rate (EER) of 3.91%, outperforming both supervised and unsupervised MAD methods by large margins. Notably, SelfMAD++ excels on modern, high-quality morphs generated by GAN- and diffusion-based morphing methods, demonstrating its robustness and strong generalization capability. SelfMAD++ code and supplementary resources are publicly available at: https://github.com/LeonTodorov/SelfMADpp.
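The abstract describes a dual-branch design in which a LoRA-adapted CLIP image embedding is fused with dense multi-scale convolutional features through a gated attention mechanism. The PyTorch sketch below illustrates only that fusion idea under simplifying assumptions: it is not the authors' implementation, all class and parameter names are hypothetical, the CLIP embedding is assumed to be precomputed, and the LoRA adaptation, text prompts, and segmentation regularizer are omitted.

```python
# Illustrative sketch (not the SelfMAD++ code) of gated fusion between a
# global semantic embedding and dense multi-scale convolutional features.
# All module names, dimensions, and design choices here are hypothetical.
import torch
import torch.nn as nn


class MultiScaleConvBranch(nn.Module):
    """Stand-in for the local branch: convolutions at several receptive fields."""

    def __init__(self, out_dim: int = 256):
        super().__init__()
        self.scales = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=k, padding=k // 2),
                nn.ReLU(inplace=True),
                nn.AdaptiveAvgPool2d(1),
            )
            for k in (3, 5, 7)  # three kernel sizes as a proxy for multi-scale
        ])
        self.proj = nn.Linear(3 * 64, out_dim)

    def forward(self, img: torch.Tensor) -> torch.Tensor:
        feats = [s(img).flatten(1) for s in self.scales]  # list of (B, 64)
        return self.proj(torch.cat(feats, dim=1))         # (B, out_dim)


class GatedFusionMAD(nn.Module):
    """Fuses a CLIP-style embedding with local features via a learned gate."""

    def __init__(self, clip_dim: int = 512, local_dim: int = 256):
        super().__init__()
        self.local_branch = MultiScaleConvBranch(local_dim)
        self.align = nn.Linear(clip_dim, local_dim)        # match dimensions
        self.gate = nn.Sequential(                          # attention-style gate
            nn.Linear(2 * local_dim, local_dim), nn.Sigmoid()
        )
        self.classifier = nn.Linear(local_dim, 2)           # bona fide vs. morph

    def forward(self, img: torch.Tensor, clip_emb: torch.Tensor) -> torch.Tensor:
        g = self.align(clip_emb)                  # semantic context
        l = self.local_branch(img)                # fine-grained spatial cues
        gate = self.gate(torch.cat([g, l], dim=1))
        fused = gate * g + (1.0 - gate) * l       # adaptive, gated combination
        return self.classifier(fused)             # classification logits


if __name__ == "__main__":
    model = GatedFusionMAD()
    images = torch.randn(4, 3, 224, 224)          # batch of face crops
    clip_embeddings = torch.randn(4, 512)         # e.g. from a frozen/LoRA-adapted CLIP encoder
    print(model(images, clip_embeddings).shape)   # torch.Size([4, 2])
```

The sigmoid gate lets the model weight semantic context against local detail per feature dimension; the paper's actual fusion, losses, and branch architectures may differ.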
2024
Ivanovska, Marija; Štruc, Vitomir: Y-GAN: Learning Dual Data Representations for Anomaly Detection in Images. Journal Article. In: Expert Systems with Applications (ESWA), vol. 248, no. 123410, pp. 1-7, 2024.
Tags: anomaly detection, CNN, deep learning, one-class learning, y-gan
Abstract: We propose a novel reconstruction-based model for anomaly detection in image data, called 'Y-GAN'. The model consists of a Y-shaped auto-encoder and represents images in two separate latent spaces. The first captures meaningful image semantics, which are key for representing (normal) training data, whereas the second encodes low-level residual image characteristics. To ensure the dual representations encode mutually exclusive information, a disentanglement procedure is designed around a latent (proxy) classifier. Additionally, a novel representation-consistency mechanism is proposed to prevent information leakage between the latent spaces. The model is trained in a one-class learning setting using only normal training data. Due to the separation of semantically relevant and residual information, Y-GAN is able to derive informative data representations that allow for efficacious anomaly detection across a diverse set of anomaly detection tasks. The model is evaluated in comprehensive experiments against several recent anomaly detection models using four popular image datasets, i.e., MNIST, FMNIST, CIFAR10, and PlantVillage. Experimental results show that Y-GAN outperforms all tested models by a considerable margin and yields state-of-the-art results. The source code for the model is made publicly available at https://github.com/MIvanovska/Y-GAN.
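Y-GAN, as described above, routes a shared encoder into two separate latent spaces (semantic and residual) and reconstructs images from both, with training on normal data only. The PyTorch sketch below illustrates that Y-shaped layout under simplifying assumptions (MNIST-sized inputs, reconstruction error as the anomaly score, no proxy classifier or representation-consistency mechanism); it is not the released Y-GAN code and all names and dimensions are illustrative.

```python
# Illustrative sketch (not the released Y-GAN code) of a Y-shaped auto-encoder:
# one encoder, two latent heads (semantic vs. residual), one decoder.
# Trained on normal data only; anomalies scored by reconstruction error.
import torch
import torch.nn as nn


class YShapedAutoEncoder(nn.Module):
    def __init__(self, z_dim: int = 64):
        super().__init__()
        self.encoder = nn.Sequential(                       # shared trunk
            nn.Conv2d(1, 32, 4, stride=2, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, 4, stride=2, padding=1), nn.ReLU(inplace=True),
            nn.Flatten(),
        )
        feat = 64 * 7 * 7                                   # for 28x28 inputs (e.g. MNIST)
        self.to_semantic = nn.Linear(feat, z_dim)           # semantic latent space
        self.to_residual = nn.Linear(feat, z_dim)           # residual latent space
        self.decoder = nn.Sequential(
            nn.Linear(2 * z_dim, feat), nn.ReLU(inplace=True),
            nn.Unflatten(1, (64, 7, 7)),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(inplace=True),
            nn.ConvTranspose2d(32, 1, 4, stride=2, padding=1), nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor):
        h = self.encoder(x)
        z_sem, z_res = self.to_semantic(h), self.to_residual(h)
        recon = self.decoder(torch.cat([z_sem, z_res], dim=1))
        return recon, z_sem, z_res


def anomaly_score(model: YShapedAutoEncoder, x: torch.Tensor) -> torch.Tensor:
    """Per-image mean squared reconstruction error as a simple anomaly score."""
    recon, _, _ = model(x)
    return ((x - recon) ** 2).flatten(1).mean(dim=1)


if __name__ == "__main__":
    model = YShapedAutoEncoder()
    batch = torch.rand(8, 1, 28, 28)                        # normal-only training data
    print(anomaly_score(model, batch).shape)                # torch.Size([8])
```

In the paper, the disentanglement between the two latent spaces is enforced with a latent proxy classifier and a representation-consistency mechanism, which this sketch deliberately leaves out.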