2023
|
Babnik, Žiga; Peer, Peter; Štruc, Vitomir DifFIQA: Face Image Quality Assessment Using Denoising Diffusion Probabilistic Models Proceedings Article In: IEEE International Joint Conference on Biometrics , pp. 1-10, IEEE, Ljubljana, Slovenia, 2023. @inproceedings{Diffiqa_2023,
title = {DifFIQA: Face Image Quality Assessment Using Denoising Diffusion Probabilistic Models},
author = {Žiga Babnik and Peter Peer and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/121.pdf
https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/121-supp.pdf},
year = {2023},
date = {2023-09-01},
booktitle = {IEEE International Joint Conference on Biometrics },
pages = {1-10},
publisher = {IEEE},
address = {Ljubljana, Slovenia},
abstract = {Modern face recognition (FR) models excel in constrained scenarios, but often suffer from decreased performance when deployed in unconstrained (real-world) environments due to uncertainties surrounding the quality of the captured facial data. Face image quality assessment (FIQA) techniques aim to mitigate these performance degradations by providing FR models with sample-quality predictions that can be used to reject low-quality samples and reduce false match errors. However, despite steady improvements, ensuring reliable quality estimates across facial images with diverse characteristics remains challenging. In this paper, we present a powerful new FIQA approach, named DifFIQA, which relies on denoising diffusion probabilistic models (DDPM) and ensures highly competitive results. The main idea behind the approach is to utilize the forward and backward processes of DDPMs to perturb facial images and quantify the impact of these perturbations on the corresponding image embeddings for quality prediction. Because the diffusion-based perturbations are computationally expensive, we also distill the knowledge encoded in DifFIQA into a regression-based quality predictor, called DifFIQA(R), that balances performance and execution time. We evaluate both models in comprehensive experiments on 7 diverse datasets, with 4 target FR models and against 10 state-of-the-art FIQA techniques with highly encouraging results. The source code is available from: https://github.com/LSIbabnikz/DifFIQA.},
keywords = {biometrics, deep learning, denoising diffusion probabilistic models, diffusion, face, face image quality assessment, face recognition, FIQA, quality},
pubstate = {published},
tppubtype = {inproceedings}
}
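A minimal sketch of the core scoring idea described above, in Python, assuming hypothetical forward_diffuse, denoise and fr_embed callables (an illustration, not the released DifFIQA code): quality is read off the stability of the face embedding under diffusion-based perturbation.

import numpy as np

def diffusion_quality_score(image, fr_embed, forward_diffuse, denoise,
                            timesteps=(50, 100), n_runs=3):
    # Reference embedding of the unperturbed face.
    ref = fr_embed(image)
    sims = []
    for t in timesteps:
        for _ in range(n_runs):
            noisy = forward_diffuse(image, t)    # forward DDPM process (add noise)
            restored = denoise(noisy, t)         # backward DDPM process (denoise)
            emb = fr_embed(restored)
            # Cosine similarity between original and perturbed embeddings.
            sims.append(np.dot(ref, emb) /
                        (np.linalg.norm(ref) * np.linalg.norm(emb) + 1e-8))
    # Embeddings that stay stable under perturbation indicate higher quality.
    return float(np.mean(sims))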
|
Kolf, Jan Niklas; Boutros, Fadi; Elliesen, Jurek; Theuerkauf, Markus; Damer, Naser; Alansari, Mohamad Y; Hay, Oussama Abdul; Alansari, Sara Yousif; Javed, Sajid; Werghi, Naoufel; Grm, Klemen; Struc, Vitomir; Alonso-Fernandez, Fernando; Hernandez-Diaz, Kevin; Bigun, Josef; George, Anjith; Ecabert, Christophe; Shahreza, Hatef Otroshi; Kotwal, Ketan; Marcel, Sébastien; Medvedev, Iurii; Bo, Jin; Nunes, Diogo; Hassanpour, Ahmad; Khatiwada, Pankaj; Toor, Aafan Ahmad; Yang, Bian EFaR 2023: Efficient Face Recognition Competition Proceedings Article In: IEEE International Joint Conference on Biometrics (IJCB 2023), pp. 1-12, Ljubljana, Slovenia, 2023. @inproceedings{EFAR2023_2023,
title = {EFaR 2023: Efficient Face Recognition Competition},
author = {Jan Niklas Kolf and Fadi Boutros and Jurek Elliesen and Markus Theuerkauf and Naser Damer and Mohamad Y Alansari and Oussama Abdul Hay and Sara Yousif Alansari and Sajid Javed and Naoufel Werghi and Klemen Grm and Vitomir Struc and Fernando Alonso-Fernandez and Kevin Hernandez-Diaz and Josef Bigun and Anjith George and Christophe Ecabert and Hatef Otroshi Shahreza and Ketan Kotwal and Sébastien Marcel and Iurii Medvedev and Jin Bo and Diogo Nunes and Ahmad Hassanpour and Pankaj Khatiwada and Aafan Ahmad Toor and Bian Yang},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/CameraReady-231.pdf},
year = {2023},
date = {2023-09-01},
booktitle = {IEEE International Joint Conference on Biometrics (IJCB 2023)},
pages = {1-12},
address = {Ljubljana, Slovenia},
abstract = {This paper presents the summary of the Efficient Face Recognition Competition (EFaR) held at the 2023 International Joint Conference on Biometrics (IJCB 2023). The competition received 17 submissions from 6 different teams. To drive further development of efficient face recognition models, the submitted solutions are ranked based on a weighted score of the achieved verification accuracies on a diverse set of benchmarks, as well as the deployability given by the number of floating-point operations and model size. The evaluation of submissions is extended to bias, cross-quality, and large-scale recognition benchmarks. Overall, the paper gives an overview of the achieved performance values of the submitted solutions as well as a diverse set of baselines. The submitted solutions use small, efficient network architectures to reduce the computational cost; some solutions also apply model quantization. An outlook on possible techniques that are underrepresented in current solutions is given as well.},
keywords = {biometrics, deep learning, face, face recognition, lightweight models},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Das, Abhijit; Atreya, Saurabh K; Mukherjee, Aritra; Vitek, Matej; Li, Haiqing; Wang, Caiyong; Guangzhe, Zhao; Boutros, Fadi; Siebke, Patrick; Kolf, Jan Niklas; Damer, Naser; Sun, Ye; Hexin, Lu; Aobo, Fab; Sheng, You; Nathan, Sabari; Ramamoorthy, Suganya; S, Rampriya R; G, Geetanjali; Sihag, Prinaka; Nigam, Aditya; Peer, Peter; Pal, Umapada; Struc, Vitomir Sclera Segmentation and Joint Recognition Benchmarking Competition: SSRBC 2023 Proceedings Article In: IEEE International Joint Conference on Biometrics (IJCB 2023), pp. 1-10, Ljubljana, Slovenia, 2023. @inproceedings{SSBRC2023,
title = {Sclera Segmentation and Joint Recognition Benchmarking Competition: SSRBC 2023},
author = {Abhijit Das and Saurabh K Atreya and Aritra Mukherjee and Matej Vitek and Haiqing Li and Caiyong Wang and Zhao Guangzhe and Fadi Boutros and Patrick Siebke and Jan Niklas Kolf and Naser Damer and Ye Sun and Lu Hexin and Fab Aobo and You Sheng and Sabari Nathan and Suganya Ramamoorthy and Rampriya R S and Geetanjali G and Prinaka Sihag and Aditya Nigam and Peter Peer and Umapada Pal and Vitomir Struc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/CameraReady-233.pdf},
year = {2023},
date = {2023-09-01},
booktitle = {IEEE International Joint Conference on Biometrics (IJCB 2023)},
pages = {1-10},
address = {Ljubljana, Slovenia},
abstract = {This paper presents the summary of the Sclera Segmentation and Joint Recognition Benchmarking Competition (SSRBC 2023) held in conjunction with IEEE International Joint Conference on Biometrics (IJCB 2023). Different from the previous editions of the competition, SSRBC 2023 not only explored the performance of the latest and most advanced sclera segmentation models, but also studied the impact of segmentation quality on recognition performance. Five groups took part in SSRBC 2023 and submitted a total of six segmentation models and one recognition technique for scoring. The submitted solutions included a wide variety of conceptually diverse deep-learning models and were rigorously tested on three publicly available datasets, i.e., MASD, SBVPI and MOBIUS. Most of the segmentation models achieved encouraging segmentation and recognition performance. Most importantly, we observed that better segmentation results always translate into better verification performance.},
keywords = {biometrics, competition IJCB, computer vision, deep learning, sclera, sclera segmentation},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Emersic, Ziga; Ohki, Tetsushi; Akasaka, Muku; Arakawa, Takahiko; Maeda, Soshi; Okano, Masora; Sato, Yuya; George, Anjith; Marcel, Sébastien; Ganapathi, Iyyakutti Iyappan; Ali, Syed Sadaf; Javed, Sajid; Werghi, Naoufel; Işık, Selin Gök; Sarıtaş, Erdi; Ekenel, Hazim Kemal; Hudovernik, Valter; Kolf, Jan Niklas; Boutros, Fadi; Damer, Naser; Sharma, Geetanjali; Kamboj, Aman; Nigam, Aditya; Jain, Deepak Kumar; Cámara, Guillermo; Peer, Peter; Struc, Vitomir The Unconstrained Ear Recognition Challenge 2023: Maximizing Performance and Minimizing Bias Proceedings Article In: IEEE International Joint Conference on Biometrics (IJCB 2023), pp. 1-10, Ljubljana, Slovenia, 2023. @inproceedings{UERC2023,
title = {The Unconstrained Ear Recognition Challenge 2023: Maximizing Performance and Minimizing Bias},
author = {Ziga Emersic and Tetsushi Ohki and Muku Akasaka and Takahiko Arakawa and Soshi Maeda and Masora Okano and Yuya Sato and Anjith George and Sébastien Marcel and Iyyakutti Iyappan Ganapathi and Syed Sadaf Ali and Sajid Javed and Naoufel Werghi and Selin Gök Işık and Erdi Sarıtaş and Hazim Kemal Ekenel and Valter Hudovernik and Jan Niklas Kolf and Fadi Boutros and Naser Damer and Geetanjali Sharma and Aman Kamboj and Aditya Nigam and Deepak Kumar Jain and Guillermo Cámara and Peter Peer and Vitomir Struc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/09/CameraReady-234.pdf},
year = {2023},
date = {2023-09-01},
booktitle = {IEEE International Joint Conference on Biometrics (IJCB 2023)},
pages = {1-10},
address = {Ljubljana, Slovenia},
abstract = {The paper provides a summary of the 2023 Unconstrained Ear Recognition Challenge (UERC), a benchmarking effort focused on ear recognition from images acquired in uncontrolled environments. The objective of the challenge was to evaluate the effectiveness of current ear recognition techniques on a challenging ear dataset while analyzing the techniques from two distinct aspects, i.e., verification performance and bias with respect to specific demographic factors, i.e., gender and ethnicity. Seven research groups participated in the challenge and submitted seven distinct recognition approaches that ranged from descriptor-based methods and deep-learning models to ensemble techniques that relied on multiple data representations to maximize performance and minimize bias. A comprehensive investigation into the performance of the submitted models is presented, as well as an in-depth analysis of bias and associated performance differentials due to differences in gender and ethnicity. The results of the challenge suggest that a wide variety of models (e.g., transformers, convolutional neural networks, ensemble models) is capable of achieving competitive recognition results, but also that all of the models still exhibit considerable performance differentials with respect to both gender and ethnicity. To promote further development of unbiased and effective ear recognition models, the starter kit of UERC 2023, together with the baseline model and training and test data, is made available from: http://ears.fri.uni-lj.si/.},
keywords = {biometrics, competition, computer vision, deep learning, ear, ear biometrics, UERC 2023},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Ivanovska, Marija; Štruc, Vitomir; Perš, Janez TomatoDIFF: On–plant Tomato Segmentation with Denoising Diffusion Models Proceedings Article In: 18th International Conference on Machine Vision and Applications (MVA 2023), pp. 1-6, 2023. @inproceedings{MarijaTomato2023,
title = {TomatoDIFF: On–plant Tomato Segmentation with Denoising Diffusion Models},
author = {Marija Ivanovska and Vitomir Štruc and Janez Perš },
url = {https://arxiv.org/pdf/2307.01064.pdf
https://ieeexplore.ieee.org/document/10215774},
doi = {10.23919/MVA57639.2023.10215774},
year = {2023},
date = {2023-07-23},
urldate = {2023-07-23},
booktitle = {18th International Conference on Machine Vision and Applications (MVA 2023)},
pages = {1-6},
abstract = {Artificial intelligence applications enable farmers to optimize crop growth and production while reducing costs and environmental impact. Computer vision-based algorithms, in particular, are commonly used for fruit segmentation, enabling in-depth analysis of the harvest quality and accurate yield estimation. In this paper, we propose TomatoDIFF, a novel diffusion-based model for semantic segmentation of on-plant tomatoes. When evaluated against other competitive methods, our model demonstrates state-of-the-art (SOTA) performance, even in challenging environments with highly occluded fruits. Additionally, we introduce Tomatopia, a new, large and challenging dataset of greenhouse tomatoes. The dataset comprises high-resolution RGB-D images and pixel-level annotations of the fruits. The source code of TomatoDIFF and Tomatopia are available at https://github.com/MIvanovska/TomatoDIFF},
keywords = {agriculture, dataset, deep learning, diffusion, plan segmentation, plant monitoring, robotics, segmentation, tomato dataset},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Vitek, Matej; Bizjak, Matic; Peer, Peter; Štruc, Vitomir IPAD: Iterative Pruning with Activation Deviation for Sclera Biometrics Journal Article In: Journal of King Saud University - Computer and Information Sciences, vol. 35, no. 8, pp. 1-21, 2023. @article{VitekSaud2023,
title = {IPAD: Iterative Pruning with Activation Deviation for Sclera Biometrics},
author = {Matej Vitek and Matic Bizjak and Peter Peer and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/07/PublishedVersion.pdf},
doi = {https://doi.org/10.1016/j.jksuci.2023.101630},
year = {2023},
date = {2023-07-10},
journal = {Journal of King Saud University - Computer and Information Sciences},
volume = {35},
number = {8},
pages = {1-21},
abstract = {The sclera has recently been gaining attention as a biometric modality due to its various desirable characteristics. A key step in any type of ocular biometric recognition, including sclera recognition, is the segmentation of the relevant part(s) of the eye. However, the high computational complexity of the (deep) segmentation models used in this task can limit their applicability on resource-constrained devices such as smartphones or head-mounted displays. As these devices are a common desired target for such biometric systems, lightweight solutions for ocular segmentation are critically needed. To address this issue, this paper introduces IPAD (Iterative Pruning with Activation Deviation), a novel method for developing lightweight convolutional networks, that is based on model pruning. IPAD uses a novel filter-activation-based criterion (ADC) to determine low-importance filters and employs an iterative model pruning procedure to derive the final lightweight model. To evaluate the proposed pruning procedure, we conduct extensive experiments with two diverse segmentation models, over four publicly available datasets (SBVPI, SLD, SMD and MOBIUS), in four distinct problem configurations and in comparison to state-of-the-art methods from the literature. The results of the experiments show that the proposed filter-importance criterion outperforms the standard L1 and L2 approaches from the literature. Furthermore, the results also suggest that: 1) the pruned models are able to retain (or even improve on) the performance of the unpruned originals, as long as they are not over-pruned, with RITnet and U-Net at 50% of their original FLOPs reaching up to 4% and 7% higher IoU values than their unpruned versions, respectively, 2) smaller models require more careful pruning, as the pruning process can hurt the model’s generalization capabilities, and 3) the novel criterion most convincingly outperforms the classic approaches when sufficient training data is available, implying that the abundance of data leads to more robust activation-based importance computation.},
keywords = {biometrics, CNN, deep learning, model compression, pruning, sclera, sclera segmentation},
pubstate = {published},
tppubtype = {article}
}
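As a rough illustration of the iterative pruning loop with an activation-based filter-importance score (a sketch under assumptions; the exact ADC criterion and the prune_step and finetune routines from the paper are not reproduced here):

import torch

@torch.no_grad()
def activation_deviation_scores(model, layer, calib_loader, device="cpu"):
    # Score each output channel of `layer` by the standard deviation of its
    # activations over a (small) calibration set.
    feats = []
    handle = layer.register_forward_hook(lambda m, i, o: feats.append(o.detach()))
    for batch in calib_loader:
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        model(x.to(device))
    handle.remove()
    acts = torch.cat(feats, dim=0)                         # (N, C, H, W)
    per_filter = acts.transpose(0, 1).reshape(acts.shape[1], -1)
    return per_filter.std(dim=1)                           # one score per filter

def iterative_prune(model, layer, calib_loader, prune_step, finetune,
                    rounds=4, frac_per_round=0.1):
    # Generic iterative loop: score filters, remove the lowest-scoring ones,
    # fine-tune, and repeat.
    for _ in range(rounds):
        scores = activation_deviation_scores(model, layer, calib_loader)
        k = max(1, int(frac_per_round * scores.numel()))
        to_prune = torch.argsort(scores)[:k]               # least "deviating" filters
        prune_step(model, layer, to_prune)                 # hypothetical pruning callback
        finetune(model)                                    # hypothetical fine-tuning callback
    return model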
|
Pernuš, Martin; Bhatnagar, Mansi; Samad, Badr; Singh, Divyanshu; Peer, Peter; Štruc, Vitomir; Dobrišek, Simon ChildNet: Structural Kinship Face Synthesis Model With Appearance Control Mechanisms Journal Article In: IEEE Access, pp. 1-22, 2023, ISSN: 2169-3536. @article{AccessMartin2023,
title = {ChildNet: Structural Kinship Face Synthesis Model With Appearance Control Mechanisms},
author = {Martin Pernuš and Mansi Bhatnagar and Badr Samad and Divyanshu Singh and Peter Peer and Vitomir Štruc and Simon Dobrišek},
url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10126110},
doi = {10.1109/ACCESS.2023.3276877},
issn = {2169-3536},
year = {2023},
date = {2023-05-17},
journal = {IEEE Access},
pages = {1-22},
abstract = {Kinship face synthesis is an increasingly popular topic within the computer vision community, particularly the task of predicting the child appearance using parental images. Previous work has been limited in terms of model capacity and inadequate training data, which is comprised of low-resolution and tightly cropped images, leading to lower synthesis quality. In this paper, we propose ChildNet, a method for kinship face synthesis that leverages the facial image generation capabilities of a state-of-the-art Generative Adversarial Network (GAN), and resolves the aforementioned problems. ChildNet is designed within the GAN latent space and is able to predict a child appearance that bears high resemblance to real parents’ children. To ensure fine-grained control, we propose an age and gender manipulation module that allows precise manipulation of the child synthesis result. ChildNet is capable of generating multiple child images per parent pair input, while providing a way to control the image generation variability. Additionally, we introduce a mechanism to control the dominant parent image. Finally, to facilitate the task of kinship face synthesis, we introduce a new kinship dataset, called Next of Kin. This dataset contains 3690 high-resolution face images with a diverse range of ethnicities and ages. We evaluate ChildNet in comprehensive experiments against three competing kinship face synthesis models, using two kinship datasets. The experiments demonstrate the superior performance of ChildNet in terms of identity similarity, while exhibiting high perceptual image quality. The source code for the model is publicly available at: https://github.com/MartinPernus/ChildNet.},
keywords = {artificial intelligence, CNN, deep learning, face generation, face synthesis, GAN, GAN inversion, kinship, kinship synthesis, StyleGAN2},
pubstate = {published},
tppubtype = {article}
}
|
Grabner, Miha; Wang, Yi; Wen, Qingsong; Blažič, Boštjan; Štruc, Vitomir A global modeling framework for load forecasting in distribution networks Journal Article In: IEEE Transactions on Smart Grid, 2023, ISSN: 1949-3061. @article{Grabner_TSG,
title = {A global modeling framework for load forecasting in distribution networks},
author = {Miha Grabner and Yi Wang and Qingsong Wen and Boštjan Blažič and Vitomir Štruc},
url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10092804},
doi = {10.1109/TSG.2023.3264525},
issn = {1949-3061},
year = {2023},
date = {2023-04-05},
journal = {IEEE Transactions on Smart Grid},
abstract = {With the increasing numbers of smart meter installations, scalable and efficient load forecasting techniques are critically needed to ensure sustainable situation awareness within the distribution networks. Distribution networks include a large number of different loads at various aggregation levels, such as individual consumers, low-voltage feeders, and transformer stations. It is impractical to develop individual (or so-called local) forecasting models for each load separately. Additionally, such local models also (i) (largely) ignore the strong dependencies between different loads that might be present due to their spatial proximity and the characteristics of the distribution network, (ii) require historical data for each load to be able to make forecasts, and (iii) are incapable of adjusting to changes in the load behavior without retraining. To address these issues, we propose a global modeling framework for load forecasting in distribution networks that, unlike its local competitors, relies on a single global model to generate forecasts for a large number of loads. The global nature of the framework significantly reduces the computational burden typically required when training multiple local forecasting models, efficiently exploits the cross-series information shared among different loads, and facilitates forecasts even when historical data for a load is missing or the behavior of a load evolves over time. To further improve on the performance of the proposed framework, an unsupervised localization mechanism and optimal ensemble construction strategy are also proposed to localize/personalize the global forecasting model to different load characteristics. Our experimental results show that the proposed framework outperforms naive benchmarks by more than 25% (in terms of Mean Absolute Error) on a real-world dataset while exhibiting highly desirable characteristics when compared to the local models that are predominantly used in the literature. All source code and data are made publicly available to enable reproducibility: https://github.com/mihagrabner/GlobalModelingFramework},
keywords = {deep learning, global modeling, load forecasting, prediction, smart grid, time series analysis, time series forecasting},
pubstate = {published},
tppubtype = {article}
}
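A toy sketch of the global-modeling idea on synthetic data (one pooled model over per-series-normalized lag windows; the forecaster below is a stand-in, not the paper's architecture):

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

def make_pooled_dataset(series_by_load, n_lags=24):
    # Pool lagged windows from every load; per-series scaling lets a single
    # model serve consumers, feeders and transformer stations alike.
    X, y = [], []
    for s in series_by_load.values():
        s = np.asarray(s, dtype=float)
        s = s / (s.mean() + 1e-8)
        for t in range(n_lags, len(s)):
            X.append(s[t - n_lags:t])
            y.append(s[t])
    return np.array(X), np.array(y)

# Toy example with random data standing in for real smart-meter series.
series_by_load = {f"load_{i}": np.random.rand(500) for i in range(10)}
X, y = make_pooled_dataset(series_by_load)
global_model = HistGradientBoostingRegressor().fit(X, y)   # one model for all loads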
|
Meden, Blaž; Gonzalez-Hernandez, Manfred; Peer, Peter; Štruc, Vitomir Face deidentification with controllable privacy protection Journal Article In: Image and Vision Computing, vol. 134, no. 104678, pp. 1-19, 2023. @article{MedenDeID2023,
title = {Face deidentification with controllable privacy protection},
author = {Blaž Meden and Manfred Gonzalez-Hernandez and Peter Peer and Vitomir Štruc},
url = {https://reader.elsevier.com/reader/sd/pii/S0262885623000525?token=BC1E21411C50118E666720B002A89C9EB3DB4CFEEB5EB18D7BD7B0613085030A96621C8364583BFE7BAE025BE3646096&originRegion=eu-west-1&originCreation=20230516115322},
doi = {https://doi.org/10.1016/j.imavis.2023.104678},
year = {2023},
date = {2023-04-01},
journal = {Image and Vision Computing},
volume = {134},
number = {104678},
pages = {1-19},
abstract = {Privacy protection has become a crucial concern in today’s digital age. Particularly sensitive here are facial images, which typically not only reveal a person’s identity, but also other sensitive personal information. To address this problem, various face deidentification techniques have been presented in the literature. These techniques try to remove or obscure personal information from facial images while still preserving their usefulness for further analysis. While a considerable amount of work has been proposed on face deidentification, most state-of-the-art solutions still suffer from various drawbacks, as they (a) deidentify only a narrow facial area, leaving potentially important contextual information unprotected, (b) modify facial images to such a degree that image naturalness and facial diversity suffer in the deidentified images, (c) offer no flexibility in the level of privacy protection ensured, leading to suboptimal deployment in various applications, and (d) often offer an unsatisfactory tradeoff between the ability to obscure identity information, quality and naturalness of the deidentified images, and sufficient utility preservation. In this paper, we address these shortcomings with a novel controllable face deidentification technique that balances image quality, identity protection, and data utility for further analysis. The proposed approach utilizes a powerful generative model (StyleGAN2), multiple auxiliary classification models, and carefully designed constraints to guide the deidentification process. The approach is validated across four diverse datasets (CelebA-HQ, RaFD, XM2VTS, AffectNet) and in comparison to 7 state-of-the-art competitors. The results of the experiments demonstrate that the proposed solution leads to: (a) a considerable level of identity protection, (b) valuable preservation of data utility, (c) sufficient diversity among the deidentified faces, and (d) encouraging overall performance.},
keywords = {CNN, deep learning, deidentification, face recognition, GAN, GAN inversion, privacy, privacy protection, StyleGAN2},
pubstate = {published},
tppubtype = {article}
}
|
Ivanovska, Marija; Štruc, Vitomir Face Morphing Attack Detection with Denoising Diffusion Probabilistic Models Proceedings Article In: Proceedings of the International Workshop on Biometrics and Forensics (IWBF), pp. 1-6, 2023. @inproceedings{IWBF2023_Marija,
title = {Face Morphing Attack Detection with Denoising Diffusion Probabilistic Models},
author = {Marija Ivanovska and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/03/IWBF2023_Morphing.pdf},
year = {2023},
date = {2023-02-28},
booktitle = {Proceedings of the International Workshop on Biometrics and Forensics (IWBF)},
pages = {1-6},
abstract = {Morphed face images have recently become a growing concern for existing face verification systems, as they are relatively easy to generate and can be used to impersonate someone's identity for various malicious purposes. Efficient Morphing Attack Detection (MAD) that generalizes well across different morphing techniques is, therefore, of paramount importance. Existing MAD techniques predominantly rely on discriminative models that learn from examples of bona fide and morphed images and, as a result, often exhibit sub-optimal generalization performance when confronted with unknown types of morphing attacks. To address this problem, we propose a novel, diffusion-based MAD method in this paper that learns only from the characteristics of bona fide images. Various forms of morphing attacks are then detected by our model as out-of-distribution samples. We perform rigorous experiments over four different datasets (CASIA-WebFace, FRLL-Morphs, FERET-Morphs and FRGC-Morphs) and compare the proposed solution to both discriminatively-trained and one-class MAD models. The experimental results show that our MAD model achieves highly competitive results on all considered datasets.},
keywords = {biometrics, deep learning, denoising diffusion probabilistic models, diffusion, face, face morphing attack, morphing attack, morphing attack detection},
pubstate = {published},
tppubtype = {inproceedings}
}
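A minimal sketch of the one-class detection principle, assuming hypothetical forward_diffuse and denoise helpers for a diffusion model trained only on bona fide faces (not the trained model from the paper): a large reconstruction error marks a sample as out-of-distribution, i.e., a potential morph.

import numpy as np

def morph_score(image, forward_diffuse, denoise, t=100):
    # Noise the input with the forward process, denoise it back, and measure
    # how far the reconstruction drifts from the input.
    restored = denoise(forward_diffuse(image, t), t)
    diff = np.asarray(image, dtype=float) - np.asarray(restored, dtype=float)
    return float(np.mean(diff ** 2))          # higher = more "out of distribution"

def is_morph(image, forward_diffuse, denoise, threshold):
    return morph_score(image, forward_diffuse, denoise) > threshold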
|
Grm, Klemen; Ozata, Berk; Struc, Vitomir; Ekenel, Hazim K. Meet-in-the-middle: Multi-scale upsampling and matching for cross-resolution face recognition Proceedings Article In: WACV workshops, pp. 120-129, 2023. @inproceedings{WACVW2023,
title = {Meet-in-the-middle: Multi-scale upsampling and matching for cross-resolution face recognition},
author = {Klemen Grm and Berk Ozata and Vitomir Struc and Hazim K. Ekenel},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/01/Meet_in_the_middle.pdf
https://arxiv.org/abs/2211.15225
https://openaccess.thecvf.com/content/WACV2023W/RWS/papers/Grm_Meet-in-the-Middle_Multi-Scale_Upsampling_and_Matching_for_Cross-Resolution_Face_Recognition_WACVW_2023_paper.pdf
},
year = {2023},
date = {2023-01-06},
booktitle = {WACV workshops},
pages = {120-129},
abstract = {In this paper, we aim to address the large domain gap between high-resolution face images, e.g., from professional portrait photography, and low-quality surveillance images, e.g., from security cameras. Establishing an identity match between disparate sources like this is a classical surveillance face identification scenario, which continues to be a challenging problem for modern face recognition techniques. To that end, we propose a method that combines face super-resolution, resolution matching, and multi-scale template accumulation to reliably recognize faces from long-range surveillance footage, including from low quality sources. The proposed approach does not require training or fine-tuning on the target dataset of real surveillance images. Extensive experiments show that our proposed method is able to outperform even existing methods fine-tuned to the SCFace dataset.},
keywords = {deep learning, face, face recognition, multi-scale matching, smart surveillance, surveillance, surveillance technology},
pubstate = {published},
tppubtype = {inproceedings}
}
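The multi-scale template accumulation step could be sketched roughly as follows, with hypothetical super_resolve and fr_embed stand-ins for the super-resolution and face-recognition models (not the authors' pipeline):

import numpy as np

def multiscale_template(probe_img, super_resolve, fr_embed, scales=(1, 2, 4)):
    # Embed the probe at several resolutions and average the normalized
    # embeddings into a single accumulated template.
    embs = []
    for s in scales:
        img = probe_img if s == 1 else super_resolve(probe_img, factor=s)
        e = fr_embed(img)
        embs.append(e / (np.linalg.norm(e) + 1e-8))
    t = np.mean(embs, axis=0)
    return t / (np.linalg.norm(t) + 1e-8)

def match_score(template, gallery_emb):
    g = gallery_emb / (np.linalg.norm(gallery_emb) + 1e-8)
    return float(np.dot(template, g))         # cosine similarity to a gallery face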
|
Pernuš, Martin; Štruc, Vitomir; Dobrišek, Simon MaskFaceGAN: High Resolution Face Editing With Masked GAN Latent Code Optimization Journal Article In: IEEE Transactions on Image Processing, 2023. @article{MaskFaceGAN,
title = {MaskFaceGAN: High Resolution Face Editing With Masked GAN Latent Code Optimization},
author = {Martin Pernuš and Vitomir Štruc and Simon Dobrišek},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2023/02/MaskFaceGAN_compressed.pdf
https://arxiv.org/pdf/2103.11135.pdf},
year = {2023},
date = {2023-01-02},
journal = {IEEE Transactions on Image Processing},
abstract = {Face editing represents a popular research topic within the computer vision and image processing communities. While significant progress has been made recently in this area, existing solutions: (i) are still largely focused on low-resolution images, (ii) often generate editing results with visual artefacts, or (iii) lack fine-grained control over the editing procedure and alter multiple (entangled) attributes simultaneously, when trying to generate the desired facial semantics. In this paper, we aim to address these issues through a novel editing approach, called MaskFaceGAN, that focuses on local attribute editing. The proposed approach is based on an optimization procedure that directly optimizes the latent code of a pre-trained (state-of-the-art) Generative Adversarial Network (i.e., StyleGAN2) with respect to several constraints that ensure: (i) preservation of relevant image content, (ii) generation of the targeted facial attributes, and (iii) spatially-selective treatment of local image regions. The constraints are enforced with the help of a (differentiable) attribute classifier and face parser that provide the necessary reference information for the optimization procedure. MaskFaceGAN is evaluated in extensive experiments on the CelebA-HQ, Helen and SiblingsDB-HQf datasets and in comparison with several state-of-the-art techniques from the literature. Our experimental results show that the proposed approach is able to edit face images with respect to several local facial attributes with unprecedented image quality and at high resolutions (1024x1024), while exhibiting considerably fewer problems with attribute entanglement than competing solutions. The source code is publicly available from: https://github.com/MartinPernus/MaskFaceGAN.},
keywords = {CNN, computer vision, deep learning, face editing, face image processing, GAN, generative models, StyleGAN},
pubstate = {published},
tppubtype = {article}
}
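A hedged sketch of masked latent-code optimization; the generator, attribute classifier, region mask and loss weights below are placeholders for illustration, not the released MaskFaceGAN components:

import torch

def optimize_latent(generator, attr_classifier, region_mask, source_img,
                    w_init, target_attr=1.0, steps=200, lr=0.05):
    # Optimize the latent code so the target attribute appears inside the
    # masked region while content outside the region stays close to the source.
    w = w_init.clone().requires_grad_(True)
    opt = torch.optim.Adam([w], lr=lr)
    for _ in range(steps):
        img = generator(w)                                         # synthesized face
        attr_loss = ((attr_classifier(img) - target_attr) ** 2).mean()
        content_loss = ((1 - region_mask) * (img - source_img)).pow(2).mean()
        loss = attr_loss + 10.0 * content_loss                     # weights are illustrative
        opt.zero_grad()
        loss.backward()
        opt.step()
    return w.detach()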
|
Hrovatič, Anja; Peer, Peter; Štruc, Vitomir; Emeršič, Žiga Efficient ear alignment using a two-stack hourglass network Journal Article In: IET Biometrics , pp. 1-14, 2023, ISSN: 2047-4938. @article{UhljiIETZiga,
title = {Efficient ear alignment using a two-stack hourglass network},
author = {Anja Hrovatič and Peter Peer and Vitomir Štruc and Žiga Emeršič},
url = {https://ietresearch.onlinelibrary.wiley.com/doi/epdf/10.1049/bme2.12109},
doi = {10.1049/bme2.12109},
issn = {2047-4938},
year = {2023},
date = {2023-01-01},
journal = {IET Biometrics },
pages = {1-14},
abstract = {Ear images have been shown to be a reliable modality for biometric recognition with desirable characteristics, such as high universality, distinctiveness, measurability and permanence. While a considerable amount of research has been directed towards ear recognition techniques, the problem of ear alignment is still under-explored in the open literature. Nonetheless, accurate alignment of ear images, especially in unconstrained acquisition scenarios, where the ear appearance is expected to vary widely due to pose and view point variations, is critical for the performance of all downstream tasks, including ear recognition. Here, the authors address this problem and present a framework for ear alignment that relies on a two-step procedure: (i) automatic landmark detection and (ii) fiducial point alignment. For the first (landmark detection) step, the authors implement and train a Two-Stack Hourglass model (2-SHGNet) capable of accurately predicting 55 landmarks on diverse ear images captured in uncontrolled conditions. For the second (alignment) step, the authors use the Random Sample Consensus (RANSAC) algorithm to align the estimated landmark/fiducial points with a pre-defined ear shape (i.e. a collection of average ear landmark positions). The authors evaluate the proposed framework in comprehensive experiments on the AWEx and ITWE datasets and show that the 2-SHGNet model leads to more accurate landmark predictions than competing state-of-the-art models from the literature. Furthermore, the authors also demonstrate that the alignment step significantly improves recognition accuracy with ear images from unconstrained environments compared to unaligned imagery.},
keywords = {biometrics, CNN, deep learning, ear, ear alignment, ear recognition},
pubstate = {published},
tppubtype = {article}
}
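The two-step procedure maps onto a short OpenCV sketch (the landmark model and canonical mean shape are assumed inputs; only the RANSAC-based similarity fit uses a real OpenCV call):

import cv2
import numpy as np

def align_ear(image, landmark_model, mean_shape, out_size=(128, 128)):
    # Step (i): predict the 55 ear landmarks with the trained detector.
    pts = np.asarray(landmark_model(image), dtype=np.float32)      # (55, 2)
    dst = np.asarray(mean_shape, dtype=np.float32)                 # canonical positions
    # Step (ii): RANSAC-based similarity transform onto the mean ear shape.
    M, inliers = cv2.estimateAffinePartial2D(pts, dst, method=cv2.RANSAC,
                                             ransacReprojThreshold=3.0)
    return cv2.warpAffine(image, M, out_size)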
|
2022
|
Gan, Chenquan; Yang, Yucheng; Zhub, Qingyi; Jain, Deepak Kumar; Struc, Vitomir DHF-Net: A hierarchical feature interactive fusion network for dialogue emotion recognition Journal Article In: Expert Systems with Applications, vol. 210, 2022. @article{TextEmotionESWA,
title = {DHF-Net: A hierarchical feature interactive fusion network for dialogue emotion recognition},
author = {Chenquan Gan and Yucheng Yang and Qingyi Zhub and Deepak Kumar Jain and Vitomir Struc},
url = {https://www.sciencedirect.com/science/article/pii/S0957417422016025?via%3Dihub},
doi = {https://doi.org/10.1016/j.eswa.2022.118525},
year = {2022},
date = {2022-12-30},
urldate = {2022-08-01},
journal = {Expert Systems with Applications},
volume = {210},
abstract = {To balance the trade-off between contextual information and fine-grained information in identifying specific emotions during a dialogue and combine the interaction of hierarchical feature related information, this paper proposes a hierarchical feature interactive fusion network (named DHF-Net), which not only can retain the integrity of the context sequence information but also can extract more fine-grained information. To obtain deep semantic information, DHF-Net processes the task of recognizing dialogue emotion and dialogue act/intent separately, and then learns the cross-impact of two tasks through collaborative attention. Also, a bidirectional gated recurrent unit (Bi-GRU) connected hybrid convolutional neural network (CNN) group method is designed, by which the sequence information is smoothly sent to the multi-level local information layers for feature extraction. Experimental results show that, on two open session datasets, the performance of DHF-Net is improved by 1.8% and 1.2%, respectively.},
keywords = {attention, CNN, deep learning, dialogue, emotion recognition, fusion, fusion network, nlp, semantics, text, text processing},
pubstate = {published},
tppubtype = {article}
}
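One possible reading of the described Bi-GRU plus hybrid CNN group, sketched in PyTorch purely for illustration (an interpretation, not the published DHF-Net):

import torch
import torch.nn as nn

class BiGRUConvBlock(nn.Module):
    def __init__(self, emb_dim=300, hidden=128, n_classes=7):
        super().__init__()
        self.gru = nn.GRU(emb_dim, hidden, batch_first=True, bidirectional=True)
        self.convs = nn.ModuleList([
            nn.Conv1d(2 * hidden, 64, kernel_size=k, padding=k // 2)
            for k in (3, 5, 7)])                    # multi-level local feature extractors
        self.fc = nn.Linear(3 * 64, n_classes)

    def forward(self, x):                           # x: (batch, seq_len, emb_dim)
        seq, _ = self.gru(x)                        # contextual sequence features
        seq = seq.transpose(1, 2)                   # (batch, 2*hidden, seq_len)
        feats = [conv(seq).amax(dim=2) for conv in self.convs]
        return self.fc(torch.cat(feats, dim=1))     # emotion logits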
|
Tomašević, Darian; Peer, Peter; Štruc, Vitomir BiOcularGAN: Bimodal Synthesis and Annotation of Ocular Images Proceedings Article In: IEEE/IAPR International Joint Conference on Biometrics (IJCB 2022) , pp. 1-10, 2022. @inproceedings{TomasevicIJCBBiOcular,
title = {BiOcularGAN: Bimodal Synthesis and Annotation of Ocular Images},
author = {Darian Tomašević and Peter Peer and Vitomir Štruc },
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2022/12/BiModal_StyleGAN.pdf
https://arxiv.org/pdf/2205.01536.pdf},
year = {2022},
date = {2022-10-20},
urldate = {2022-10-20},
booktitle = {IEEE/IAPR International Joint Conference on Biometrics (IJCB 2022) },
pages = {1-10},
abstract = {Current state-of-the-art segmentation techniques for ocular images are critically dependent on large-scale annotated datasets, which are labor-intensive to gather and often raise privacy concerns. In this paper, we present a novel framework, called BiOcularGAN, capable of generating synthetic large-scale datasets of photorealistic (visible light and near-infrared) ocular images, together with corresponding segmentation labels to address these issues. At its core, the framework relies on a novel Dual-Branch StyleGAN2 (DB-StyleGAN2) model that facilitates bimodal image generation, and a Semantic Mask Generator (SMG) component that produces semantic annotations by exploiting latent features of the DB-StyleGAN2 model. We evaluate BiOcularGAN through extensive experiments across five diverse ocular datasets and analyze the effects of bimodal data generation on image quality and the produced annotations. Our experimental results show that BiOcularGAN is able to produce high-quality matching bimodal images and annotations (with minimal manual intervention) that can be used to train highly competitive (deep) segmentation models (in a privacy-aware manner) that perform well across multiple real-world datasets. The source code for the BiOcularGAN framework is publicly available at: https://github.com/dariant/BiOcularGAN.},
keywords = {biometrics, CNN, data synthesis, deep learning, ocular, segmentation, StyleGAN, synthetic data},
pubstate = {published},
tppubtype = {inproceedings}
}
Current state-of-the-art segmentation techniques for ocular images are critically dependent on large-scale annotated datasets, which are labor-intensive to gather and often raise privacy concerns. In this paper, we present a novel framework, called BiOcularGAN, capable of generating synthetic large-scale datasets of photorealistic (visible light and near-infrared) ocular images, together with corresponding segmentation labels to address these issues. At its core, the framework relies on a novel Dual-Branch StyleGAN2 (DB-StyleGAN2) model that facilitates bimodal image generation, and a Semantic Mask Generator (SMG) component that produces semantic annotations by exploiting latent features of the DB-StyleGAN2 model. We evaluate BiOcularGAN through extensive experiments across five diverse ocular datasets and analyze the effects of bimodal data generation on image quality and the produced annotations. Our experimental results show that BiOcularGAN is able to produce high-quality matching bimodal images and annotations (with minimal manual intervention) that can be used to train highly competitive (deep) segmentation models (in a privacy-aware manner) that perform well across multiple real-world datasets. The source code for the BiOcularGAN framework is publicly available at: https://github.com/dariant/BiOcularGAN. |
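The following toy sketch is only a loose, hedged illustration of the bimodal idea described above and not the BiOcularGAN code: a shared synthesis trunk feeds separate visible-light and near-infrared output branches, and a small mask head reuses the trunk's features in the spirit of the Semantic Mask Generator. All module names and sizes are assumptions; the real model builds on StyleGAN2.

```python
import torch
import torch.nn as nn

class ToyDualBranchGenerator(nn.Module):
    def __init__(self, z_dim=128, ch=64, n_classes=4):
        super().__init__()
        self.fc = nn.Linear(z_dim, ch * 4 * 4)
        self.trunk = nn.Sequential(                 # shared synthesis trunk
            nn.Upsample(scale_factor=2), nn.Conv2d(ch, ch, 3, padding=1), nn.ReLU(),
            nn.Upsample(scale_factor=2), nn.Conv2d(ch, ch, 3, padding=1), nn.ReLU(),
        )
        self.to_rgb = nn.Conv2d(ch, 3, 1)           # visible-light branch
        self.to_nir = nn.Conv2d(ch, 1, 1)           # near-infrared branch
        self.to_mask = nn.Conv2d(ch, n_classes, 1)  # semantic annotation head

    def forward(self, z):
        x = self.fc(z).view(z.size(0), -1, 4, 4)
        feats = self.trunk(x)                       # shared features drive all three outputs
        return self.to_rgb(feats), self.to_nir(feats), self.to_mask(feats)

rgb, nir, mask_logits = ToyDualBranchGenerator()(torch.randn(2, 128))
```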
Huber, Marco; Boutros, Fadi; Luu, Anh Thi; Raja, Kiran; Ramachandra, Raghavendra; Damer, Naser; Neto, Pedro C.; Goncalves, Tiago; Sequeira, Ana F.; Cardoso, Jaime S.; Tremoco, João; Lourenco, Miguel; Serra, Sergio; Cermeno, Eduardo; Ivanovska, Marija; Batagelj, Borut; Kronovšek, Andrej; Peer, Peter; Štruc, Vitomir SYN-MAD 2022: Competition on Face Morphing Attack Detection based on Privacy-aware Synthetic Training Data Proceedings Article In: IEEE International Joint Conference on Biometrics (IJCB), pp. 1-10, 2022, ISBN: 978-1-6654-6394-2. @inproceedings{IvanovskaSYNMAD,
title = {SYN-MAD 2022: Competition on Face Morphing Attack Detection based on Privacy-aware Synthetic Training Data},
author = {Marco Huber and Fadi Boutros and Anh Thi Luu and Kiran Raja and Raghavendra Ramachandra and Naser Damer and Pedro C. Neto and Tiago Goncalves and Ana F. Sequeira and Jaime S. Cardoso and João Tremoco and Miguel Lourenco and Sergio Serra and Eduardo Cermeno and Marija Ivanovska and Borut Batagelj and Andrej Kronovšek and Peter Peer and Vitomir Štruc},
url = {https://ieeexplore.ieee.org/iel7/10007927/10007928/10007950.pdf?casa_token=k7CV1Vs4DUsAAAAA:xMvzvPAyLBoPv1PqtJQTmZQ9S3TJOlExgcxOeuZPNEuVFKVuIfofx30CgN-jnhVB8_5o_Ne3nJLB},
doi = {10.1109/IJCB54206.2022.10007950},
isbn = {978-1-6654-6394-2},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
booktitle = {IEEE International Joint Conference on Biometrics (IJCB)},
pages = {1-10},
keywords = {data synthesis, deep learning, face, face PAD, pad, synthetic data},
pubstate = {published},
tppubtype = {inproceedings}
}
|
Šircelj, Jaka; Peer, Peter; Solina, Franc; Štruc, Vitomir Hierarchical Superquadric Decomposition with Implicit Space Separation Proceedings Article In: Proceedings of ERK 2022, pp. 1-4, 2022. @inproceedings{SirceljSuperQuadrics,
title = {Hierarchical Superquadric Decomposition with Implicit Space Separation},
author = {Jaka Šircelj and Peter Peer and Franc Solina and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2022/08/sq_erk.pdf},
year = {2022},
date = {2022-08-01},
urldate = {2022-08-01},
booktitle = {Proceedings of ERK 2022},
pages = {1-4},
abstract = {We introduce a new method to reconstruct 3D objects using a set of volumetric primitives, i.e., superquadrics. The method hierarchically decomposes a target 3D object into pairs of superquadrics recovering finer and finer details. While such hierarchical methods have been studied before, we introduce a new way of splitting the object space using only properties of the predicted superquadrics. The method is trained and evaluated on the ShapeNet dataset. The results of our experiments suggest that reasonable reconstructions can be obtained with the proposed approach for a diverse set of objects with complex geometry.},
keywords = {CNN, deep learning, depth estimation, iterative procedure, model fitting, recursive model, superquadric, superquadrics, volumetric primitive},
pubstate = {published},
tppubtype = {inproceedings}
}
We introduce a new method to reconstruct 3D objects using a set of volumetric primitives, i.e., superquadrics. The method hierarchically decomposes a target 3D object into pairs of superquadrics recovering finer and finer details. While such hierarchical methods have been studied before, we introduce a new way of splitting the object space using only properties of the predicted superquadrics. The method is trained and evaluated on the ShapeNet dataset. The results of our experiments suggest that reasonable reconstructions can be obtained with the proposed approach for a diverse set of objects with complex geometry. |
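As a rough structural illustration of the "fit a pair, split the space, recurse" procedure described above (and emphatically not the paper's learned predictor), the sketch below uses a hypothetical fit_superquadric_pair() placeholder that simply splits the point cloud along its dominant axis; each leaf stands in for one fitted primitive.

```python
import numpy as np

def fit_superquadric_pair(points):
    """Placeholder: return two 'primitives', here just centroid/point-set pairs
    obtained by splitting along the axis with the largest extent."""
    axis = np.argmax(np.ptp(points, axis=0))
    median = np.median(points[:, axis])
    left, right = points[points[:, axis] <= median], points[points[:, axis] > median]
    return (left.mean(axis=0), left), (right.mean(axis=0), right)

def decompose(points, depth, max_depth=3, min_points=50):
    """Recursively split the point set; each leaf would hold one fitted superquadric."""
    if depth >= max_depth or len(points) < min_points:
        return [points.mean(axis=0)]                # leaf primitive (centroid as a proxy)
    (c1, p1), (c2, p2) = fit_superquadric_pair(points)
    return decompose(p1, depth + 1, max_depth, min_points) + \
           decompose(p2, depth + 1, max_depth, min_points)

primitives = decompose(np.random.rand(1000, 3), depth=0)
print(len(primitives), "leaf primitives")
```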
Dvoršak, Grega; Dwivedi, Ankita; Štruc, Vitomir; Peer, Peter; Emeršič, Žiga Kinship Verification from Ear Images: An Explorative Study with Deep Learning Models Proceedings Article In: International Workshop on Biometrics and Forensics (IWBF), pp. 1–6, 2022. @inproceedings{KinEars,
title = {Kinship Verification from Ear Images: An Explorative Study with Deep Learning Models},
author = {Grega Dvoršak and Ankita Dwivedi and Vitomir Štruc and Peter Peer and Žiga Emeršič},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2022/03/Gregovi_Uhlji_Template-2.pdf},
year = {2022},
date = {2022-04-21},
urldate = {2022-04-21},
booktitle = {International Workshop on Biometrics and Forensics (IWBF)},
pages = {1--6},
abstract = {The analysis of kin relations from visual data represents a challenging research problem with important real-world applications. However, research in this area has mostly been limited to the analysis of facial images, despite the potential of other physical (human) characteristics for this task. In this paper, we therefore study the problem of kinship verification from ear images and investigate whether salient appearance characteristics, useful for this task, can be extracted from ear data. To facilitate the study, we introduce a novel dataset, called KinEar, that contains data from 19 families with each family member having from 15 to 31 ear images. Using the KinEar data, we conduct experiments using a Siamese training setup and 5 recent deep learning backbones. The results of our experiments suggest that ear images represent a viable alternative to other modalities for kinship verification, as 4 out of 5 considered models reach a performance of over 60% in terms of the Area Under the Receiver Operating Characteristic curve (ROC-AUC).},
keywords = {biometrics, CNN, deep learning, ear, ear biometrics, kinear, kinship, kinship recognition, transformer},
pubstate = {published},
tppubtype = {inproceedings}
}
The analysis of kin relations from visual data represents a challenging research problem with important real-world applications. However, research in this area has mostly been limited to the analysis of facial images, despite the potential of other physical (human) characteristics for this task. In this paper, we therefore study the problem of kinship verification from ear images and investigate whether salient appearance characteristics, useful for this task, can be extracted from ear data. To facilitate the study, we introduce a novel dataset, called KinEar, that contains data from 19 families with each family member having from 15 to 31 ear images. Using the KinEar data, we conduct experiments using a Siamese training setup and 5 recent deep learning backbones. The results of our experiments suggest that ear images represent a viable alternative to other modalities for kinship verification, as 4 out of 5 considered models reach a performance of over 60% in terms of the Area Under the Receiver Operating Characteristic curve (ROC-AUC). |
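The sketch below is a minimal, hedged example of a Siamese verification setup of the kind described in the abstract, not the paper's exact configuration: a shared (here deliberately tiny) backbone embeds both ear images and a small head predicts kin / not-kin from the absolute embedding difference; the paper itself evaluates several recent deep backbones in this role.

```python
import torch
import torch.nn as nn

class SiameseKinVerifier(nn.Module):
    def __init__(self, emb_dim=128):
        super().__init__()
        self.backbone = nn.Sequential(              # stand-in for a deep backbone
            nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, emb_dim),
        )
        self.head = nn.Sequential(nn.Linear(emb_dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, ear_a, ear_b):
        za, zb = self.backbone(ear_a), self.backbone(ear_b)
        return self.head(torch.abs(za - zb)).squeeze(1)   # one kinship logit per pair

model = SiameseKinVerifier()
logits = model(torch.randn(8, 3, 112, 112), torch.randn(8, 3, 112, 112))
loss = nn.functional.binary_cross_entropy_with_logits(logits, torch.ones(8))
```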
Jug, Julijan; Lampe, Ajda; Štruc, Vitomir; Peer, Peter Body Segmentation Using Multi-task Learning Proceedings Article In: International Conference on Artificial Intelligence in Information and Communication (ICAIIC), IEEE, 2022, ISBN: 978-1-6654-5818-4. @inproceedings{JulijanJugBody,
title = {Body Segmentation Using Multi-task Learning},
author = {Julijan Jug and Ajda Lampe and Vitomir Štruc and Peter Peer},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2022/03/ICAIIC_paper.pdf},
doi = {10.1109/ICAIIC54071.2022.9722662},
isbn = {978-1-6654-5818-4},
year = {2022},
date = {2022-01-20},
urldate = {2022-01-20},
booktitle = {International Conference on Artificial Intelligence in Information and Communication (ICAIIC)},
publisher = {IEEE},
abstract = {Body segmentation is an important step in many computer vision problems involving human images and one of the key components that affects the performance of all downstream tasks. Several prior works have approached this problem using a multi-task model that exploits correlations between different tasks to improve segmentation performance. Based on the success of such solutions, we present in this paper a novel multi-task model for human segmentation/parsing that involves three tasks, i.e., (i) keypoint-based skeleton estimation, (ii) dense pose prediction, and (iii) human-body segmentation. The main idea behind the proposed Segmentation--Pose--DensePose model (or SPD for short) is to learn a better segmentation model by sharing knowledge across different, yet related tasks. SPD is based on a shared deep neural network backbone that branches off into three task-specific model heads and is learned using a multi-task optimization objective. The performance of the model is analysed through rigorous experiments on the LIP and ATR datasets and in comparison to a recent (state-of-the-art) multi-task body-segmentation model. Comprehensive ablation studies are also presented. Our experimental results show that the proposed multi-task (segmentation) model is highly competitive and that the introduction of additional tasks contributes towards a higher overall segmentation performance. },
keywords = {body segmentation, cn, CNN, computer vision, deep beauty, deep learning, multi-task learning, segmentation, virtual try-on},
pubstate = {published},
tppubtype = {inproceedings}
}
Body segmentation is an important step in many computer vision problems involving human images and one of the key components that affects the performance of all downstream tasks. Several prior works have approached this problem using a multi-task model that exploits correlations between different tasks to improve segmentation performance. Based on the success of such solutions, we present in this paper a novel multi-task model for human segmentation/parsing that involves three tasks, i.e., (i) keypoint-based skeleton estimation, (ii) dense pose prediction, and (iii) human-body segmentation. The main idea behind the proposed Segmentation--Pose--DensePose model (or SPD for short) is to learn a better segmentation model by sharing knowledge across different, yet related tasks. SPD is based on a shared deep neural network backbone that branches off into three task-specific model heads and is learned using a multi-task optimization objective. The performance of the model is analysed through rigorous experiments on the LIP and ATR datasets and in comparison to a recent (state-of-the-art) multi-task body-segmentation model. Comprehensive ablation studies are also presented. Our experimental results show that the proposed multi-task (segmentation) model is highly competitive and that the introduction of additional tasks contributes towards a higher overall segmentation performance. |
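To make the multi-task layout above concrete, here is a hedged toy version (not the SPD implementation): a shared encoder branches into keypoint, dense-pose and segmentation heads, trained with a summed multi-task objective. Head sizes and the loss weighting are assumptions, and only the segmentation term is spelled out.

```python
import torch
import torch.nn as nn

class ToySPD(nn.Module):
    def __init__(self, n_keypoints=17, n_dense=25, n_seg=2):
        super().__init__()
        self.backbone = nn.Sequential(               # shared feature extractor
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
        )
        self.kpt_head = nn.Conv2d(64, n_keypoints, 1)   # keypoint heatmaps
        self.dp_head = nn.Conv2d(64, n_dense, 1)        # dense-pose part labels
        self.seg_head = nn.Conv2d(64, n_seg, 1)         # body segmentation

    def forward(self, x):
        f = self.backbone(x)
        return self.kpt_head(f), self.dp_head(f), self.seg_head(f)

x = torch.randn(2, 3, 64, 64)
kpt, dp, seg = ToySPD()(x)
seg_target = torch.randint(0, 2, (2, 64, 64))
# illustrative multi-task objective: keypoint and dense-pose terms would be added analogously
loss = nn.functional.cross_entropy(seg, seg_target)
```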
2021
|
Grm, Klemen; Vitomir, Štruc Frequency Band Encoding for Face Super-Resolution Proceedings Article In: Proceedings of ERK 2021, pp. 1-4, 2021. @inproceedings{Grm-SuperResolution_ERK2021,
title = {Frequency Band Encoding for Face Super-Resolution},
author = {Klemen Grm and Štruc Vitomir},
url = {https://lmi.fe.uni-lj.si/wp-content/uploads/2021/10/SRAE_ERK21.pdf},
year = {2021},
date = {2021-09-10},
booktitle = {Proceedings of ERK 2021},
pages = {1-4},
abstract = {In this paper, we present a novel method for face super-resolution based on an encoder-decoder architecture. Unlike previous approaches, which focused primarily on directly reconstructing the high-resolution face appearance from low-resolution images, our method relies on a multi-stage approach where we learn a face representation in different frequency bands, followed by decoding the representation into a high-resolution image. Using quantitative experiments, we are able to demonstrate that this approach results in better face image reconstruction, as well as aiding in downstream semantic tasks such as face recognition and face verification.},
keywords = {CNN, deep learning, face, face hallucination, frequency encoding, super-resolution},
pubstate = {published},
tppubtype = {inproceedings}
}
In this paper, we present a novel method for face super-resolution based on an encoder-decoder architecture. Unlike previous approaches, which focused primarily on directly reconstructing the high-resolution face appearance from low-resolution images, our method relies on a multi-stage approach where we learn a face representation in different frequency bands, followed by decoding the representation into a high-resolution image. Using quantitative experiments, we are able to demonstrate that this approach results in better face image reconstruction, as well as aiding in downstream semantic tasks such as face recognition and face verification. |
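The snippet below is my own hedged simplification of the frequency-band idea, not the paper's architecture: the low-resolution face is split into a crude low-frequency band (an average-pooled copy) and a high-frequency residual, each band is encoded separately, and the concatenated codes are decoded into a 2x upscaled image. The band-splitting filter and all layer sizes are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BandEncoderDecoder(nn.Module):
    def __init__(self, ch=32):
        super().__init__()
        self.enc_low = nn.Conv2d(3, ch, 3, padding=1)    # low-frequency branch
        self.enc_high = nn.Conv2d(3, ch, 3, padding=1)   # high-frequency branch
        self.decode = nn.Sequential(
            nn.Conv2d(2 * ch, ch, 3, padding=1), nn.ReLU(),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.Conv2d(ch, 3, 3, padding=1),
        )

    def forward(self, lr):
        low = F.avg_pool2d(lr, 5, stride=1, padding=2)   # cheap low-pass filter
        high = lr - low                                  # residual high-pass band
        codes = torch.cat([torch.relu(self.enc_low(low)),
                           torch.relu(self.enc_high(high))], dim=1)
        return self.decode(codes)                        # 2x upscaled reconstruction

sr = BandEncoderDecoder()(torch.randn(1, 3, 32, 32))     # -> (1, 3, 64, 64)
```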
Batagelj, Borut; Peer, Peter; Štruc, Vitomir; Dobrišek, Simon How to correctly detect face-masks for COVID-19 from visual information? Journal Article In: Applied sciences, vol. 11, no. 5, pp. 1-24, 2021, ISBN: 2076-3417. @article{Batagelj2021,
title = {How to correctly detect face-masks for COVID-19 from visual information?},
author = {Borut Batagelj and Peter Peer and Vitomir Štruc and Simon Dobrišek},
url = {https://www.mdpi.com/2076-3417/11/5/2070/pdf},
doi = {10.3390/app11052070},
isbn = {2076-3417},
year = {2021},
date = {2021-03-01},
urldate = {2021-03-01},
journal = {Applied sciences},
volume = {11},
number = {5},
pages = {1-24},
abstract = {The new Coronavirus disease (COVID-19) has seriously affected the world. By the end of November 2020, the global number of new coronavirus cases had already exceeded 60 million and the number of deaths 1,410,378 according to information from the World Health Organization (WHO). To limit the spread of the disease, mandatory face-mask rules are now becoming common in public settings around the world. Additionally, many public service providers require customers to wear face-masks in accordance with predefined rules (e.g., covering both mouth and nose) when using public services. These developments inspired research into automatic (computer-vision-based) techniques for face-mask detection that can help monitor public behavior and contribute towards constraining the COVID-19 pandemic. Although existing research in this area has resulted in efficient techniques for face-mask detection, these usually operate under the assumption that modern face detectors provide perfect detection performance (even for masked faces) and that the main goal of the techniques is to detect the presence of face-masks only. In this study, we revisit these common assumptions and explore the following research questions: (i) How well do existing face detectors perform with masked-face images? (ii) Is it possible to detect a proper (regulation-compliant) placement of facial masks? and (iii) How useful are existing face-mask detection techniques for monitoring applications during the COVID-19 pandemic? To answer these and related questions we conduct a comprehensive experimental evaluation of several recent face detectors for their performance with masked-face images. Furthermore, we investigate the usefulness of multiple off-the-shelf deep-learning models for recognizing correct face-mask placement. Finally, we design a complete pipeline for recognizing whether face-masks are worn correctly or not and compare the performance of the pipeline with standard face-mask detection models from the literature. To facilitate the study, we compile a large dataset of facial images from the publicly available MAFA and Wider Face datasets and annotate it with compliant and non-compliant labels. The annotated dataset, called the Face-Mask-Label Dataset (FMLD), is made publicly available to the research community.},
keywords = {computer vision, COVID-19, deep learning, detection, face, mask detection, recognition},
pubstate = {published},
tppubtype = {article}
}
The new Coronavirus disease (COVID-19) has seriously affected the world. By the end of November 2020, the global number of new coronavirus cases had already exceeded 60 million and the number of deaths 1,410,378 according to information from the World Health Organization (WHO). To limit the spread of the disease, mandatory face-mask rules are now becoming common in public settings around the world. Additionally, many public service providers require customers to wear face-masks in accordance with predefined rules (e.g., covering both mouth and nose) when using public services. These developments inspired research into automatic (computer-vision-based) techniques for face-mask detection that can help monitor public behavior and contribute towards constraining the COVID-19 pandemic. Although existing research in this area has resulted in efficient techniques for face-mask detection, these usually operate under the assumption that modern face detectors provide perfect detection performance (even for masked faces) and that the main goal of the techniques is to detect the presence of face-masks only. In this study, we revisit these common assumptions and explore the following research questions: (i) How well do existing face detectors perform with masked-face images? (ii) Is it possible to detect a proper (regulation-compliant) placement of facial masks? and (iii) How useful are existing face-mask detection techniques for monitoring applications during the COVID-19 pandemic? To answer these and related questions we conduct a comprehensive experimental evaluation of several recent face detectors for their performance with masked-face images. Furthermore, we investigate the usefulness of multiple off-the-shelf deep-learning models for recognizing correct face-mask placement. Finally, we design a complete pipeline for recognizing whether face-masks are worn correctly or not and compare the performance of the pipeline with standard face-mask detection models from the literature. To facilitate the study, we compile a large dataset of facial images from the publicly available MAFA and Wider Face datasets and annotate it with compliant and non-compliant labels. The annotated dataset, called the Face-Mask-Label Dataset (FMLD), is made publicly available to the research community. |
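A hedged skeleton of the detector-plus-classifier pipeline discussed above (not the paper's code) might look as follows; detect_faces() is a hypothetical placeholder standing in for any off-the-shelf face detector, and the tiny classifier stands in for the off-the-shelf deep models evaluated in the paper.

```python
import torch
import torch.nn as nn

def detect_faces(image):
    """Placeholder detector: return a list of (x1, y1, x2, y2) boxes."""
    return [(0, 0, image.shape[-1], image.shape[-2])]   # whole image as one 'face'

mask_classifier = nn.Sequential(                        # stand-in for a mask-placement CNN
    nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 2),  # compliant / non-compliant
)

def check_mask_compliance(image):
    results = []
    for (x1, y1, x2, y2) in detect_faces(image):
        crop = image[:, :, y1:y2, x1:x2]
        crop = nn.functional.interpolate(crop, size=(64, 64), mode='bilinear',
                                         align_corners=False)
        results.append(mask_classifier(crop).argmax(dim=1).item())
    return results                                      # predicted class index per detection

print(check_mask_compliance(torch.randn(1, 3, 128, 128)))
```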
2020
|
Stepec, Dejan; Emersic, Ziga; Peer, Peter; Struc, Vitomir Constellation-Based Deep Ear Recognition Book Section In: Jiang, R.; Li, CT.; Crookes, D.; Meng, W.; Rosenberger, C. (Ed.): Deep Biometrics: Unsupervised and Semi-Supervised Learning, Springer, 2020, ISBN: 978-3-030-32582-4. @incollection{Stepec2020COMEar,
title = {Constellation-Based Deep Ear Recognition},
author = {Dejan Stepec and Ziga Emersic and Peter Peer and Vitomir Struc},
editor = {R. Jiang and CT. Li and D. Crookes and W. Meng and C. Rosenberger},
url = {https://link.springer.com/chapter/10.1007/978-3-030-32583-1_8
https://lmi.fe.uni-lj.si/wp-content/uploads/2020/02/DeepBio2019___REMIX.pdf},
doi = {https://doi.org/10.1007/978-3-030-32583-1_8},
isbn = {978-3-030-32582-4},
year = {2020},
date = {2020-01-29},
booktitle = {Deep Biometrics: Unsupervised and Semi-Supervised Learning},
publisher = {Springer},
abstract = {This chapter introduces COM-Ear, a deep constellation model for ear recognition. Different from competing solutions, COM-Ear encodes global as well as local characteristics of ear images and generates descriptive ear representations that ensure competitive recognition performance. The model is designed as a dual-path convolutional neural network (CNN), where one path processes the input in a holistic manner, and the second captures local image characteristics from image patches sampled from the input image. A novel pooling operation, called patch-relevant-information pooling, is also proposed and integrated into the COM-Ear model. The pooling operation helps to select features from the input patches that are locally important and to focus the attention of the network on image regions that are descriptive and important for representation purposes. The model is trained in an end-to-end manner using a combined cross-entropy and center loss. Extensive experiments on the recently introduced Extended Annotated Web Ears (AWEx) dataset are also presented.},
keywords = {biometrics, CNN, deep learning, ear recognition, neural networks},
pubstate = {published},
tppubtype = {incollection}
}
This chapter introduces COM-Ear, a deep constellation model for ear recognition. Different from competing solutions, COM-Ear encodes global as well as local characteristics of ear images and generates descriptive ear representations that ensure competitive recognition performance. The model is designed as a dual-path convolutional neural network (CNN), where one path processes the input in a holistic manner, and the second captures local image characteristics from image patches sampled from the input image. A novel pooling operation, called patch-relevant-information pooling, is also proposed and integrated into the COM-Ear model. The pooling operation helps to select features from the input patches that are locally important and to focus the attention of the network on image regions that are descriptive and important for representation purposes. The model is trained in an end-to-end manner using a combined cross-entropy and center loss. Extensive experiments on the recently introduced Extended Annotated Web Ears (AWEx) dataset are also presented. |
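As an illustration only (not COM-Ear itself), the sketch below combines a holistic branch with a patch branch whose features are pooled using learned relevance weights, a rough stand-in for the patch-relevant-information pooling described above; the loss line shows only the cross-entropy term, with the center loss left as a comment. Patch size, feature dimensions and the number of identities are assumptions.

```python
import torch
import torch.nn as nn

class ToyDualPathEar(nn.Module):
    def __init__(self, n_ids=100, dim=64):
        super().__init__()
        self.holistic = nn.Sequential(                  # global (holistic) path
            nn.Conv2d(3, dim, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        self.patch_net = nn.Sequential(nn.Linear(3 * 16 * 16, dim), nn.ReLU())
        self.relevance = nn.Linear(dim, 1)              # per-patch relevance score
        self.classifier = nn.Linear(2 * dim, n_ids)

    def forward(self, x):                               # x: (B, 3, 64, 64)
        g = self.holistic(x)                            # global descriptor (B, dim)
        patches = x.unfold(2, 16, 16).unfold(3, 16, 16) # (B, 3, 4, 4, 16, 16)
        patches = patches.permute(0, 2, 3, 1, 4, 5).reshape(x.size(0), 16, -1)
        p = self.patch_net(patches)                     # (B, 16, dim)
        w = torch.softmax(self.relevance(p), dim=1)     # relevance-weighted pooling
        loc = (w * p).sum(dim=1)                        # local descriptor (B, dim)
        return self.classifier(torch.cat([g, loc], dim=1))

logits = ToyDualPathEar()(torch.randn(4, 3, 64, 64))
loss = nn.functional.cross_entropy(logits, torch.randint(0, 100, (4,)))  # + center loss
```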
Grm, Klemen; Scheirer, Walter J.; Štruc, Vitomir Face hallucination using cascaded super-resolution and identity priors Journal Article In: IEEE Transactions on Image Processing, 2020. @article{TIPKlemen_2020,
title = {Face hallucination using cascaded super-resolution and identity priors},
author = {Klemen Grm and Walter J. Scheirer and Vitomir Štruc},
url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8866753
https://lmi.fe.uni-lj.si/wp-content/uploads/2023/02/IEEET_face_hallucination_compressed.pdf},
doi = {10.1109/TIP.2019.2945835},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
journal = {IEEE Transactions on Image Processing},
abstract = {In this paper we address the problem of hallucinating high-resolution facial images from low-resolution inputs at high magnification factors. We approach this task with convolutional neural networks (CNNs) and propose a novel (deep) face hallucination model that incorporates identity priors into the learning procedure. The model consists of two main parts: i) a cascaded super-resolution network that upscales the low-resolution facial images, and ii) an ensemble of face recognition models that act as identity priors for the super-resolution network during training. Different from most competing super-resolution techniques that rely on a single model for upscaling (even with large magnification factors), our network uses a cascade of multiple SR models that progressively upscale the low-resolution images using steps of 2×. This characteristic allows us to apply supervision signals (target appearances) at different resolutions and incorporate identity constraints at multiple scales. The proposed C-SRIP model (Cascaded Super Resolution with Identity Priors) is able to upscale (tiny) low-resolution images captured in unconstrained conditions and produce visually convincing results for diverse low-resolution inputs. We rigorously evaluate the proposed model on the Labeled Faces in the Wild (LFW), Helen and CelebA datasets and report superior performance compared to the existing state-of-the-art.},
keywords = {biometrics, CNN, computer vision, deep learning, face, face hallucination, super-resolution},
pubstate = {published},
tppubtype = {article}
}
In this paper we address the problem of hallucinating high-resolution facial images from low-resolution inputs at high magnification factors. We approach this task with convolutional neural networks (CNNs) and propose a novel (deep) face hallucination model that incorporates identity priors into the learning procedure. The model consists of two main parts: i) a cascaded super-resolution network that upscales the low-resolution facial images, and ii) an ensemble of face recognition models that act as identity priors for the super-resolution network during training. Different from most competing super-resolution techniques that rely on a single model for upscaling (even with large magnification factors), our network uses a cascade of multiple SR models that progressively upscale the low-resolution images using steps of 2×. This characteristic allows us to apply supervision signals (target appearances) at different resolutions and incorporate identity constraints at multiple scales. The proposed C-SRIP model (Cascaded Super Resolution with Identity Priors) is able to upscale (tiny) low-resolution images captured in unconstrained conditions and produce visually convincing results for diverse low-resolution inputs. We rigorously evaluate the proposed model on the Labeled Faces in the Wild (LFW), Helen and CelebA datasets and report superior performance compared to the existing state-of-the-art.
|
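The following hedged toy cascade (not the C-SRIP model) illustrates the two ingredients described above: a chain of 2x super-resolution blocks supervised at every intermediate scale, plus an identity term computed by a stand-in embedding network playing the role of the identity prior. Block depth, channel counts and loss weights are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SRBlock(nn.Module):                        # one 2x upscaling stage
    def __init__(self, ch=32):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(3, ch, 3, padding=1), nn.ReLU(),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
            nn.Conv2d(ch, 3, 3, padding=1),
        )

    def forward(self, x):
        return self.body(x)

cascade = nn.ModuleList([SRBlock(), SRBlock(), SRBlock()])   # 8x total magnification
identity_net = nn.Sequential(nn.AdaptiveAvgPool2d(8), nn.Flatten(),
                             nn.Linear(3 * 8 * 8, 64))       # stand-in identity embedder

lr = torch.randn(2, 3, 16, 16)
hr = torch.randn(2, 3, 128, 128)
loss, x = 0.0, lr
for block in cascade:
    x = block(x)                                 # progressively 32 -> 64 -> 128 px
    target = F.interpolate(hr, size=x.shape[-2:], mode='bilinear', align_corners=False)
    loss = loss + F.l1_loss(x, target)           # supervision at every scale
loss = loss + F.mse_loss(identity_net(x), identity_net(hr))  # identity-prior term
```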
2019
|
Rot, Peter; Vitek, Matej; Grm, Klemen; Emeršič, Žiga; Peer, Peter; Štruc, Vitomir Deep Sclera Segmentation and Recognition Book Section In: Uhl, Andreas; Busch, Christoph; Marcel, Sebastien; Veldhuis, Rainer (Ed.): Handbook of Vascular Biometrics, pp. 395-432, Springer, 2019, ISBN: 978-3-030-27731-4. @incollection{ScleraNetChapter,
title = {Deep Sclera Segmentation and Recognition},
author = {Peter Rot and Matej Vitek and Klemen Grm and Žiga Emeršič and Peter Peer and Vitomir Štruc},
editor = {Andreas Uhl and Christoph Busch and Sebastien Marcel and Rainer Veldhuis},
url = {https://link.springer.com/content/pdf/10.1007%2F978-3-030-27731-4_13.pdf},
doi = {https://doi.org/10.1007/978-3-030-27731-4_13},
isbn = {978-3-030-27731-4},
year = {2019},
date = {2019-11-14},
booktitle = {Handbook of Vascular Biometrics},
pages = {395-432},
publisher = {Springer},
chapter = {13},
series = {Advances in Computer Vision and Pattern Recognition},
abstract = {In this chapter, we address the problem of biometric identity recognition from the vasculature of the human sclera. Specifically, we focus on the challenging task of multi-view sclera recognition, where the visible part of the sclera vasculature changes from image to image due to varying gaze (or view) directions. We propose a complete solution for this task built around Convolutional Neural Networks (CNNs) and make several contributions that result in state-of-the-art recognition performance, i.e.: (i) we develop a cascaded CNN assembly that is able to robustly segment the sclera vasculature from the input images regardless of gaze direction, and (ii) we present ScleraNET, a CNN model trained in a multi-task manner (combining losses pertaining to identity and view-direction recognition) that allows for the extraction of discriminative vasculature descriptors that can be used for identity inference. To evaluate the proposed contributions, we also introduce a new dataset of ocular images, called the Sclera Blood Vessels, Periocular and Iris (SBVPI) dataset, which represents one of the few publicly available datasets suitable for research in multi-view sclera segmentation and recognition. The dataset comes with a rich set of annotations, such as a per-pixel markup of various eye parts (including the sclera vasculature), identity, gaze-direction and gender labels. We conduct rigorous experiments on SBVPI with competing techniques from the literature and show that the combination of the proposed segmentation and descriptor-computation models results in highly competitive recognition performance.},
keywords = {biometrics, CNN, deep learning, ocular, sclera, segmentation, vasculature},
pubstate = {published},
tppubtype = {incollection}
}
In this chapter, we address the problem of biometric identity recognition from the vasculature of the human sclera. Specifically, we focus on the challenging task of multi-view sclera recognition, where the visible part of the sclera vasculature changes from image to image due to varying gaze (or view) directions. We propose a complete solution for this task built around Convolutional Neural Networks (CNNs) and make several contributions that result in state-of-the-art recognition performance, i.e.: (i) we develop a cascaded CNN assembly that is able to robustly segment the sclera vasculature from the input images regardless of gaze direction, and (ii) we present ScleraNET, a CNN model trained in a multi-task manner (combining losses pertaining to identity and view-direction recognition) that allows for the extraction of discriminative vasculature descriptors that can be used for identity inference. To evaluate the proposed contributions, we also introduce a new dataset of ocular images, called the Sclera Blood Vessels, Periocular and Iris (SBVPI) dataset, which represents one of the few publicly available datasets suitable for research in multi-view sclera segmentation and recognition. The dataset comes with a rich set of annotations, such as a per-pixel markup of various eye parts (including the sclera vasculature), identity, gaze-direction and gender labels. We conduct rigorous experiments on SBVPI with competing techniques from the literature and show that the combination of the proposed segmentation and descriptor-computation models results in highly competitive recognition performance. |
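Purely as an illustration of the two-stage pipeline described above (and not ScleraNET), the sketch below first produces a soft vasculature mask with a stand-in segmentation network and then embeds the masked region with a descriptor CNN that has separate identity and view-direction heads; all sizes, class counts and names are assumptions.

```python
import torch
import torch.nn as nn

seg_net = nn.Sequential(                       # stand-in for the cascaded segmentation CNNs
    nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 1, 1),
)

class ToyScleraDescriptor(nn.Module):
    def __init__(self, n_ids=55, n_views=4, dim=64):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, dim, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
        )
        self.id_head = nn.Linear(dim, n_ids)       # identity branch
        self.view_head = nn.Linear(dim, n_views)   # gaze/view-direction branch

    def forward(self, x):
        d = self.features(x)                       # descriptor used for matching at test time
        return d, self.id_head(d), self.view_head(d)

img = torch.randn(1, 3, 128, 128)
mask = torch.sigmoid(seg_net(img))                 # soft vasculature mask
descriptor, id_logits, view_logits = ToyScleraDescriptor()(img * mask)
```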
2016
|
Stržinar, Žiga; Grm, Klemen; Štruc, Vitomir Učenje podobnosti v globokih nevronskih omrežjih za razpoznavanje obrazov Proceedings Article In: Proceedings of the Electrotechnical and Computer Science Conference (ERK), Portorož, Slovenia, 2016. @inproceedings{ERK2016_sebastjan,
title = {Učenje podobnosti v globokih nevronskih omrežjih za razpoznavanje obrazov},
author = {Žiga Stržinar and Klemen Grm and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/en/ucenjepodobnostivglobokihnevronskihomrezjihzarazpoznavanjeobrazov/},
year = {2016},
date = {2016-09-20},
urldate = {2016-09-20},
booktitle = {Proceedings of the Electrotechnical and Computer Science Conference (ERK)},
address = {Portorož, Slovenia},
abstract = {Learning the similarity between pairs of input images represents one of the most popular approaches to recognition in the field of deep learning. In this approach, a deep neural network receives a pair of (face) images at its input and returns a similarity measure between the two input images that can be used for recognition. The similarity computation can be implemented entirely by the deep network, or the network can be used only to compute a representation of the input image pair, with the mapping from the computed representation to a similarity measure carried out by a different, potentially more suitable model. In this paper, we evaluate 5 different models for the mapping between the computed representation and the similarity measure, using our own neural network for the experiments. The results of our face recognition experiments point to the importance of choosing a suitable model, as the differences in recognition performance between the models are considerable.},
keywords = {biometrics, CNN, deep learning, difference space, face verification, LFW, performance evaluation},
pubstate = {published},
tppubtype = {inproceedings}
}
Learning the similarity between pairs of input images represents one of the most popular approaches to recognition in the field of deep learning. In this approach, a deep neural network receives a pair of (face) images at its input and returns a similarity measure between the two input images that can be used for recognition. The similarity computation can be implemented entirely by the deep network, or the network can be used only to compute a representation of the input image pair, with the mapping from the computed representation to a similarity measure carried out by a different, potentially more suitable model. In this paper, we evaluate 5 different models for the mapping between the computed representation and the similarity measure, using our own neural network for the experiments. The results of our face recognition experiments point to the importance of choosing a suitable model, as the differences in recognition performance between the models are considerable. |
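Echoing the paper's comparison of models that map a fixed network representation to a similarity score, here is a hedged, self-contained illustration that uses random vectors in place of real face embeddings and a few scikit-learn classifiers in place of the paper's five models; the difference-space representation |z1 - z2| is one common choice, not necessarily the one used in the paper.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
z1, z2 = rng.normal(size=(1000, 128)), rng.normal(size=(1000, 128))
labels = rng.integers(0, 2, size=1000)             # 1 = same identity (dummy labels)
pair_repr = np.abs(z1 - z2)                        # difference-space pair representation

X_tr, X_te, y_tr, y_te = train_test_split(pair_repr, labels, test_size=0.3, random_state=0)
for name, model in [("logreg", LogisticRegression(max_iter=1000)),
                    ("linear SVM", SVC(kernel="linear")),
                    ("RBF SVM", SVC(kernel="rbf"))]:
    print(name, model.fit(X_tr, y_tr).score(X_te, y_te))   # verification accuracy
```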
Grm, Klemen; Dobrišek, Simon; Štruc, Vitomir Deep pair-wise similarity learning for face recognition Proceedings Article In: 4th International Workshop on Biometrics and Forensics (IWBF), pp. 1–6, IEEE 2016. @inproceedings{grm2016deep,
title = {Deep pair-wise similarity learning for face recognition},
author = {Klemen Grm and Simon Dobrišek and Vitomir Štruc},
url = {https://lmi.fe.uni-lj.si/en/deeppair-wisesimilaritylearningforfacerecognition/},
year = {2016},
date = {2016-01-01},
urldate = {2016-01-01},
booktitle = {4th International Workshop on Biometrics and Forensics (IWBF)},
pages = {1--6},
organization = {IEEE},
abstract = {Recent advances in deep learning made it possible to build deep hierarchical models capable of delivering state-of-the-art performance in various vision tasks, such as object recognition, detection or tracking. For recognition tasks the most common approach when using deep models is to learn object representations (or features) directly from raw image-input and then feed the learned features to a suitable classifier. Deep models used in this pipeline are typically heavily parameterized and require enormous amounts of training data to deliver competitive recognition performance. Despite the use of data augmentation techniques, many application domains, predefined experimental protocols or specifics of the recognition problem limit the amount of available training data and make training an effective deep hierarchical model a difficult task. In this paper, we present a novel, deep pair-wise similarity learning (DPSL) strategy for deep models, developed specifically to overcome the problem of insufficient training data, and demonstrate its usage on the task of face recognition. Unlike existing (deep) learning strategies, DPSL operates on image-pairs and tries to learn pair-wise image similarities that can be used for recognition purposes directly instead of feature representations that need to be fed to appropriate classification techniques, as with traditional deep learning pipelines. Since our DPSL strategy assumes an image pair as the input to the learning procedure, the amount of training data available to train deep models is quadratic in the number of available training images, which is of paramount importance for models with a large number of parameters. We demonstrate the efficacy of the proposed learning strategy by developing a deep model for pose-invariant face recognition, called Pose-Invariant Similarity Index (PISI), and presenting comparative experimental results on the FERET and IJB-A datasets.},
keywords = {CNN, deep learning, face recognition, IJB-A, IWBF, performance evaluation, similarity learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Recent advances in deep learning made it possible to build deep hierarchical models capable of delivering state-of-the-art performance in various vision tasks, such as object recognition, detection or tracking. For recognition tasks the most common approach when using deep models is to learn object representations (or features) directly from raw image-input and then feed the learned features to a suitable classifier. Deep models used in this pipeline are typically heavily parameterized and require enormous amounts of training data to deliver competitive recognition performance. Despite the use of data augmentation techniques, many application domains, predefined experimental protocols or specifics of the recognition problem limit the amount of available training data and make training an effective deep hierarchical model a difficult task. In this paper, we present a novel, deep pair-wise similarity learning (DPSL) strategy for deep models, developed specifically to overcome the problem of insufficient training data, and demonstrate its usage on the task of face recognition. Unlike existing (deep) learning strategies, DPSL operates on image-pairs and tries to learn pair-wise image similarities that can be used for recognition purposes directly instead of feature representations that need to be fed to appropriate classification techniques, as with traditional deep learning pipelines. Since our DPSL strategy assumes an image pair as the input to the learning procedure, the amount of training data available to train deep models is quadratic in the number of available training images, which is of paramount importance for models with a large number of parameters. We demonstrate the efficacy of the proposed learning strategy by developing a deep model for pose-invariant face recognition, called Pose-Invariant Similarity Index (PISI), and presenting comparative experimental results on the FERET and IJB-A datasets. |
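The hedged sketch below is not the DPSL/PISI model; it only illustrates the two points made above: a network that consumes an image pair directly (here stacked along the channel axis) and outputs a similarity score, and the fact that pairing makes the pool of training examples roughly quadratic in the number of images. The tiny architecture and all sizes are assumptions.

```python
import itertools
import torch
import torch.nn as nn

pair_similarity_net = nn.Sequential(              # consumes a 6-channel stacked image pair
    nn.Conv2d(6, 32, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 1),
)

images = torch.randn(10, 3, 64, 64)               # a tiny pool of training images
pairs = list(itertools.combinations(range(len(images)), 2))
print(f"{len(images)} images -> {len(pairs)} distinct training pairs")  # 45, roughly n^2/2

i, j = pairs[0]
score = pair_similarity_net(torch.cat([images[i:i+1], images[j:j+1]], dim=1))
```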
2015
|
Grm, Klemen; Dobrišek, Simon; Štruc, Vitomir The pose-invariant similarity index for face recognition Proceedings Article In: Proceedings of the Electrotechnical and Computer Science Conference (ERK), Portorož, Slovenia, 2015. @inproceedings{ERK2015Klemen,
title = {The pose-invariant similarity index for face recognition},
author = {Klemen Grm and Simon Dobrišek and Vitomir Štruc},
year = {2015},
date = {2015-04-20},
booktitle = {Proceedings of the Electrotechnical and Computer Science Conference (ERK)},
address = {Portorož, Slovenia},
keywords = {biometrics, CNN, deep learning, deep models, face verification, similarity learning},
pubstate = {published},
tppubtype = {inproceedings}
}
|