We won the Ig Nobel Prize!
Together with Nicolas Krauter, Bettina Derstroff, Christof Stönner, Efstratios Bourtsoukidis, Achim Edtbauer, Jochen Wulf, Thomas Klüpfel, Stefan Kramer, and Jonathan […]
I am a senior lecturer at the School of Computer Science of the University of Auckland, CTO of enviPath, and lead the Machine Learning Group at UoA. My main research area is machine learning and its application to bioinformatics, cheminformatics, and computational sustainability. Before joining the University of Auckland in 2017, I did a PostDoc at the University of Mainz, Germany, and a PhD at the Technical University of Munich, Germany. I am always interested in new research areas for both applied and non-applied machine learning. Currently, I am particularly interested in the reliability of machine learning algorithms, adversarial machine learning, and bias, with applications in chemistry, epidemiology, and environmental research.
I am currently looking for new PhD, Honours, or Masters students. I typically accept two new PhD students per year. If you are interested, check my research areas and contact me by mail with your CV attached. Also check the Supervision page for funding information.
Together with Nicolas Krauter, Bettina Derstroff, Christof Stönner, Efstratios Bourtsoukidis, Achim Edtbauer, Jochen Wulf, Thomas Klüpfel, Stefan Kramer, and Jonathan […]
Bensemann, Joshua; Cheena, Hasnain; Huang, David Tse Juang; Broadbendt, Elizabeth; Williams, Jonathan; Wicker, Jörg
From What You See to What We Smell: Linking Human Emotions to Biomarkers in Breath Journal Article Forthcoming
In: Forthcoming.
@article{bensemann2022from,
  author     = {Bensemann, Joshua and Cheena, Hasnain and Huang, David Tse Juang and Broadbendt, Elizabeth and Williams, Jonathan and Wicker, J{\"o}rg},
  title      = {From What You See to What We Smell: Linking Human Emotions to Biomarkers in Breath},
  year       = {2023},
  date       = {2023-12-01},
  urldate    = {2022-12-01},
  abstract   = {Breath collection is a non-invasive method for monitoring biological processes occurring within the human body. Prior studies have extended these methods to observe the general processes occurring in groups of humans and are able to link them to what those groups are collectively experiencing. However, previous work lacked an objective measure of emotional stimuli. In this research, we applied machine learning techniques to breath data collected from cinema audiences to find associations between the biomarkers in the crowd's breath and both the audio-visual stimuli and thematic events of the movie.
This analysis enabled us to make direct links between what the group was experiencing and their biological response to that experience. To achieve this, we first extracted visual and auditory features from a movie and compared it to the biomarkers in the crowd's breath, using both regression and pattern mining techniques. Our results supported the theory that a crowd's collective experience has a direct correlation to the biomarkers in the crowd's breath. Consequently, these findings suggest that visual and auditory experiences have predictable effects on the human body that can be monitored without the requirement of expensive and/or invasive neuroimaging techniques.},
  keywords   = {},
  pubstate   = {forthcoming},
  tppubtype  = {article}
}
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning – Christchurch, New Zealand Journal Article
In: Natural Hazards and Earth System Sciences, vol. 23, no. 3, pp. 1207-1226, 2023.
Abstract | Links | BibTeX | Altmetric
@article{Roeslin2023development,
  author     = {Roeslin, Samuel and Ma, Quincy and Chigullapally, Pavan and Wicker, J{\"o}rg and Wotherspoon, Liam},
  title      = {Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning -- {Christchurch}, {New Zealand}},
  journal    = {Natural Hazards and Earth System Sciences},
  volume     = {23},
  number     = {3},
  pages      = {1207--1226},
  year       = {2023},
  date       = {2023-03-22},
  doi        = {10.5194/nhess-23-1207-2023},
  url        = {https://nhess.copernicus.org/articles/23/1207/2023/},
  urldate    = {2023-03-22},
  abstract   = {This paper presents a new framework for the seismic loss prediction of residential buildings in Christchurch, New Zealand. It employs data science techniques, geospatial tools, and machine learning (ML) trained on insurance claims data from the Earthquake Commission (EQC) collected following the 2010\textendash2011 Canterbury Earthquake Sequence (CES). The seismic loss prediction obtained from the ML model is shown to outperform the output from existing risk analysis tools for New Zealand for each of the main earthquakes of the CES. In addition to the prediction capabilities, the ML model delivered useful insights into the most important features contributing to losses during the CES. ML correctly highlighted that liquefaction significantly influenced buildings losses for the 22 February 2011 earthquake. The results are consistent with observations, engineering knowledge, and previous studies, confirming the potential of data science and ML in the analysis of insurance claims data and the development of seismic loss prediction models using empirical loss data.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Pullar-Strecker, Zac; Dost, Katharina; Frank, Eibe; Wicker, Jörg
Hitting the Target: Stopping Active Learning at the Cost-Based Optimum Journal Article
In: Machine Learning, 2022.
Abstract | Links | BibTeX | Altmetric
@article{pullar-strecker2022hitting,
  author     = {Pullar-Strecker, Zac and Dost, Katharina and Frank, Eibe and Wicker, J{\"o}rg},
  title      = {Hitting the Target: Stopping Active Learning at the Cost-Based Optimum},
  editor     = {Li, Yu-Feng and Jain, Prateek},
  journal    = {Machine Learning},
  year       = {2022},
  date       = {2022-10-14},
  doi        = {10.1007/s10994-022-06253-1},
  url        = {https://arxiv.org/abs/2110.03802},
  urldate    = {2022-12-01},
  abstract   = {Active learning allows machine learning models to be trained using fewer labels while retaining similar performance to traditional supervised learning. An active learner selects the most informative data points, requests their labels, and retrains itself. While this approach is promising, it raises the question of how to determine when the model is `good enough' without the additional labels required for traditional evaluation. Previously, different stopping criteria have been proposed aiming to identify the optimal stopping point. Yet, optimality can only be expressed as a domain-dependent trade-off between accuracy and the number of labels, and no criterion is superior in all applications. As a further complication, a comparison of criteria for a particular real-world application would require practitioners to collect additional labelled data they are aiming to avoid by using active learning in the first place. This work enables practitioners to employ active learning by providing actionable recommendations for which stopping criteria are best for a given real-world scenario. We contribute the first large-scale comparison of stopping criteria for pool-based active learning, using a cost measure to quantify the accuracy/label trade-off, public implementations of all stopping criteria we evaluate, and an open-source framework for evaluating stopping criteria. Our research enables practitioners to substantially reduce labeling costs by utilizing the stopping criterion which best suits their domain.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
Dost, Katharina; Pullar-Strecker, Zac; Brydon, Liam; Zhang, Kunyang; Hafner, Jasmin; Riddle, Pat; Wicker, Jörg
Combatting Over-Specialization Bias in Growing Chemical Databases Journal Article Forthcoming
In: Research Square, Forthcoming, (preprint).
Abstract | Links | BibTeX | Altmetric
@article{Dost2022combatting,
  author     = {Dost, Katharina and Pullar-Strecker, Zac and Brydon, Liam and Zhang, Kunyang and Hafner, Jasmin and Riddle, Pat and Wicker, J{\"o}rg},
  title      = {Combatting Over-Specialization Bias in Growing Chemical Databases},
  journal    = {Research Square},
  year       = {2022},
  date       = {2022-10-05},
  doi        = {10.21203/rs.3.rs-2133331/v1},
  abstract   = {Background: Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research towards the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers' experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions on compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed Solution: In this paper, we propose CANCELS (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. CANCELS does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results: An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that CANCELS produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor's performance while reducing the amount of required experiments. Overall, we believe that CANCELS can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.},
  note       = {preprint},
  keywords   = {},
  pubstate   = {forthcoming},
  tppubtype  = {article}
}
Chang, Luke; Dost, Katharina; Zhai, Kaiqi; Demontis, Ambra; Roli, Fabio; Dobbie, Gillian; Wicker, Jörg
BAARD: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability Inproceedings Forthcoming
In: Proceedings of the 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Forthcoming.
@inproceedings{chang2023baard,
  author     = {Chang, Luke and Dost, Katharina and Zhai, Kaiqi and Demontis, Ambra and Roli, Fabio and Dobbie, Gillian and Wicker, J{\"o}rg},
  title      = {{BAARD}: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability},
  booktitle  = {Proceedings of the 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
  year       = {2023},
  date       = {2023-05-25},
  urldate    = {2023-05-25},
  keywords   = {},
  pubstate   = {forthcoming},
  tppubtype  = {inproceedings}
}
Chen, Mark; Dost, Katharina; Zhu, Johnny; Dobbie, Gillian; Wicker, Jörg
Targeted Attacks on Time Series Forecasting Inproceedings Forthcoming
In: Proceedings of the 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), Forthcoming.
@inproceedings{chen2023targeted,
  author     = {Chen, Mark and Dost, Katharina and Zhu, Johnny and Dobbie, Gillian and Wicker, J{\"o}rg},
  title      = {Targeted Attacks on Time Series Forecasting},
  booktitle  = {Proceedings of the 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
  year       = {2023},
  date       = {2023-05-25},
  keywords   = {},
  pubstate   = {forthcoming},
  tppubtype  = {inproceedings}
}
Kim, Jonathan; Urschler, Martin; Riddle, Pat; Wicker, Jörg
Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes Inproceedings
In: 2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2022), pp. 4352-4358, 2022.
@inproceedings{kim2022closing,
  author     = {Kim, Jonathan and Urschler, Martin and Riddle, Pat and Wicker, J{\"o}rg},
  title      = {Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes},
  booktitle  = {2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2022)},
  pages      = {4352--4358},
  year       = {2022},
  date       = {2022-10-20},
  doi        = {10.1109/IROS47612.2022.9981542},
  url        = {https://ieeexplore.ieee.org/abstract/document/9981542},
  urldate    = {2022-10-20},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
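Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness Inproceedings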
In: Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics, pp. 1-7, Association for Computing Machinery, New York, NY, USA, 2022, ISBN: 9781450393867.
Abstract | Links | BibTeX | Altmetric
@inproceedings{poonawala-lohani2022geographic,
  author     = {Poonawala-Lohani, Nooriyan and Riddle, Pat and Adnan, Mehnaz and Wicker, J{\"o}rg},
  title      = {Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness},
  booktitle  = {Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics},
  pages      = {1--7},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  year       = {2022},
  date       = {2022-08-07},
  doi        = {10.1145/3535508.3545562},
  isbn       = {9781450393867},
  urldate    = {2022-08-07},
  abstract   = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Flu surveillance in New Zealand tracks case counts from various District health boards (DHBs) in the country to monitor the spread of influenza in different geographic locations. Many factors contribute to the spread of the influenza across a geographic region, and it can be challenging to forecast cases in one region without taking into account case numbers in another region. This paper proposes a novel ensemble method called Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains (GEO-Reach). GEO-Reach is an ensemble technique that uses a two layer approach to utilise interdependence of historical case counts between geographic regions in New Zealand. This work extends a previously published method by the authors called Randomized Ensembles of Auto-regression chains (Reach). State-of-the-art forecasting models look at studying the spread of the virus. They focus on accurate forecasting of cases for a location using historical case counts for the same location and other data sources based on human behaviour such as movement of people across cities/geographic regions. This new approach is evaluated using Influenza like illness (ILI) case counts in 7 major regions in New Zealand from the years 2015-2019 and compares its performance with other standard methods such as Dante, ARIMA, Autoregression and Random Forests. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Semi-Supervised Conditional Density Estimation with Wasserstein Laplacian Regularisation Inproceedings
In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 6746-6754, 2022.
Abstract | Links | BibTeX | Altmetric
@inproceedings{graffeuille2022semi,
  author     = {Graffeuille, Olivier and Koh, Yun Sing and Wicker, J{\"o}rg and Lehmann, Moritz},
  title      = {Semi-Supervised Conditional Density Estimation with {Wasserstein} {Laplacian} Regularisation},
  booktitle  = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume     = {36},
  number     = {6},
  pages      = {6746--6754},
  year       = {2022},
  date       = {2022-06-28},
  doi        = {10.1609/aaai.v36i6.20630},
  url        = {https://ojs.aaai.org/index.php/AAAI/article/view/20630},
  urldate    = {2022-02-22},
  abstract   = {Conditional Density Estimation (CDE) has wide-reaching applicability to various real-world problems, such as spatial density estimation and environmental modelling. CDE estimates the probability density of a random variable rather than a single value and can thus model uncertainty and inverse problems. This task is inherently more complex than regression, and many algorithms suffer from overfitting, particularly when modelled with few labelled data points. For applications where unlabelled data is abundant but labelled data is scarce, we propose Wasserstein Laplacian Regularisation, a semi-supervised learning framework that allows CDE algorithms to leverage these unlabelled data. The framework minimises an objective function which ensures that the learned model is smooth along the manifold of the underlying data, as measured by Wasserstein distance. When applying our framework to Mixture Density Networks, the resulting semi-supervised algorithm can achieve similar performance to a supervised model with up to three times as many labelled data points on baseline datasets. We additionally apply our technique to the problem of remote sensing for chlorophyll-a estimation in inland waters.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
Dost, Katharina; Duncanson, Hamish; Ziogas, Ioannis; Riddle, Pat; Wicker, Jörg
Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias Inproceedings
In: 26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022), pp. 149–160, Springer-Verlag, Berlin, Heidelberg, 2022, ISBN: 978-3-031-05935-3.
Abstract | Links | BibTeX | Altmetric
@inproceedings{dost2022divide,
  author     = {Dost, Katharina and Duncanson, Hamish and Ziogas, Ioannis and Riddle, Pat and Wicker, J{\"o}rg},
  title      = {Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias},
  booktitle  = {26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022)},
  pages      = {149--160},
  publisher  = {Springer-Verlag},
  address    = {Berlin, Heidelberg},
  year       = {2022},
  date       = {2022-05-16},
  doi        = {10.1007/978-3-031-05936-0_12},
  isbn       = {978-3-031-05935-3},
  urldate    = {2022-05-16},
  abstract   = {Machine Learning can help overcome human biases in decision making by focusing on purely logical conclusions based on the training data. If the training data is biased, however, that bias will be transferred to the model and remains undetected as the performance is validated on a test set drawn from the same biased distribution. Existing strategies for selection bias identification and mitigation generally rely on some sort of knowledge of the bias or the ground-truth. An exception is the Imitate algorithm that assumes no knowledge but comes with a strong limitation: It can only model datasets with one normally distributed cluster per class. In this paper, we introduce a novel algorithm, Mimic, which uses Imitate as a building block but relaxes this limitation. By allowing mixtures of multivariate Gaussians, our technique is able to model multi-cluster datasets and provide solutions for a substantially wider set of problems. Experiments confirm that Mimic not only identifies potential biases in multi-cluster datasets which can be corrected early on but also improves classifier performance.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method Inproceedings
In: Altman, Russ; Dunker, Keith; Hunter, Lawrence; Ritchie, Marylyn; Murray, Tiffany; Klein, Teri (Ed.): Pacific Symposium on Biocomputing, pp. 301-312, 2022.
Abstract | Links | BibTeX | Altmetric
@inproceedings{poonawala-lohani2022novel,
  author     = {Poonawala-Lohani, Nooriyan and Riddle, Pat and Adnan, Mehnaz and Wicker, J{\"o}rg},
  title      = {A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method},
  editor     = {Altman, Russ and Dunker, Keith and Hunter, Lawrence and Ritchie, Marylyn and Murray, Tiffany and Klein, Teri},
  booktitle  = {Pacific Symposium on Biocomputing},
  volume     = {27},
  pages      = {301--312},
  year       = {2022},
  date       = {2022-01-03},
  doi        = {10.1142/9789811250477_0028},
  url        = {http://psb.stanford.edu/psb-online/proceedings/psb22/poorawala-lohani.pdf},
  urldate    = {2022-01-03},
  abstract   = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Due to its huge threat to the community, accurate forecasting of Influenza-like-illness (ILI) can diminish the impact of an influenza season by enabling early public health interventions. Current forecasting models are limited in their performance, particularly when using a longer forecasting window. To support better forecasts over a longer forecasting window, we propose to use additional features such as weather data. Commonly used methods to fore-cast ILI, including statistical methods such as ARIMA, limit prediction performance when using additional data sources that might have complex non-linear associations with ILI incidence. This paper proposes a novel time series forecasting method, Randomized Ensembles of Auto-regression chains (Reach). Reach implements an ensemble of random chains for multi-step time series forecasting. This new approach is evaluated on ILI case counts in Auckland, New Zealand from the years 2015-2018 and compared to other standard methods. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}