bank-fraud-baf-lakehouse/references.bib

% Web page: GitHub view of the datasheet PDF for the Bank Account Fraud dataset suite.
% The URL lives in `url`; `howpublished` only names the medium.
@misc{_bankaccountfraud_,
  title = {{bank-account-fraud/documents/datasheet.pdf} at main {$\cdot$} {feedzai/bank-account-fraud}},
  howpublished = {GitHub},
  url = {https://github.com/feedzai/bank-account-fraud/blob/main/documents/datasheet.pdf},
  urldate = {2026-02-11},
  abstract = {Supporting documentation for the paper ``Turning the Tables: Biased, Imbalanced, Dynamic Tabular Datasets for ML Evaluation'', and the Bank Account Fraud suite of datasets.  - feedzai/bank-...},
  langid = {english},
  file = {/home/rkw/Zotero/storage/LT4CJB34/datasheet.html}
}

% Journal article on classification under class imbalance.
% Journal name is stored in full; let the bibliography style abbreviate it if required.
@article{ali2013classification,
  title = {Classification with Class Imbalance Problem},
  author = {Ali, Aida and Shamsuddin, Siti Mariyam and Ralescu, Anca L.},
  year = 2013,
  journal = {International Journal of Advances in Soft Computing and its Applications},
  volume = {5},
  number = {3},
  pages = {176--204},
  keywords = {class imbalance,unbalanced classes},
  file = {/home/rkw/Zotero/storage/3AVBB4SQ/Ali et al. - 2013 - Classification with class imbalance problem.pdf}
}

% arXiv preprint: FraudTransformer, a time-aware GPT-style sequence model for payment fraud.
% Only the model name and the acronym are brace-protected in the title so styles can recase the rest.
% primaryclass is inferred from the keywords (Computer Science - Machine Learning); confirm against the arXiv listing.
@misc{aminian_fraudtransformer_2025,
  title = {{FraudTransformer}: Time-Aware {GPT} for Transaction Fraud Detection},
  shorttitle = {{FraudTransformer}},
  author = {Aminian, Gholamali and Elliott, Andrew and Li, Tiger and Wong, Timothy Cheuk Hin and Dehon, Victor Claude and Szpruch, Lukasz and Maple, Carsten and Read, Christopher and Brown, Martin and Reinert, Gesine and Mamouei, Mo},
  year = 2025,
  month = oct,
  number = {arXiv:2509.23712},
  eprint = {2509.23712},
  primaryclass = {cs.LG},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2509.23712},
  urldate = {2026-02-21},
  abstract = {Detecting payment fraud in real-world banking streams requires models that can exploit both the order of events and the irregular time gaps between them. We introduce FraudTransformer, a sequence model that augments a vanilla GPT-style architecture with (i) a dedicated time encoder that embeds either absolute timestamps or inter-event values, and (ii) a learned positional encoder that preserves relative order. Experiments on a large industrial dataset -- tens of millions of transactions and auxiliary events -- show that FraudTransformer surpasses four strong classical baselines (Logistic Regression, XGBoost and LightGBM) as well as transformer ablations that omit either the time or positional component. On the held-out test set it delivers the highest AUROC and PRAUC.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning},
  file = {/home/rkw/Zotero/storage/YQQE72ZK/Aminian et al. - 2025 - FraudTransformer Time-Aware GPT for Transaction Fraud Detection.pdf;/home/rkw/Zotero/storage/XVL3X42S/2509.html}
}

% Journal article: open challenges in learning from imbalanced data.
% Single-author paper; the author is listed once, in "Last, First" form.
% The citation key is kept as exported so existing citations keep resolving.
@article{bartoszkrawczyk_learning_2016,
  title = {Learning from Imbalanced Data: Open Challenges and Future Directions},
  author = {Krawczyk, Bartosz},
  year = 2016,
  month = apr,
  journal = {Progress in Artificial Intelligence},
  volume = {5},
  number = {4},
  pages = {221--232},
  doi = {10.1007/s13748-016-0094-0},
  abstract = {Despite more than two decades of continuous development learning from imbalanced data is still a focus of intense research. Starting as a problem of skewed distributions of binary tasks, this topic evolved way beyond this conception. With the expansion of machine learning and data mining, combined with the arrival of big data era, we have gained a deeper insight into the nature of imbalanced learning, while at the same time facing new emerging challenges. Data-level and algorithm-level methods are constantly being improved and hybrid approaches gain increasing popularity. Recent trends focus on analyzing not only the disproportion between classes, but also other difficulties embedded in the nature of data. New real-life problems motivate researchers to focus on computationally efficient, adaptive and real-time methods. This paper aims at discussing open issues and challenges that need to be addressed to further develop the field of imbalanced learning. Seven vital areas of research in this topic are identified, covering the full spectrum of learning from imbalanced data: classification, regression, clustering, data streams, big data analytics and applications, e.g., in social media and computer vision. This paper provides a discussion and suggestions concerning lines of future research for each of them.},
  keywords = {imbalanced data,unbalanced classes},
  annotation = {MAG ID: 2338318698},
  file = {/home/rkw/Zotero/storage/ZFYYHYYR/Bartosz Krawczyk and Krawczyk - 2016 - Learning from imbalanced data open challenges and future directions.pdf}
}

% Journal article (IEEE Access): LightGBM-based e-wallet fraud detection with reduced false alarms.
% Only the product name LightGBM is brace-protected in the title.
@article{iscan_walletbased_2023,
  title = {Wallet-Based Transaction Fraud Prevention Through {LightGBM} With the Focus on Minimizing False Alarms},
  author = {Iscan, Can and Kumas, Osman and Akbulut, Fatma Patlar and Akbulut, Akhan},
  year = 2023,
  journal = {IEEE Access},
  volume = {11},
  pages = {131465--131474},
  issn = {2169-3536},
  doi = {10.1109/ACCESS.2023.3321666},
  urldate = {2026-02-21},
  abstract = {E-wallets' rising popularity can be attributed to the fact that they facilitate a wide variety of financial activities such as payments, transfers, investments, etc., and eliminate the need for actual cash or cards. The confidentiality, availability, and integrity of a user's financial information stored in an electronic wallet can be compromised by threats such as phishing, malware, and social engineering; therefore, fintech platforms employ intelligent fraud detection mechanisms to mitigate the problem. The purpose of this study is to detect fraudulent activity using cutting-edge machine learning techniques on data obtained from the leading e-wallet platform in Turkey. After a comprehensive analysis of the dataset's features via feature engineering procedures, we found that the LightGBM approach had the highest detection accuracy of fraudulent activity with 97\% in the experiments conducted. An additional key objective of reducing false alerts was accomplished, as the number of false alarms went from 13,024 to 6,249. This approach resulted in the establishment of a machine-learning model suitable for use by relatively small fraud detection teams.},
  keywords = {E-wallet,Feature extraction,fintech,Fraud,fraud detection,LightGBM,Machine learning,Machine learning algorithms,Monitoring,Online banking,Real-time systems},
  file = {/home/rkw/Zotero/storage/B2K3D8W9/Iscan et al. - 2023 - Wallet-Based Transaction Fraud Prevention Through LightGBM With the Focus on Minimizing False Alarms.pdf}
}

% Datasheet document for the BAF dataset suite (standalone PDF, not a journal article).
% Typed as misc because no journal or year is known for it.
% NOTE(review): the URL assumes this PDF is the datasheet hosted in the feedzai/bank-account-fraud repository; confirm.
@misc{jesus_baf_,
  title = {{BAF} Dataset Suite Datasheet},
  author = {Jesus, S{\'e}rgio and Pombal, Jos{\'e} and Alves, Duarte and Cruz, Andr{\'e} F. and Saleiro, Pedro and Ribeiro, Rita P. and Gama, Jo{\~a}o and Bizarro, Pedro},
  url = {https://github.com/feedzai/bank-account-fraud/blob/main/documents/datasheet.pdf},
  langid = {english},
  file = {/home/rkw/Zotero/storage/6A29JS3R/Jesus et al. - BAF Dataset Suite Datasheet.pdf}
}

% arXiv preprint introducing the Bank Account Fraud (BAF) suite of tabular datasets.
% Only the acronym ML is brace-protected in the title so styles can recase the rest.
% primaryclass is inferred from the keywords (Computer Science - Machine Learning); confirm against the arXiv listing.
@misc{jesus_turning_2022,
  title = {Turning the Tables: Biased, Imbalanced, Dynamic Tabular Datasets for {ML} Evaluation},
  shorttitle = {Turning the Tables},
  author = {Jesus, S{\'e}rgio and Pombal, Jos{\'e} and Alves, Duarte and Cruz, Andr{\'e} and Saleiro, Pedro and Ribeiro, Rita P. and Gama, Jo{\~a}o and Bizarro, Pedro},
  year = 2022,
  month = nov,
  number = {arXiv:2211.13358},
  eprint = {2211.13358},
  primaryclass = {cs.LG},
  publisher = {arXiv},
  doi = {10.48550/arXiv.2211.13358},
  urldate = {2026-02-11},
  abstract = {Evaluating new techniques on realistic datasets plays a crucial role in the development of ML research and its broader adoption by practitioners. In recent years, there has been a significant increase of publicly available unstructured data resources for computer vision and NLP tasks. However, tabular data -- which is prevalent in many high-stakes domains -- has been lagging behind. To bridge this gap, we present Bank Account Fraud (BAF), the first publicly available privacy-preserving, large-scale, realistic suite of tabular datasets. The suite was generated by applying state-of-the-art tabular data generation techniques on an anonymized, real-world bank account opening fraud detection dataset. This setting carries a set of challenges that are commonplace in real-world applications, including temporal dynamics and significant class imbalance. Additionally, to allow practitioners to stress test both performance and fairness of ML methods, each dataset variant of BAF contains specific types of data bias. With this resource, we aim to provide the research community with a more realistic, complete, and robust test bed to evaluate novel and existing methods.},
  archiveprefix = {arXiv},
  keywords = {Computer Science - Machine Learning},
  file = {/home/rkw/Zotero/storage/FSBNDIP4/Jesus et al. - 2022 - Turning the Tables Biased, Imbalanced, Dynamic Tabular Datasets for ML Evaluation.pdf;/home/rkw/Zotero/storage/7HXKQPDC/2211.html}
}

% Conference paper: deep learning with data sampling on class-imbalanced Medicare fraud data.
% Typed as inproceedings because the DOI prefix (10.1109/iri.2019) is a conference proceedings, not a journal.
% NOTE(review): booktitle was supplied from the DOI's venue; confirm the exact wording against the publisher record.
@inproceedings{johnson_deep_2019,
  title = {Deep Learning and Data Sampling with Imbalanced Big Data},
  booktitle = {2019 {IEEE} 20th International Conference on Information Reuse and Integration for Data Science ({IRI})},
  author = {Johnson, Justin M. and Khoshgoftaar, Taghi M.},
  year = 2019,
  month = jul,
  pages = {175--183},
  doi = {10.1109/iri.2019.00038},
  abstract = {This study evaluates the use of deep learning and data sampling on a class-imbalanced Big Data problem, i.e. Medicare fraud detection. Medicare offers affordable health insurance to the elderly population and serves more than 15\% of the United States population. To increase transparency and help reduce fraud, the Centers for Medicare and Medicaid Services (CMS) have made several data sets publicly available for analysis. Our research group has conducted several studies using CMS data and traditional machine learning algorithms (non-deep learning), but challenges associated with severe class imbalance leave room for improvement. These previous studies serve as baselines as we employ deep neural networks with various data-sampling techniques to determine the efficacy of deep learning in addressing class imbalance. Random over-sampling (ROS), random under-sampling (RUS), and combinations of the two (ROS-RUS) are applied to study how varying levels of class imbalance impact model training and performance. Classwise performance is maximized by identifying optimal decision thresholds, and a strong linear relationship between minority class size and optimal threshold is observed. Results show that ROS significantly outperforms RUS, combining RUS and ROS both maximizes performance and efficiency with a 4 x speedup in training time, and the default threshold of 0.5 is never optimal when training data is imbalanced. To the best of our knowledge, this is the first study to provide statistical results comparing ROS, RUS, and ROS-RUS deep learning methods across a range of class distributions. Additional contributions include a unique analysis of thresholding as it relates to the minority class size and state-of-the-art performance on the given fraud detection task.},
  keywords = {LEIE},
  annotation = {MAG ID: 2974916584},
  file = {/home/rkw/Zotero/storage/ZQR6NJPU/Johnson and Khoshgoftaar - 2019 - Deep Learning and Data Sampling with Imbalanced Bi.pdf}
}

% Journal article (survey): imbalanced-data challenges in machine learning.
% Journal name is stored in full; the title is left unprotected so styles can recase it.
@article{kaur_systematic_2019,
  title = {A Systematic Review on Imbalanced Data Challenges in Machine Learning: Applications and Solutions},
  shorttitle = {A Systematic Review on Imbalanced Data Challenges in Machine Learning},
  author = {Kaur, Harsurinder and Pannu, Husanbir Singh and Malhi, Avleen Kaur},
  year = 2019,
  month = aug,
  journal = {ACM Computing Surveys},
  volume = {52},
  number = {4},
  pages = {79:1--79:36},
  issn = {0360-0300},
  doi = {10.1145/3343440},
  urldate = {2026-02-11},
  abstract = {In machine learning, the data imbalance imposes challenges to perform data analytics in almost all areas of real-world research. The raw primary data often suffers from the skewed perspective of data distribution of one class over the other as in the case of computer vision, information security, marketing, and medical science. The goal of this article is to present a comparative analysis of the approaches from the reference of data pre-processing, algorithmic and hybrid paradigms for contemporary imbalance data analysis techniques, and their comparative study in lieu of different data distribution and their application areas.},
  file = {/home/rkw/Zotero/storage/4WZYQG9W/Kaur et al. - 2019 - A Systematic Review on Imbalanced Data Challenges in Machine Learning Applications and Solutions.pdf}
}

% Conference paper (ISBDAS 2025): LightGBM with clustering-based undersampling for telecom fraud.
% Only LightGBM and the venue acronym are brace-protected.
@inproceedings{zhang_leveraging_2025,
  title = {Leveraging {LightGBM} for High-Accuracy Telecom Fraud Detection with Clustering-Based Undersampling},
  booktitle = {2025 8th International Symposium on Big Data and Applied Statistics ({ISBDAS})},
  author = {Zhang, Shuo and Zhang, Bo and Hou, Shichong and Fu, Zhiyuan},
  year = 2025,
  month = feb,
  pages = {384--388},
  doi = {10.1109/ISBDAS64762.2025.11117117},
  urldate = {2026-02-21},
  abstract = {This study presents a machine learning framework designed to predict and prevent telecom fraud by analyzing 1 million transaction records. The model identifies critical fraud patterns through the analysis of key features, including PIN usage, transaction frequency, and location. To address the severe class imbalance in the dataset (initially a 10:1 ratio), a clustering-based undersampling technique was employed, balancing the dataset to a 1:1 ratio while preserving data integrity and improving model performance. The framework utilizes LightGBM, optimized through Bayesian hyperparameter tuning and five-fold cross-validation, achieving an accuracy of 98\% and a robust AUC of 0.9. Key findings highlight that transactions involving both bank cards and PIN verification exhibit a drastically reduced fraud risk (0.0001 ratio), while cardless or PIN-less transactions are significantly more susceptible to fraud. The study emphasizes the importance of multi-factor authentication and provides actionable insights for financial institutions to mitigate fraud risks. Additionally, it underscores the transformative potential of machine learning in real-time fraud detection, with future opportunities for integrating emerging technologies like blockchain to further enhance security.},
  keywords = {Accuracy,Bayes methods,Bayesian optimization,Clustering-Based Undersampling,Fraud,fraud prediction,LightGBM model,machine learning,Machine learning,Pins,Predictive models,Real-time systems,Security,Telecommunications,Tuning},
  file = {/home/rkw/Zotero/storage/8XIZ5CCT/11117117.html}
}

% Journal article (IEEE Access): cost-harmonized LightGBM variants for extremely imbalanced credit card fraud data.
% Only the product name LightGBM is brace-protected in the title.
@article{zhao_improved_2024,
  title = {Improved {LightGBM} for Extremely Imbalanced Data and Application to Credit Card Fraud Detection},
  author = {Zhao, Xiaosong and Liu, Yong and Zhao, Qiangfu},
  year = 2024,
  journal = {IEEE Access},
  volume = {12},
  pages = {159316--159335},
  issn = {2169-3536},
  doi = {10.1109/ACCESS.2024.3487212},
  urldate = {2026-02-21},
  abstract = {Credit card fraud (CCF) is a significant threat to cardholders and financial institutions. CCF detection against this threat is challenging due to extremely imbalanced data (EID). EID involves extremely few instances of fraud for training and an extremely high risk of overlooking fraud. While class balancing or oversampling techniques can address the former problem by punishing negative classes or augmenting the positive data, they do not mitigate the latter. In contrast, the cost-sensitive learning approach targets only the high risk of false negative errors. Therefore, existing approaches are insufficient to solve all the issues of the EID problem. Based on the LightGBM (Light Gradient Boosting Machine) framework, this study introduces two novel machine-learning methods: the class balancing cost-harmonization LightGBM (CB-CHL-LightGBM) and the oversampling cost-harmonization LightGBM (OS-CHL-LightGBM). The new approaches combine class balancing or oversampling technology with LightGBM to solve the EID problem comprehensively. They enhance the efficacy of LightGBM in CCF detection scenarios. Experimental results on three CCF datasets indicate that the two proposed methods outperform LightGBM in several crucial performance metrics. For example, compared with the original LightGBM, CB-CHL-LightGBM or OS-CHL-LightGBM can increase the F2-score from 0.77 to 0.83 for the first dataset, from 0.77 to 0.86 for the second dataset, and from 0.70 to 0.82 for the third dataset. However, adding class balancing, oversampling, and cost-harmonization loss separately to LightGBM may not obtain better results.},
  keywords = {Accuracy,Boosting,Class balancing cost-harmonization LightGBM,Classification algorithms,cost-sensitive,Costs,credit card fraud detection,Credit cards,Data models,extremely imbalanced data,Fraud,interpretability,Loss measurement,oversampling,Synthetic data,Training},
  file = {/home/rkw/Zotero/storage/KI2Y7NIA/Zhao et al. - 2024 - Improved LightGBM for Extremely Imbalanced Data and Application to Credit Card Fraud Detection.pdf}
}