@inproceedings{17871,
  author    = {Max Hort and Leon Moonen},
  title     = {On a Sustainable Training of Large Language Models for Source Code},
  booktitle = {International Conference on Information and Communications Technology for Sustainability (ICT4S)},
  year      = {2024},
  abstract  = {Large language models (LLMs) have gained widespread attention and user adoption. These models, when trained on source code from platforms like GitHub, acquire a deep understanding of both the semantic and syntactic structures of code (i.e., code language models or CLMs). While CLMs offer tremendous assistance in software engineering tasks, their massive data requirements result in substantial energy consumption and CO2 emissions. In this work, we aim to find solutions to help reduce the environmental impact of training CLMs. Rather than following the conventional wisdom that ``more data is better'', we advocate for a refined approach to data in the training of CLMs. We propose that by intentionally decreasing training data volume while simultaneously enhancing data quality through data refinement techniques, we can reduce energy consumption while maintaining or even improving performance on software engineering tasks.},
  keywords  = {sustainability, language model, data refinement, machine learning},
}