@article{10.22454/FamMed.2026.142029,
  author   = {Rowland, Kate and Wang, Ling and Bash, Kirstie and DeShetler, Lori and Vick, Sarah and Nguyen, Emma and Rogers-Johnson, Michelle and Anderson, Lauren and Sweet, Michelle and Fasula, Kimberly and Carter, Stefanie},
  title    = {Validation of the Use of a Large Language Model for Detecting Sentiment in Student Course Evaluation},
  journal  = {Family Medicine},
  volume   = {58},
  number   = {2},
  year     = {2026},
  month    = feb,
  pages    = {132--137},
  doi      = {10.22454/FamMed.2026.142029},
  abstract = {Background and Objectives: The use of large language models and natural language processing (NLP) in medical education has expanded rapidly in recent years. Because of the documented risks of bias and errors, these artificial intelligence (AI) tools must be validated before being used for research or education. Traditional and novel conceptual frameworks can be used. This study aimed to validate the application of an NLP method, bidirectional encoder representations from transformers (BERT) model, to identify the presence and patterns of sentiment in end-of-course evaluations from M3 (medical school year 3) core clerkships at multiple institutions. Methods: We used the Patino framework, designed for the use of artificial intelligence in health professions education, as a guide for validating the NLP. Written comments from de-identified course evaluations at four schools were coded by teams of two human coders, and human-human interrater reliability statistics were calculated. Humans identified key terms to train the BERT model. The trained BERT model predicted the sentiments of a set of comments, and human-NLP interrater reliability statistics were calculated. Results: A total of 364 discrete comments were evaluated in the human phase. The range of positive (30.6\%--61.0\%), negative (4.9\%--39.5\%), neutral (9.8\%--19.0\%), and mixed (1.7\%--27.5\%) sentiments varied by school. Human-human and human-AI interrater reliability also varied by school. Human-human and human-AI reliability were comparable. Conclusions: Several conceptual frameworks offer models for validation of AI tools in health professions education. A BERT model, with training, can detect sentiment in medical student course evaluations with an interrater reliability similar to human coders.},
  url      = {https://journals.stfm.org//familymedicine/2026/february/rowland-0237/},
  eprint   = {https://journals.stfm.org//media/5ofjgiuw/fammed-58-132.pdf},
}