@article{10.22454/FamMed.2025.954255,
  author   = {Thomas, Kirstyn and Szalacha, Laura and Hanna, Karim and Anibal, James and Petrilli, John},
  title    = {Evaluating the Effectiveness of ChatGPT Versus Human Proctors in Grading Medical Students’ Post-OSCE Notes},
  journal  = {Family Medicine},
  volume   = {57},
  number   = {10},
  year     = {2025},
  month    = {11},
  pages    = {727-731},
  doi      = {10.22454/FamMed.2025.954255},
  abstract = {Background and Objectives: Artificial intelligence (AI) tools have potential utility in multiple domains, including medical education. However, educators have yet to evaluate AI’s assessment of medical students’ clinical reasoning as evidenced in note-writing. This study compares ChatGPT with a human proctor’s grading of medical students’ notes. Methods: A total of 127 subjective, objective, assessment, and plan notes, derived from an objective structured clinical examination, were previously graded by a physician proctor across four categories: history, physical exam, differential diagnosis/thought process, and treatment plan. ChatGPT-4, using the same rubric, was tasked with evaluating these 127 notes. We compared AI-generated scores with proctors’ scores using t tests and χ2 analysis. Results: The grades assigned by ChatGPT were significantly different than those assigned by proctors in history (P<.001), differential diagnosis/thought process (P<.001), and treatment plan (P<.001). Cohen’s d was the largest for treatment plan at 1.25. The differences led to a significant difference in students’ mean cumulative grade (proctor 23.13 [SD=2.84], ChatGPT 24.11 [SD=1.27], P<.001), affecting final grade distribution (P<.001). With proctor grading, 81 of the 127 (63.8%) notes were honors and 46 of the 127 (36.2%) were pass. ChatGPT gave significantly more honors (118/127 [92.9%]) than pass (9/127 [7.1%]). Conclusions: When compared to a human proctor, ChatGPT-4 assigned statistically different grades to students’ SOAP notes, although the practical difference was small. The most substantial grading discrepancy occurred in the treatment plan. Despite the slight numerical difference, ChatGPT assigned significantly more honors grades. Medical educators should therefore investigate a large language model’s performance characteristics in their local grading framework before using AI to augment grading of summative, written assessments.},
  URL      = {https://journals.stfm.org//familymedicine/2025/november-december/thomas-0068/},
  eprint   = {https://journals.stfm.org//media/k2afi1ha/fammed-57-727.pdf},
}