<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIRx Med</journal-id><journal-id journal-id-type="publisher-id">xmed</journal-id><journal-id journal-id-type="index">34</journal-id><journal-title>JMIRx Med</journal-title><abbrev-journal-title>JMIRx Med</abbrev-journal-title><issn pub-type="epub">2563-6316</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e74899</article-id><article-id pub-id-type="doi">10.2196/74899</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Assessing the Limitations of Large Language Models in Clinical Practice Guideline&#x2013;Concordant Treatment Decision-Making on Real-World Data: Retrospective Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Roeschl</surname><given-names>Tobias</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Hoffmann</surname><given-names>Marie</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref 
ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hashemi</surname><given-names>Djawid</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rarreck</surname><given-names>Felix</given-names></name><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hinrichs</surname><given-names>Nils</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Trippel</surname><given-names>Tobias Daniel</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gr&#x00F6;schel</surname><given-names>Matthias I</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Unbehaun</surname><given-names>Axel</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Klein</surname><given-names>Christoph</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kempfert</surname><given-names>J&#x00F6;rg</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dreger</surname><given-names>Henryk</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Brien</surname><given-names>Benjamin</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hindricks</surname><given-names>Gerhard</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Balzer</surname><given-names>Felix</given-names></name><degrees>MD, PhD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Falk</surname><given-names>Volkmar</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff10">10</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meyer</surname><given-names>Alexander</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff11">11</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Cardiology, Angiology and 
Intensive Care Medicine, Deutsches Herzzentrum der Charit&#x00E9;</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, corporate member of Freie Universit&#x00E4;t Berlin and Humboldt-Universit&#x00E4;t zu Berlin</institution><addr-line>Charit&#x00E9;platz 1</addr-line><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff3"><institution>Berlin Institute of Health at Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, BIH Biomedical Innovation Academy, BIH Charit&#x00E9; Digital Clinician Scientist Program</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff4"><institution>DZHK (German Centre for Cardiovascular Research), partner site Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff5"><institution>Department of Cardiothoracic and Vascular Surgery, Deutsches Herzzentrum der Charit&#x00E9; (DHZC)</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff6"><institution>Department of Infectious Diseases and Respiratory Medicine, Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff7"><institution>Department of Cardiac Anesthesiology and Intensive Care Medicine, Deutsches Herzzentrum der Charit&#x00E9; (DHZC)</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff8"><institution>Department of Perioperative Medicine, St Bartholomew&#x2019;s Hospital and Barts Heart Centre</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><aff id="aff9"><institution>Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, Institute of Medical Informatics</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff10"><institution>Department of Health Sciences and 
Technology, Translational Cardiovascular Technologies, Institute of Translational Medicine, Swiss Federal Institute of Technology</institution><addr-line>Z&#x00FC;rich</addr-line><country>Switzerland</country></aff><aff id="aff11"><institution>Berlin Institute for the Foundations of Learning and Data &#x2013; TU Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Grover</surname><given-names>Abhinav</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Novak</surname><given-names>Andrej</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Singh</surname><given-names>Reenu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Marie Hoffmann, PhD, Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, corporate member of Freie Universit&#x00E4;t Berlin and Humboldt-Universit&#x00E4;t zu Berlin, Charit&#x00E9;platz 1, Berlin, 10117, Germany, 49 17632864219; <email>marie.hoffmann3@dhzc-charite.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>3</day><month>11</month><year>2025</year></pub-date><volume>6</volume><elocation-id>e74899</elocation-id><history><date date-type="received"><day>26</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>19</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Tobias Roeschl, Marie Hoffmann, Djawid Hashemi, Felix Rarreck, Nils Hinrichs, Tobias Daniel Trippel, Matthias I Gr&#x00F6;schel, Axel Unbehaun, Christoph Klein, J&#x00F6;rg Kempfert, Henryk Dreger, 
Benjamin O'Brien, Gerhard Hindricks, Felix Balzer, Volkmar Falk, Alexander Meyer. Originally published in JMIRx Med (<ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org">https://med.jmirx.org</ext-link>), 3.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIRx Med, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org/">https://med.jmirx.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://xmed.jmir.org/2025/1/e74899"/><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/preprints.74899" xlink:title="Preprint (JMIR Preprints)" xlink:type="simple">http://preprints.jmir.org/preprint/74899</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/84175" xlink:title="Peer-Review Report by Reenu Singh (Reviewer K)" xlink:type="simple">https://med.jmirx.org/2025/1/e84175</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/84174" xlink:title="Peer-Review Report by Andrej Novak (Reviewer BI)" xlink:type="simple">https://med.jmirx.org/2025/1/e84174</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/84173" xlink:title="Authors' Response to Peer-Review Reports" 
xlink:type="simple">https://med.jmirx.org/2025/1/e84173</related-article><abstract><sec><title>Background</title><p>Studies have shown that large language models (LLMs) are promising in therapeutic decision-making, with findings comparable to those of medical experts, but these studies used highly curated patient data.</p></sec><sec><title>Objective</title><p>This study aimed to determine if LLMs can make guideline-concordant treatment decisions based on patient data as typically present in clinical practice (lengthy, unstructured medical text).</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a retrospective study of 80 patients with severe aortic stenosis who were scheduled for either surgical (SAVR; n=24) or transcatheter aortic valve replacement (TAVR; n=56) by our institutional heart team in 2022. Various LLMs (BioGPT, GPT-3.5, GPT-4, GPT-4 Turbo, GPT-4o, LLaMA-2, Mistral, PaLM 2, and DeepSeek-R1) were queried using either anonymized original medical reports or manually generated case summaries to determine the most guideline-concordant treatment. We measured agreement with the heart team using Cohen &#x03BA; coefficients, reliability using intraclass correlation coefficients (ICCs), and fairness using the frequency bias index (FBI; FBI &#x003E;1 indicated bias toward TAVR).</p></sec><sec sec-type="results"><title>Results</title><p>When presented with original medical reports, LLMs showed poor performance (Cohen &#x03BA; coefficient: &#x2212;0.47 to 0.22; ICC: 0.0&#x2010;1.0; FBI: 0.95&#x2010;1.51). The LLMs&#x2019; performance improved substantially when case summaries were used as input and additional guideline knowledge was added to the prompt (Cohen &#x03BA; coefficient: &#x2212;0.02 to 0.63; ICC: 0.01&#x2010;1.0; FBI: 0.46&#x2010;1.23). 
Qualitative analysis revealed instances of hallucinations in all LLMs tested.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Even advanced LLMs require extensively curated input for informed treatment decisions. Unreliable responses, bias, and hallucinations pose significant health risks and highlight the need for caution in applying LLMs to real-world clinical decision-making.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>foundation models</kwd><kwd>reasoning models</kwd><kwd>treatment decision-making</kwd><kwd>aortic stenosis</kwd><kwd>clinical practice guidelines</kwd><kwd>medical data processing</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) have recently demonstrated their impressive capabilities in medicine, exemplified by passing medical board exams [<xref ref-type="bibr" rid="ref1">1</xref>], making correct diagnoses in complex clinical cases [<xref ref-type="bibr" rid="ref2">2</xref>], and excelling in physician-patient communication [<xref ref-type="bibr" rid="ref3">3</xref>]. Most recently, the use of LLMs in therapeutic decision-making has been trialed. Several studies have shown that LLMs can make treatment decisions for patients with oncological and cardiovascular diseases that are in substantial agreement with the respective treatment decisions made by clinical experts on tumor boards [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] and heart teams (HTs) [<xref ref-type="bibr" rid="ref8">8</xref>]. 
However, a common feature of these studies was that the LLMs did not make treatment decisions based on real-world patient data in their original format (eg, discharge letters, imaging reports, etc) but rather made decisions based on preprocessed data.</p><p>In clinical practice, relevant patient data, such as patient characteristics, comorbidities, tumor stages, and imaging results, are typically available in free-text format, either as medical text reports or as text entries in the electronic health record, a format that is likely to persist in the near future. In the aforementioned studies, however, decision-relevant patient data were extracted from the original medical reports by the investigators in a preprocessing step before being provided to the LLMs as input in a concise and high-quality form. It therefore remains unknown to what extent LLMs can make treatment decisions based on the original medical data, a scenario that could lead to a significant reduction in physician workload and potentially increase guideline adherence and thus improve patient care.</p><p>In this study, we investigated the impact of data representation, that is, using original medical reports versus case summaries, on the performance of LLMs in therapeutic decision-making.</p><p>As our study population, we selected patients with severe aortic stenosis (AS). This cohort was chosen because the parameters relevant to decision-making are readily quantifiable, the potential for resource optimization is substantial, and the prevalence of the condition is increasing. If left untreated, AS is associated with high morbidity and mortality [<xref ref-type="bibr" rid="ref9">9</xref>]. Treatment modalities for severe AS include surgical aortic valve replacement (SAVR), transcatheter aortic valve replacement (TAVR), and, to a lesser extent, medical therapy. 
The choice of the optimal treatment modality depends on several clinical variables, including patient age, estimated surgical risk, comorbidities, and anatomical factors, as specified in the 2021 European Society of Cardiology (ESC) and European Association for Cardio-Thoracic Surgery (EACTS) Guidelines for the management of valvular heart disease [<xref ref-type="bibr" rid="ref10">10</xref>]. The 2021 ESC/EACTS Guidelines strongly endorse an active, collaborative consultation with a multidisciplinary HT. HTs comprise cardiologists, cardiac surgeons, cardiac imaging specialists, and cardiac anesthesiologists. In HT meetings, these experts review a patient&#x2019;s condition based on patient data laboriously extracted from medical reports before arriving at a treatment decision using a guideline-based approach.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Evaluation Framework</title><p>We presented patient data to an LLM to obtain a treatment decision of either SAVR or TAVR. We assessed the degree of agreement between the treatment decisions provided by the LLM and the treatment decisions provided by the HT. Furthermore, we assessed the decidability, reliability, and fairness of the LLM. Finally, we compared the performance of 9 state-of-the-art LLMs to the performance of a simple non-LLM reference model. In an ablative manner, we studied the effect of using case summaries instead of the original medical reports and adding guideline knowledge to the prompt separately, resulting in 4 distinct experiments (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Experimental design. 
We presented the clinical data of 80 patients with severe aortic stenosis to a large language model (LLM) to receive a treatment decision for either surgical aortic valve replacement (SAVR) or transcatheter aortic valve replacement (TAVR), repeating each query 10 times. To investigate whether injecting guideline knowledge (raw+) into the prompt and/or using case summaries (sum and sum+) instead of the original medical reports (raw) improves LLM performance, we conducted a total of 4 experiments. Case summaries included only decision-relevant patient data and were manually created by physicians. CT: computed tomography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v6i1e74899_fig01.png"/></fig></sec><sec id="s2-2"><title>Study Population</title><p>This study included patients treated at a heart center. Using our hospital information system, we screened all patients with severe degenerative AS who were scheduled for an HT meeting at 1 campus of our center in 2022. We identified 80 patients with sufficiently digitized documentation. As part of a quaternary care center, our institutional HT receives preselected patients scheduled for invasive AS treatment. Therefore, the number of patients recommended for conservative treatment at our institution is negligible. As a result, we decided to limit the possible therapeutic options for this study to SAVR and TAVR, excluding conservative therapy.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study was approved by the research ethics committee of Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin (EA1/146/23). The approval included the collection of data based on implied consent owing to the retrospective and observational nature of the study.</p></sec><sec id="s2-4"><title>Data Collection</title><p>Medical reports were available as PDF files in our hospital information system. 
For each patient, we included the following preprocedural reports: the 2 most recent discharge letters (including letters from external clinics), invasive coronary angiography report, echocardiography report, computed tomography (CT) scan report, and HT report. We manually anonymized these reports prior to analysis.</p><p>HT meeting protocols are standardized documents that contain decision-relevant patient characteristics, such as comorbidities, surgical risk scores, and the final treatment decision of the HT (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). A detailed description of our institutional HT is provided in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-5"><title>LLMs Assessed</title><p>The study used several state-of-the-art LLMs, namely GPT-3.5 [<xref ref-type="bibr" rid="ref11">11</xref>], GPT-4 [<xref ref-type="bibr" rid="ref12">12</xref>], GPT-4 Turbo, and GPT-4o by OpenAI, and PaLM 2 by Google [<xref ref-type="bibr" rid="ref13">13</xref>]. In addition, we used the open-source models DeepSeek-R1 [<xref ref-type="bibr" rid="ref14">14</xref>] by DeepSeek, Mistral-7B [<xref ref-type="bibr" rid="ref15">15</xref>], LLaMA-2 by Meta [<xref ref-type="bibr" rid="ref16">16</xref>], and BioGPT [<xref ref-type="bibr" rid="ref17">17</xref>]. These LLMs had either demonstrated proficiency in similar tasks or had undergone pretraining on medical literature. Model details are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The model hyperparameters were set to the default values, except for the temperature, which was set to zero in accordance with previous studies in the medical domain [<xref ref-type="bibr" rid="ref18">18</xref>]. Temperature is a hyperparameter that controls the randomness of the LLM&#x2019;s output. Lower values make the output more deterministic and focused, reducing variability and creativity. 
A detailed description of how we accessed the LLMs and handled input size constraints is given in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-6"><title>Reference Model</title><p>The reference model represented an algorithmic emulation of the 2021 ESC/EACTS Guidelines for the management of valvular heart disease [<xref ref-type="bibr" rid="ref10">10</xref>]. More specifically, the reference model assigned patients to either SAVR or TAVR according to a flowchart (Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and relevant clinical variables (Tables S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref10">10</xref>]. Model details are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-7"><title>Experiments</title><p>Four experiments were conducted to investigate the effects of data preprocessing on LLM performance: raw, raw+, sum, and sum+.</p><sec id="s2-7-1"><title>Raw</title><p>In the raw experiment, we programmatically extracted the text content from the PDF files of relevant medical reports (ie, the 2 most recent discharge letters, invasive coronary angiography report, echocardiography report, and CT scan report) using Tesseract and concatenated these into a unified plain-text file. This text file was then manually anonymized and programmatically inserted into a prompt template. 
Each prompt included an introductory or continuation phrase and concluded with a request for a treatment decision (Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-7-2"><title>Raw+</title><p>As it is unknown whether the LLMs we used had sufficient knowledge of clinical practice guidelines (CPGs), we compiled a summary of relevant CPG content from the ESC/EACTS Guidelines [<xref ref-type="bibr" rid="ref10">10</xref>]. We added this summary to the prompt along with the unified text reports.</p></sec><sec id="s2-7-3"><title>Sum</title><p>To study the effect of content compression, we replaced the original medical reports used in the raw experiment with concise case summaries. These case summaries were created manually by the study team following a predefined template, with each patient characteristic documented in the HT protocol (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) either affirmed, negated, or populated with the patient-specific value, as exemplified in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-7-4"><title>Sum+</title><p>Case summaries were used as input and were enriched with the CPG summary (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><p>Prompt templates, the CPG summary, and an exemplary case summary are shown in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The LLMs&#x2019; responses were manually reviewed and categorized as either &#x201C;TAVR,&#x201D; &#x201C;SAVR,&#x201D; or &#x201C;indeterminate.&#x201D; Indeterminate responses occur when the model output does not match the available answer choices or when the model determines that there is insufficient information to support a decision (Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
To assess reliability and obtain robust estimates of performance metrics, the LLMs were presented with the same prompt input 10 times in succession for each experiment and patient (hereafter referred to as &#x201C;runs&#x201D;) to obtain a treatment decision. To prevent memory bias, a new chat session was initiated for each run.</p></sec></sec><sec id="s2-8"><title>Performance Metrics</title><p>We quantified agreement by means of Cohen &#x03BA; coefficients. For the sake of completeness, we also calculated accuracies as the proportion of treatment decisions that agreed with those made by the HT; however, we emphasize that due to class imbalance, this metric is only of limited significance and is therefore reported only in Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Decidability was quantified as the proportion of determinate treatment decisions. Bias was quantified using the frequency bias index (FBI), defined as the ratio of predicted to observed treatment decisions for TAVR.</p><p>Due to the limitations of individual metrics, we used 2 different metrics to quantify reliability: intraclass correlation coefficients (ICCs) and normalized Shannon entropy. A detailed description of the performance metrics, including strategies for handling indeterminate responses, is provided in Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-9"><title>Statistical Analysis</title><p>The characteristics of patients who received SAVR and those who received TAVR were compared using the Student <italic>t</italic>-test for normally distributed continuous variables and the Mann-Whitney <italic>U</italic> test for variables departing from normality. The Shapiro-Wilk test was used to assess normality. 
The chi-square test was used for binary variables, and the Fisher exact test was used for sparse binary data.</p><p>Accuracy and Cohen &#x03BA; were computed with Python&#x2019;s sklearn.metrics package (version 1.2.2). ICCs were calculated based on a 1-way random effects, absolute agreement, single-rater model [<xref ref-type="bibr" rid="ref19">19</xref>] using Python&#x2019;s pingouin package (version 0.5.3).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Patient Characteristics</title><p>A total of 80 patients with severe AS who were discussed at our institutional HT in 2022 were included. Of these patients, 24 (30%) underwent SAVR, while 56 (70%) underwent TAVR. Patient characteristics are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Patient characteristics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">Data availability (%)</td><td align="left" valign="bottom">Overall (N=80)</td><td align="left" valign="bottom">SAVR<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (n=24)</td><td align="left" valign="bottom">TAVR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (n=56)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Age (years), mean (SD)</td><td align="left" valign="top">100</td><td align="left" valign="top">77.74 (7.5)</td><td align="left" valign="top">70.71 (6.1)</td><td align="left" valign="top">80.75 (5.8)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Female sex, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">36 (45)</td><td align="left" valign="top">8 (33)</td><td align="left" valign="top">28 (50)</td><td align="left" 
valign="top">.26</td></tr><tr><td align="left" valign="top">Height (cm), mean (SD)</td><td align="left" valign="top">100</td><td align="left" valign="top">168.1 (11.0)</td><td align="left" valign="top">172.5 (11.0)</td><td align="left" valign="top">166.3 (10.6)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top">Body mass (kg), mean (SD)</td><td align="left" valign="top">100</td><td align="left" valign="top">76.3 (17.0)</td><td align="left" valign="top">79.0 (16.0)</td><td align="left" valign="top">75.1 (17.4)</td><td align="left" valign="top">.35</td></tr><tr><td align="left" valign="top">BMI (kg/m<sup>2</sup>), median (IQR)</td><td align="left" valign="top">100</td><td align="left" valign="top">26.0 (23.0-29.7)</td><td align="left" valign="top">25.9 (23.2-29.0)</td><td align="left" valign="top">26.2 (23.0-29.8)</td><td align="left" valign="top">.66</td></tr><tr><td align="left" valign="top">Logistic EuroSCORE<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, median (IQR)</td><td align="left" valign="top">31</td><td align="left" valign="top">6.8 (4.5-13.0)</td><td align="left" valign="top">4.5 (2.2-6.8)</td><td align="left" valign="top">8.4 (5.0-16.0)</td><td align="left" valign="top">.20</td></tr><tr><td align="left" valign="top">EuroSCORE II, median (IQR)</td><td align="left" valign="top">99</td><td align="left" valign="top">2.6 (1.6-4.5)</td><td align="left" valign="top">1.8 (1.1-3.1)</td><td align="left" valign="top">2.9 (1.8-4.9)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top">STS<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> score, median (IQR)</td><td align="left" valign="top">76</td><td align="left" valign="top">2.8 (1.6-4.5)</td><td align="left" valign="top">1.4 (1.1-3.0)</td><td align="left" valign="top">3.3 (2.1-4.5)</td><td align="left" valign="top">.12</td></tr><tr><td align="left" valign="top">Left ventricular ejection fraction (%), median (IQR)</td><td 
align="left" valign="top">100</td><td align="left" valign="top">60.0 (54.3-61.3)</td><td align="left" valign="top">60.0 (48.8-62.0)</td><td align="left" valign="top">60.0 (55.0-60.0)</td><td align="left" valign="top">.28</td></tr><tr><td align="left" valign="top">Aortic valve opening area (cm<sup>2</sup>), median (IQR)</td><td align="left" valign="top">100</td><td align="left" valign="top">0.70 (0.60-0.80)</td><td align="left" valign="top">0.80 (0.68-0.80)</td><td align="left" valign="top">0.70 (0.60-0.80)</td><td align="left" valign="top">.18</td></tr><tr><td align="left" valign="top">Arterial hypertension, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">59 (74)</td><td align="left" valign="top">18 (75)</td><td align="left" valign="top">41 (73)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Diabetes mellitus, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">22 (28)</td><td align="left" valign="top">6 (25)</td><td align="left" valign="top">16 (29)</td><td align="left" valign="top">.96</td></tr><tr><td align="left" valign="top">Hyperlipidemia, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">51 (64)</td><td align="left" valign="top">13 (54)</td><td align="left" valign="top">38 (68)</td><td align="left" valign="top">.36</td></tr><tr><td align="left" valign="top">Previous cardiac surgery, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">1 (1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (2)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Frailty, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">7 (9)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">7 (13)</td><td align="left" valign="top">.17</td></tr><tr><td align="left" valign="top">Sequelae of chest radiation, n (%)</td><td 
align="left" valign="top">100</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Porcelain aorta, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Expected patient-prosthesis mismatch, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">1 (1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (2)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Severe chest deformation or scoliosis, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">7 (9)</td><td align="left" valign="top">1 (4)</td><td align="left" valign="top">6 (11)</td><td align="left" valign="top">.60</td></tr><tr><td align="left" valign="top">Severe coronary artery disease requiring revascularization, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">6 (8)</td><td align="left" valign="top">5 (21)</td><td align="left" valign="top">1 (2)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">Left ventricular ejection fraction &#x2264;40%, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">6 (8)</td><td align="left" valign="top">3 (13)</td><td align="left" valign="top">3 (5)</td><td align="left" valign="top">.52</td></tr><tr><td align="left" valign="top">Active neoplasia, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">7 (9)</td><td align="left" valign="top">2 (8)</td><td align="left" valign="top">5 (9)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Liver cirrhosis, n (%)</td><td align="left" 
valign="top">100</td><td align="left" valign="top">1 (1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (2)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Chronic obstructive pulmonary disease (GOLD<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> stage &#x2265;3), n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">5 (6)</td><td align="left" valign="top">1 (4)</td><td align="left" valign="top">4 (7)</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top">Pulmonary arterial hypertension, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">8 (10)</td><td align="left" valign="top">3 (13)</td><td align="left" valign="top">5 (9)</td><td align="left" valign="top">.94</td></tr><tr><td align="left" valign="top">Under immunosuppressive therapy, n (%)</td><td align="left" valign="top">100</td><td align="left" valign="top">10 (13)</td><td align="left" valign="top">2 (8)</td><td align="left" valign="top">8 (14)</td><td align="left" valign="top">.71</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>SAVR: surgical aortic valve replacement.</p></fn><fn id="table1fn2"><p><sup>b</sup>TAVR: transcatheter aortic valve replacement.</p></fn><fn id="table1fn3"><p><sup>c</sup>EuroSCORE: European System for Cardiac Operative Risk Evaluation.</p></fn><fn id="table1fn4"><p><sup>d</sup>STS: Society of Thoracic Surgeons.</p></fn><fn id="table1fn5"><p><sup>e</sup>GOLD: Global Initiative for Chronic Obstructive Lung Disease.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Qualitative Analysis</title><p>The LLMs&#x2019; outputs ranged from nonsensical treatment recommendations (eg, heart transplant) and purely fabricated content to correctly assessing the patient&#x2019;s status, choosing the correct treatment option, and supporting the treatment decision with additional anatomical 
insights (<xref ref-type="table" rid="table2">Table 2</xref>). Qualitative analysis revealed that smaller models (eg, BioGPT) tended to provide conflicting treatment recommendations for the same patient. In contrast, the frontier models (eg, GPT-4 and PaLM 2) consistently provided the same treatment recommendation when presented with the same patient data repeatedly over 10 runs.</p><p>In each experiment, all LLMs produced hallucinations of varying severity and frequency. These included instructional, contextual, and factual inconsistencies (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Representative responses from the LLMs<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">Experiment</td><td align="left" valign="top">Patient characteristics</td><td align="left" valign="top">LLM response<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">HT<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> treatment decision</td><td align="left" valign="top">Interpretation</td></tr></thead><tbody><tr><td align="left" valign="top">PaLM 2</td><td align="left" valign="top">Raw</td><td align="left" valign="top">56-year-old male; EuroSCORE<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> II: 0.55%; no comorbidities except diffuse, mild coronary atherosclerosis and arterial hypertension; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;Based on the provided information, the recommended treatment for the patient would be transcatheter aortic valve replacement (TAVR). 
The patient has severe aortic stenosis, which is causing symptoms and has been confirmed by echocardiography and cardiac catheterization. The patient is also <italic>at high risk for surgical aortic valve replacement (SAVR) due to their age, comorbidities, and the presence of diffuse coronary artery disease</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>. TAVR is a less invasive procedure than SAVR and is associated with a <italic>lower risk of complications</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>. Therefore, <italic>TAVR</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> would be the preferred treatment option for this patient.&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM incorrectly recommended TAVR<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup> based on falsely extracted and/or interpreted patient characteristics (factual inconsistency).</p></list-item><list-item><p>According to the literature, it has not yet been conclusively determined whether TAVR is associated with fewer complications than SAVR [<xref ref-type="bibr" rid="ref20">20</xref>] (factual inconsistency/bias).</p></list-item></list></td></tr><tr><td align="left" valign="top">BioGPT</td><td align="left" valign="top">Raw</td><td align="left" valign="top">69-year-old male; EuroSCORE II: 7.2%; postcardiac arrest syndrome, frailty, long-term mechanical ventilation, and liver cirrhosis; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;<italic>Surgical replacement of the aortic valve. The patient was referred to the operating room for AVR. The operation was performed under general anesthesia and extracorporeal circulation ... 
The operation was performed with a median sternotomy. The ascending aorta was dissected free and opened&#x2026;and the coronary arteries were visualized&#x2026;</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>&#x201D;</p></list-item></list></td><td align="left" valign="top">TAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM falsely stated a surgery report from a procedure this patient never received and that was not mentioned in the prompt (context inconsistency).</p></list-item></list></td></tr><tr><td align="left" valign="top">BioGPT</td><td align="left" valign="top">Raw</td><td align="left" valign="top">75-year-old female; EuroSCORE II: 2.4%; STS<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup> score: 2.9%; COPD<sup><xref ref-type="table-fn" rid="table2fn10">j</xref></sup> (GOLD<sup><xref ref-type="table-fn" rid="table2fn11">k</xref></sup> Stage III); pulmonary hypertension and frailty; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;<italic>Heart transplantation. - The patient&#x2019;s family decided against surgery. 
- The patient died due to heart failure.</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>&#x201D;</p></list-item></list></td><td align="left" valign="top">TAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM did not follow the instruction to choose between SAVR and TAVR but opted for heart transplantation (instruction inconsistency).</p></list-item><list-item><p>The LLM fabricated a clinical course that was not mentioned in the prompt (context inconsistency).</p></list-item></list></td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Sum</td><td align="left" valign="top">72-year-old female; EuroSCORE II: 1.6%; STS score: 1.1%; no relevant comorbidities; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response<sub>1</sub>: &#x201C;Based on the given information, the recommended treatment for a person with severe aortic valve stenosis and no other significant comorbidities would be <italic>SAVR</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> (Surgical Aortic Valve Replacement) if conservative therapy is not an option.&#x201D;</p></list-item><list-item><p>Response<sub>2</sub>: &#x201C;Based on the provided information, the recommended treatment for a person with severe aortic valve stenosis, who does not have any contraindications or relevant comorbidities, would be <italic>TAVI</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> (Transcatheter Aortic Valve Implantation) rather than SAVR (Surgical Aortic Valve Replacement).&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM gave opposite answers to the exact same question (unreliability).</p></list-item><list-item><p>The LLM incorrectly recommended TAVR based on the absence of comorbidities (factual 
inconsistency).</p></list-item></list></td></tr><tr><td align="left" valign="top">PaLM 2</td><td align="left" valign="top">Raw+</td><td align="left" valign="top">56-year-old male; EuroSCORE II: 0.55%; STS score: 0.7%; no comorbidities except arterial hypertension; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;Based on the guidelines, the recommended treatment would be transcatheter aortic valve replacement (<italic>TAVR</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>). The patient is 56 years old and at <italic>high surgical risk</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> (STS-PROM score &#x003E;8% or EuroSCORE II &#x003E;8 %) due to <italic>severe frailty</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> and <italic>previous cardiac surgery</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>. TAVR is the preferred treatment for patients who are inoperable or at high surgical risk.&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM falsely claimed that the patient is at high surgical risk due to frailty and previous cardiac surgery, which are both entirely fabricated (factual inconsistency).</p></list-item></list></td></tr><tr><td align="left" valign="top">GPT-3.5</td><td align="left" valign="top">Sum+</td><td align="left" valign="top">81-year-old female; logistic EuroSCORE: 8.44%; EuroSCORE II: 1.82%; STS score: 4.33%; stage 3A chronic kidney disease; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;Based on the guidelines provided, the recommended treatment for the patient would be <italic>SAVR</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> (surgical aortic valve replacement) if 
conservative therapy is not an option. This is because the patient is 81 years old and falls into the <italic>older age category</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup>, and <italic>their logistic EuroSCORE</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> is 8.44%, which is above the threshold for TAVR (transcatheter aortic valve replacement) recommendation.&#x201D;</p></list-item></list></td><td align="left" valign="top">TAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM incorrectly mentioned an association between the logistic EuroSCORE and surgical risk prohibiting SAVR (context inconsistency).</p></list-item><list-item><p>The LLM correctly assigned the patient to the older age category but then incorrectly recommended SAVR as the optimal therapy (logical inconsistency).</p></list-item></list></td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Raw</td><td align="left" valign="top">65-year-old female; EuroSCORE II: 2.5%; STS score: 1.4%; no relevant comorbidities; ascending aortic aneurysm (48 mm) mentioned in the CT<sup><xref ref-type="table-fn" rid="table2fn13">m</xref></sup> scan report</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;Based on the guidelines, the recommended treatment would be Surgical Aortic Valve Replacement (<italic>SAVR</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup>) as the patient has a severe aortic stenosis and an <italic>ascending aortic aneurysm</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup>. 
Transcatheter Aortic Valve Implantation (TAVI) is typically recommended for patients who are at <italic>high or prohibitive surgical risk, which does not seem to be the case here.</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup>&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Correct treatment decision plus additional extraction of decision-relevant clinical data.</p></list-item></list></td></tr><tr><td align="left" valign="top">LLaMA-2</td><td align="left" valign="top">Sum+</td><td align="left" valign="top">68-year-old male; EuroSCORE II: 1.29%; STS score: 3.04%; COPD; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response<sub>1</sub>: &#x201C;Based on the guidelines provided, the recommended treatment for this patient would be TAVI. The patient&#x2019;s age (68 y old) and <italic>low surgical risk</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> (EuroSCORE II 1.29 %, STS score 3.037 %) make them a suitable candidate for <italic>TAVI</italic>. Additionally, the patient does not have any <italic>contraindications for TAVI</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup><italic>, such as severe frailty, previous cardiac surgery, or significant thorax deformity.</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>&#x201D;</p></list-item><list-item><p>Response<sub>2</sub>: &#x201C;Based on the information provided, the patient&#x2019;s EuroSCORE II is 1.29% and STS score is 3.037 %, which indicates that they are at <italic>low surgical risk</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup>. The patient is also <italic>younger than 75 years</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> old. 
Therefore, according to the guidelines, <italic>SAVR</italic><sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup> is the recommended treatment option for this patient.&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM correctly identified low surgical risk but then incorrectly rejected SAVR as a therapeutic option (logical inconsistency).</p></list-item><list-item><p>The LLM incorrectly mentioned factors favoring TAVR from the prompt (context inconsistency).</p></list-item><list-item><p>Although the LLM correctly selected SAVR the second time, it gave the opposite response to the exact same question (unreliability).</p></list-item></list></td></tr><tr><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">Raw+</td><td align="left" valign="top">65-year-old male; EuroSCORE II: 0.92%; STS score: 0.73%; end-stage renal disease requiring hemodialysis; no relevant anatomical aspects to consider</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Response: &#x201C;&#x2026; Based on the 2021 ESC/EACTS guidelines and the patient&#x2019;s clinical profile, the recommended treatment would be <italic>TAVR</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>. Reasoning: Patient age (65 y) is below 75 but has significant comorbidities, <italic>including terminal renal insufficiency, indicating high surgical risk</italic><sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>. Suitable tricuspid aortic valve anatomy for TAVR. 
High surgical risk factors favor TAVR over SAVR.&#x201D;</p></list-item></list></td><td align="left" valign="top">SAVR</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>The LLM overestimated the effect of end-stage renal disease on surgical risk, which was still relatively low overall, as evidenced by low surgical risk scores (factual inconsistency).</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table2fn2"><p><sup>b</sup>The LLMs&#x2019; treatment responses included well-informed decisions but also hallucinations ranging from obvious misinformation to absurd treatment recommendations and logical errors. We largely adhered to the taxonomy for the description of hallucinations established by Huang et al [<xref ref-type="bibr" rid="ref21">21</xref>].</p></fn><fn id="table2fn3"><p><sup>c</sup>LLM responses with subscripts indicate responses to the same question (obtained during 10 runs).</p></fn><fn id="table2fn4"><p><sup>d</sup>HT: heart team.</p></fn><fn id="table2fn5"><p><sup>e</sup>EuroSCORE: European System for Cardiac Operative Risk Evaluation.</p></fn><fn id="table2fn6"><p><sup>f</sup>The italicized part indicates an incorrect or harmful response.</p></fn><fn id="table2fn7"><p><sup>g</sup>SAVR: surgical aortic valve replacement.</p></fn><fn id="table2fn8"><p><sup>h</sup>TAVR: transcatheter aortic valve replacement.</p></fn><fn id="table2fn9"><p><sup>i</sup>STS: Society of Thoracic Surgeons.</p></fn><fn id="table2fn10"><p><sup>j</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table2fn11"><p><sup>k</sup>GOLD: Global Initiative for Chronic Obstructive Lung Disease.</p></fn><fn id="table2fn12"><p><sup>l</sup>The italicized part indicates a correct or useful response.</p></fn><fn id="table2fn13"><p><sup>m</sup>CT: computed tomography.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Quantitative 
Analysis</title><p><xref ref-type="fig" rid="figure2">Figure 2</xref> and Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> present the performance metrics. In the raw experiment, LLMs&#x2019; treatment decisions were in poor agreement with the HT. In this experiment, DeepSeek-R1 showed the highest agreement with the HT, with a Cohen &#x03BA; coefficient of 0.22. Some LLMs gave indeterminate treatment recommendations in up to 54% of cases (eg, GPT-3.5) and showed low reliability as evidenced by low ICCs and high entropy values (eg, Mistral, LLaMA-2, and DeepSeek-R1). FBIs were substantially higher than 1.0 for all LLMs, except BioGPT, indicating a bias toward TAVR. The reference model outperformed the LLMs in the raw experiment regarding the metrics we assessed.</p><p>In the raw+ experiment, DeepSeek-R1 again showed the highest agreement with the HT with a Cohen &#x03BA; coefficient of 0.40, indicating fair agreement. The performance metrics of the other LLMs did not change substantially in the raw+ experiment. However, the performance metrics of most LLMs substantially improved in the sum experiment and peaked in the sum+ experiment, where some LLMs (eg, GPT-4 models and DeepSeek-R1) drew level with the reference model.</p><p>A general trend toward more concordant treatment decisions, fewer indeterminate responses, increased reliability, and less bias toward TAVR was observed with increasing data preprocessing and information enrichment efforts from the raw experiment to the sum+ experiment (<xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Performance metrics of the large language models are shown for the 4 experiments conducted. The dashed line represents the reference model. 
Cohen &#x03BA; coefficients &#x2264;0 indicate no agreement, 0.01&#x2010;0.20 indicate slight agreement, 0.21&#x2010;0.40 indicate fair agreement, 0.41&#x2010;0.60 indicate moderate agreement, 0.61&#x2010;0.80 indicate substantial agreement, and 0.81&#x2010;1.0 indicate almost perfect agreement [<xref ref-type="bibr" rid="ref20">20</xref>] with the heart team&#x2019;s treatment decisions. Frequency bias index (FBI) values &#x003E;1 indicate bias toward transcatheter aortic valve replacement (TAVR) and &#x003C;1 indicate bias toward surgical aortic valve replacement (SAVR). Intraclass correlation coefficients (ICCs) &#x003C;0.5 indicate poor test-retest reliability, 0.50&#x2010;0.75 indicate moderate reliability, 0.75&#x2010;0.90 indicate good reliability, and &#x003E;0.90 indicate excellent reliability [<xref ref-type="bibr" rid="ref19">19</xref>]. Instances where ICCs were undefined are marked by asterisks. Entropy values close to 0 indicate low output variation, and entropy values close to 1 indicate high output variation. Decidability was defined as the proportion of nonindeterminate treatment decisions. The exact numerical values for the performance metrics are displayed in Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v6i1e74899_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Frequencies of the treatment decisions of the large language models in the 4 experiments conducted. A general trend toward increasing decidability and an increasing proportion of treatment decisions favoring surgical aortic valve replacement (SAVR) could be observed between the raw experiment and the sum+ experiment. 
TAVR: transcatheter aortic valve replacement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v6i1e74899_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>LLM Performance With Original Clinical Data</title><p>To our knowledge, this is the first study to evaluate the impact of input data representation, including real-world medical data, on the ability of LLMs to make guideline-concordant treatment decisions.</p></sec><sec id="s4-2"><title>Current LLMs Make Incorrect Decisions Based on Original Clinical Data</title><p>Our analysis revealed that LLMs struggled to process original medical reports effectively, often outputting &#x201C;TAVR&#x201D; or providing indeterminate responses. The LLMs showed low agreement with the HT, exhibited undecidability and unreliability, and displayed a strong bias toward TAVR. The considerably high accuracies (Table S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) observed with some LLMs in the raw experiment can be largely attributed to the class imbalance within our patient cohort, where 70% of patients received TAVR.</p></sec><sec id="s4-3"><title>LLMs Require Extensive Data Preprocessing to Make Sound Therapeutic Decisions</title><p>Performance improved substantially when physician-made case summaries were used as input and when guideline knowledge was added to the prompts. The GPT-4 models and DeepSeek-R1 stood out as the most capable LLMs in our experiments. 
When given case summaries and a CPG summary, these 2 models showed substantial agreement with the HT and drew level with the reference model in terms of interrater agreement, decidability, and bias.</p></sec><sec id="s4-4"><title>Data Representation Affects LLM Performance</title><p>GPT-4o, a distilled and streamlined version of GPT-4, and DeepSeek-R1, a model with enhanced reasoning abilities, showed more promising results than previous-generation LLMs when confronted with real-world medical data (raw and raw+ experiments); however, their performances remain insufficient for clinical application. The fact that even state-of-the-art LLMs show significant stochastic variations in decision-making, and thus unreliability, further supports this finding.</p><p>An explanation for the underperformance of LLMs in the raw experiment is not immediately apparent due to their opaque nature and a lack of established tools that allow the direct examination of input-output correlations. However, the underperformance cannot be attributed to a lack of guideline knowledge or incorrectly applied guideline knowledge since the performance in the raw+ experiment was, in general, similar to that in the raw experiment and since LLMs can presumably apply clinical knowledge to clinical cases as shown in their ability to pass medical board exams [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>This, along with the significant performance gains observed when providing case summaries instead of original medical reports, suggests that input data representation is the most critical factor in LLM performance. This finding is consistent with the fact that virtually all studies showing that LLMs make sound treatment decisions used preprocessed clinical data as input [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Of note is the study by Salihu et al [<xref ref-type="bibr" rid="ref8">8</xref>]. 
In this study, data from patients with severe AS were provided to GPT-4 to obtain a treatment decision for either TAVR, SAVR, or conservative management. Patient data were provided in the form of a standardized multiple-choice questionnaire with 14 key clinical variables as input, similar to our sum experiments. GPT-4 treatment decisions were in substantial agreement with HT treatment decisions, a finding that we were able to reproduce in our experiments. Similarly, in studies on tasks beyond therapeutic decision-making, such as answering board exam questions [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref23">23</xref>] and diagnosing complex clinical cases [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], LLMs performed particularly well when the input data were concise and information-dense.</p><p>Basic research has indicated that LLMs struggle with lengthy texts [<xref ref-type="bibr" rid="ref26">26</xref>] spanning multiple prompts, potentially leading to memory loss [<xref ref-type="bibr" rid="ref27">27</xref>], and with texts with a low signal-to-noise ratio [<xref ref-type="bibr" rid="ref28">28</xref>]. A study by Levy et al [<xref ref-type="bibr" rid="ref29">29</xref>] demonstrated that LLM reasoning performance declined notably with increasing input length. Specifically, the authors observed a 26% drop in LLM performance when the input length was artificially increased from 250 to 3000 tokens, that is, a range of input lengths comparable to that in our study (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Recently, Hager et al [<xref ref-type="bibr" rid="ref30">30</xref>] investigated the ability of LLMs to correctly diagnose patients presenting to the emergency department with abdominal pain. 
In this study, it was shown that deliberately withholding relevant clinical information from the LLMs paradoxically improved their diagnostic accuracy. Overall, this implies that LLMs are sensitive to both the signal-to-noise ratio and the sheer quantity of information provided.</p></sec><sec id="s4-5"><title>LLMs Are Not Yet Ready for Clinical Decision-Making</title><p>The results obtained with preprocessed patient data in our study and in previous studies demonstrate the potential of LLMs in medicine. However, the use of curated and preprocessed data does not reflect the clinical situation: To this day, the communication of clinical data within hospitals is largely based on unstructured free text.</p><p>Health care professionals have high expectations of artificial intelligence (AI) to reduce their workload. This is not the case when physicians must manually extract and prepare key patient data for LLMs, as data extraction, not the actual decision-making task, is usually the most labor-intensive step.</p><p>Once key patient data have been extracted and prepared as input, simpler machine learning models (eg, tree-based models) could be used alternatively to provide decision support. In our study, as well as in the study by Salihu et al [<xref ref-type="bibr" rid="ref8">8</xref>], simple reference models performed comparably to GPT-4, suggesting that non-LLM models could outperform LLMs if trained appropriately. 
In addition, nongenerative models do not exhibit undesirable behaviors, such as hallucinations and unreliability [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], and provide explainability and established measures of uncertainty quantification, which are 2 hallmarks of responsible AI [<xref ref-type="bibr" rid="ref33">33</xref>] that are currently not adequately implemented for LLMs [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>Another hallmark of responsible AI is to address algorithmic bias [<xref ref-type="bibr" rid="ref37">37</xref>]. It is conceivable that the bias we observed in virtually all LLMs in our study could be due to LLMs being exposed to an abundance of TAVR-related internet literature during training compared to SAVR-related literature, subsequently influencing the treatment decisions.</p><p>A reasonable approach could be to use LLMs to extract clinical data [<xref ref-type="bibr" rid="ref38">38</xref>] and generate input for downstream deterministic models, which then perform the decision-making. While this strategy should ideally exploit the strengths of LLMs and well-established machine learning classifiers, its effectiveness remains to be proven in future studies.</p></sec><sec id="s4-6"><title>Limitations</title><p>Our study has some limitations, including a small patient cohort from a single center and the retrospective nature of our investigation. Nevertheless, the size of our study cohort (n=80) was comparable to that in previous key publications [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref39">39</xref>] studying the performance of LLMs in medicine, and we assume that our patient cohort was sufficiently large given the clear trends we observed.</p><p>The HT decisions against which we compared the LLMs&#x2019; treatment decisions may themselves be nonobjective and deviate from the CPGs. 
We manually reviewed the HT treatment decisions and found no substantial deviations from the CPGs. Since treatment decisions are ultimately made by a team of physicians (ie, human individuals), the ground truth in experiments such as ours is inherently susceptible to some degree of subjectivity.</p><p>Given the limited cohort size and the considerable length of the medical reports, few-shot prompting or fine-tuning was not a viable option. We did not employ more sophisticated prompting techniques, such as chain-of-thought [<xref ref-type="bibr" rid="ref40">40</xref>], and confined hyperparameter tuning to the temperature parameter. Moreover, given the rapid pace of LLM development, it is plausible that the most recently released reasoning-focused models (eg, GPT-o3 and Grok 4) may outperform those evaluated in our study. Accordingly, our findings should be interpreted as a reflection of the current state of model performance.</p><p>The majority of LLMs evaluated were primarily trained on English-language data. While recent studies suggest that newer models exhibit greater language agnosticism, it remains plausible that our use of German-language clinical reports contributed to reduced model performance, thereby limiting the generalizability of our findings to other languages and clinical settings [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>We acknowledge that the off-the-shelf LLMs used in our study may exhibit biases due to the underrepresentation of certain ethnic, gender, or socioeconomic groups in their training data. However, given the limited size of our cohort, we were not able to systematically assess or stratify model performance across these dimensions.</p><p>Lastly, we did not investigate whether incorporating imaging data as additional input for multimodal LLMs, such as GPT-4o, could have improved model performance in our task. 
While this is theoretically plausible, recent research suggests that the effectiveness of multimodal models in clinical applications depends heavily on the quality of the accompanying textual context [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Given that relevant imaging findings were generally described in detail in the imaging reports, we assume that the inclusion of imaging data in our specific use case would likely have had only a limited impact on overall model performance.</p></sec><sec id="s4-7"><title>Conclusions</title><p>Our experiments are among the most challenging tasks LLMs have been asked to perform in the medical sciences. Overall, we conclude that LLMs are currently not suitable as decision makers for the treatment of patients with severe AS, as our results suggest that LLMs require elaborate preprocessing of patient data to make guideline-concordant treatment decisions. Thus, we do not share the medical community&#x2019;s concern that staff will be replaced by AI [<xref ref-type="bibr" rid="ref44">44</xref>] in clinical decision-making in the near future.</p><p>Our findings suggest that LLMs should be used cautiously, particularly by medical laypersons seeking medical advice, such as second opinions. Users without extensive domain knowledge may receive treatment recommendations at a level similar to our raw experiments. This is because medical laypersons may not be able to support prompts with guideline knowledge or create case summaries of sufficient quality but will only be able to use original medical reports. The findings in the study by Hager et al [<xref ref-type="bibr" rid="ref30">30</xref>] suggest that LLMs perform poorly when collecting additional patient data sequentially, as physicians would during a patient-physician dialogue. 
This suggests that the alternative to our approach&#x2014;not providing all clinical data to the LLM at once, but having medical laypersons provide essential information incrementally during a chat session&#x2014;is also likely to lead to suboptimal therapeutic recommendations.</p><p>Finally, medical laypersons may not be able to recognize hallucinations as effectively as medical professionals. This, combined with the eloquent and persuasive linguistic style of most LLMs, has the potential to mislead users by creating an illusion of greater certainty than warranted, aggravating the hazardous effects of incorrect treatment recommendations.</p></sec></sec></body><back><ack><p>We thank Michael Gudo (MORPHISTO GmbH) for providing access to GPT-4 and Hadi El Ali (BSc), University of Bayreuth, for contributing to the illustration of <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>This work was supported by the German Centre for Cardiovascular Research (DZHK), funded by the German Federal Ministry of Education and Research, and the Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin. DH received 2 grants from the DZHK (grant number: 81X3100214 and grant number: 81X3100220). 
TR and DH are participants in the BIH Charit&#x00E9; Digital Clinician Scientist Program funded by the Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin and the Berlin Institute of Health at Charit&#x00E9; (BIH).</p></ack><notes><sec><title>Data Availability</title><p>The anonymized data underlying this article will be shared upon reasonable request to the corresponding author.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: TR, MH, DH, AM (equal)</p><p>Data curation: DH, FR</p><p>Formal analysis: TR, MH (equal)</p><p>Methodology: TR, MH (equal)</p><p>Supervision: AM</p><p>Visualization: MH (lead), TR (supporting)</p><p>Writing &#x2013; original draft: TR, MH (equal)</p><p>Writing &#x2013; review &#x0026; editing: TR, MH (equal), AM (supporting), NH (supporting), TDT (supporting), MIG (supporting), AU (supporting), CK (supporting), JK (supporting), HD (supporting), BOB (supporting), GH (supporting), FB (supporting), VF (supporting)</p></fn><fn fn-type="conflict"><p>DH reports financial engagements beyond the scope of the presented work. 
These activities include consultation services and speaking engagements for companies, including AstraZeneca, Bayer Vital, Boehringer Ingelheim, Coliquio, and Novartis.</p><p>TDT holds shares of Microsoft, Amazon, and Palantir Technologies.</p><p>AU serves as a physician proctor to Boston Scientific, Edwards Lifesciences, and Medtronic.</p><p>JK reports personal fees from Edwards and personal fees from LSI outside the submitted work.</p><p>BOB declares research funding from the British Heart Foundation and the National Institute for Health Science Research, and relevant financial activities outside the submitted work with Teleflex and Abiomed in relation to consultancy fees.</p><p>FB reports funding from Medtronic and grants from the German Federal Ministry of Education and Research, grants from the German Federal Ministry of Health, grants from the Berlin Institute of Health, personal fees from Elsevier Publishing, grants from Hans B&#x00F6;ckler Foundation, other funds from Robert Koch Institute, grants from Einstein Foundation, and grants from Berlin University Alliance outside the submitted work.</p><p>VF declares relevant financial activities outside the submitted work with Medtronic GmbH, Biotronik SE &#x0026; Co, Abbott GmbH &#x0026; Co KG, Boston Scientific, Edwards Lifesciences, Berlin Heart, Novartis Pharma GmbH, JOTEC GmbH, and Zurich Heart in relation to educational grants (including travel support), fees for lectures and speeches, fees for professional consultation, and research and study funds.</p><p>AM declares receiving consulting and lecturing fees from Medtronic, lecturing fees from Bayer, and consulting fees from Pfizer. 
AM is the founder and managing director of x-cardiac GmbH.</p><p>The other authors have no conflicts of interest to disclose.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AS</term><def><p>aortic stenosis</p></def></def-item><def-item><term id="abb3">CPG</term><def><p>clinical practice guideline</p></def></def-item><def-item><term id="abb4">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb5">EACTS</term><def><p>European Association for Cardio-Thoracic Surgery</p></def></def-item><def-item><term id="abb6">ESC</term><def><p>European Society of Cardiology</p></def></def-item><def-item><term id="abb7">FBI</term><def><p>frequency bias index</p></def></def-item><def-item><term id="abb8">HT</term><def><p>heart team</p></def></def-item><def-item><term id="abb9">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb10">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb11">SAVR</term><def><p>surgical aortic valve replacement</p></def></def-item><def-item><term id="abb12">TAVR</term><def><p>transcatheter aortic valve replacement</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit 
Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Palepu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Towards conversational diagnostic AI</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Jan 11, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2401.05654">https://arxiv.org/abs/2401.05654</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Klang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sklair-Levy</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language model (ChatGPT) as a support tool for breast tumor board</article-title><source>NPJ Breast Cancer</source><year>2023</year><month>05</month><day>30</day><volume>9</volume><issue>1</issue><fpage>44</fpage><pub-id pub-id-type="doi">10.1038/s41523-023-00557-8</pub-id><pub-id pub-id-type="medline">37253791</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aghamaliyev</surname><given-names>U</given-names> </name><name name-style="western"><surname>Karimbayli</surname><given-names>J</given-names> </name><name name-style="western"><surname>Giessen-Jung</surname><given-names>C</given-names> </name><etal/></person-group><article-title>ChatGPT&#x2019;s gastrointestinal tumor board tango: a limping dance partner?</article-title><source>Eur J Cancer</source><year>2024</year><month>07</month><volume>205</volume><fpage>114100</fpage><pub-id pub-id-type="doi">10.1016/j.ejca.2024.114100</pub-id><pub-id pub-id-type="medline">38729055</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kozel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gurses</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Gecici</surname><given-names>NN</given-names> </name><etal/></person-group><article-title>Chat-GPT on brain tumors: an examination of artificial intelligence/machine learning&#x2019;s ability to provide diagnoses and treatment plans for example neuro-oncology cases</article-title><source>Clin Neurol 
Neurosurg</source><year>2024</year><month>04</month><volume>239</volume><fpage>108238</fpage><pub-id pub-id-type="doi">10.1016/j.clineuro.2024.108238</pub-id><pub-id pub-id-type="medline">38507989</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lukac</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dayan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fink</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Evaluating ChatGPT as an adjunct for the multidisciplinary tumor board decision-making in primary breast cancer cases</article-title><source>Arch Gynecol Obstet</source><year>2023</year><month>12</month><volume>308</volume><issue>6</issue><fpage>1831</fpage><lpage>1844</lpage><pub-id pub-id-type="doi">10.1007/s00404-023-07130-5</pub-id><pub-id pub-id-type="medline">37458761</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salihu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Meier</surname><given-names>D</given-names> </name><name name-style="western"><surname>Noirclerc</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A study of ChatGPT in facilitating heart team decisions on severe aortic stenosis</article-title><source>EuroIntervention</source><year>2024</year><month>04</month><day>15</day><volume>20</volume><issue>8</issue><fpage>e496</fpage><lpage>e503</lpage><pub-id pub-id-type="doi">10.4244/EIJ-D-23-00643</pub-id><pub-id pub-id-type="medline">38629422</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Roth</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Mensah</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>CO</given-names> </name><etal/></person-group><article-title>Global burden of cardiovascular diseases and risk factors, 1990-2019: update from the GBD 2019 study</article-title><source>J Am Coll Cardiol</source><year>2020</year><month>12</month><day>22</day><volume>76</volume><issue>25</issue><fpage>2982</fpage><lpage>3021</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2020.11.010</pub-id><pub-id pub-id-type="medline">33309175</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vahanian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Beyersdorf</surname><given-names>F</given-names> </name><name name-style="western"><surname>Praz</surname><given-names>F</given-names> </name><etal/></person-group><article-title>2021 ESC/EACTS Guidelines for the management of valvular heart disease</article-title><source>Eur Heart J</source><year>2022</year><month>02</month><day>12</day><volume>43</volume><issue>7</issue><fpage>561</fpage><lpage>632</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehab395</pub-id><pub-id pub-id-type="medline">34453165</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ye</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A comprehensive capability analysis of GPT-3 and GPT-3.5 series 
models</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Dec 23, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2303.10420">https://arxiv.org/abs/2303.10420</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Mar 4, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2303.08774">https://arxiv.org/abs/2303.08774</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Anil</surname><given-names>R</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Firat</surname><given-names>O</given-names> </name><etal/></person-group><article-title>PaLM 2 technical report</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Sep 13, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2305.10403">https://arxiv.org/abs/2305.10403</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Jan 22, 2025</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2501.12948">https://arxiv.org/abs/2501.12948</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Oct 10, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2310.06825">https://arxiv.org/abs/2310.06825</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Jul 19, 2023</comment><comment><ext-link ext-link-type="uri" 
xlink:href="https://arxiv.org/abs/2307.09288">https://arxiv.org/abs/2307.09288</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>BioGPT: generative pre-trained transformer for biomedical text generation and mining</article-title><source>Brief Bioinformatics</source><year>2022</year><month>11</month><day>19</day><volume>23</volume><issue>6</issue><fpage>bbac409</fpage><pub-id pub-id-type="doi">10.1093/bib/bbac409</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li&#x00E9;vin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Hother</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Motzfeldt</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Winther</surname><given-names>O</given-names> </name></person-group><article-title>Can large language models reason about medical questions?</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Dec 24, 2022</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2207.08143">https://arxiv.org/abs/2207.08143</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> 
</name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Virtanen</surname><given-names>MPO</given-names> </name><name name-style="western"><surname>Eskola</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jalava</surname><given-names>MP</given-names> </name><etal/></person-group><article-title>Comparison of outcomes after transcatheter aortic valve replacement vs surgical aortic valve replacement among patients with aortic stenosis at low operative risk</article-title><source>JAMA Netw Open</source><year>2019</year><month>06</month><day>5</day><volume>2</volume><issue>6</issue><fpage>e195742</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.5742</pub-id><pub-id pub-id-type="medline">31199448</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>W</given-names> </name><etal/></person-group><article-title>A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Nov 19, 2024</comment><comment><ext-link ext-link-type="uri" 
xlink:href="https://arxiv.org/abs/2311.05232">https://arxiv.org/abs/2311.05232</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>MedBench: a large-scale Chinese benchmark for evaluating medical large language models</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Dec 20, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2312.12806">https://arxiv.org/abs/2312.12806</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? 
The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eriksen</surname><given-names>AV</given-names> </name><name name-style="western"><surname>M&#x00F6;ller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ryg</surname><given-names>J</given-names> </name></person-group><article-title>Use of GPT-4 to diagnose complex clinical cases</article-title><source>NEJM AI</source><year>2024</year><month>01</month><volume>1</volume><issue>1</issue><fpage>AIp2300031</fpage><pub-id pub-id-type="doi">10.1056/AIp2300031</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Novak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zeljkovi&#x0107;</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rode</surname><given-names>F</given-names> </name><etal/></person-group><article-title>The pulse of artificial intelligence in cardiology: a comprehensive evaluation of state-of-the-art large language models for potential use in clinical cardiology</article-title><source>medRxiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Dec 7, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://www.medrxiv.org/content/10.1101/2023.08.08.23293689v3">https://www.medrxiv.org/content/10.1101/2023.08.08.23293689v3</ext-link></comment><pub-id pub-id-type="doi">10.1101/2023.08.08.23293689</pub-id></nlm-citation></ref><ref 
id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>NF</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hewitt</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Lost in the middle: how language models use long contexts</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on  Nov 20, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2307.03172">https://arxiv.org/abs/2307.03172</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sejnowski</surname><given-names>TJ</given-names> </name></person-group><article-title>Large language models and the reverse Turing test</article-title><source>Neural Comput</source><year>2023</year><month>02</month><day>17</day><volume>35</volume><issue>3</issue><fpage>309</fpage><lpage>342</lpage><pub-id pub-id-type="doi">10.1162/neco_a_01563</pub-id><pub-id pub-id-type="medline">36746144</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>NF</given-names> </name></person-group><article-title>Resilience of large language models for noisy 
instructions</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on Oct 3, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2404.09754">https://arxiv.org/abs/2404.09754</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Levy</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jacoby</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goldberg</surname><given-names>Y</given-names> </name></person-group><article-title>Same task, more tokens: the impact of input length on the reasoning performance of large language models</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on Jul 10, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2402.14848">https://arxiv.org/abs/2402.14848</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roustan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bastardot</surname><given-names>F</given-names> </name></person-group><article-title>The clinicians&#x2019; guide to large language models: a general perspective with a focus on hallucinations</article-title><source>Interact J Med Res</source><year>2025</year><month>01</month><day>28</day><volume>14</volume><fpage>e59823</fpage><pub-id pub-id-type="doi">10.2196/59823</pub-id><pub-id pub-id-type="medline">39874574</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivarajah</surname><given-names>U</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Olya</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mathew</surname><given-names>S</given-names> </name></person-group><article-title>Responsible artificial intelligence (AI) for digital health and medical analytics</article-title><source>Inf Syst 
Front</source><year>2023</year><month>06</month><day>5</day><volume>25</volume><issue>6</issue><fpage>2117</fpage><lpage>2122</lpage><pub-id pub-id-type="doi">10.1007/s10796-023-10412-7</pub-id><pub-id pub-id-type="medline">37361886</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Specia</surname><given-names>L</given-names> </name></person-group><article-title>From understanding to utilization: a survey on explainability for large language models</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on Feb 22, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2401.12874">https://arxiv.org/abs/2401.12874</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>G</given-names> </name></person-group><article-title>Uncertainty estimation and quantification for LLMs: a simple supervised approach</article-title><source>arXiv</source><access-date>2025-10-01</access-date><comment>Preprint posted online on Oct 23, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2404.15993">https://arxiv.org/abs/2404.15993</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Quttainah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>V</given-names> </name><name name-style="western"><surname>Madakam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lurie</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mark</surname><given-names>S</given-names> </name></person-group><article-title>Cost, usability, credibility, fairness, accountability, transparency, and explainability framework for safe and effective large language models in medical education: narrative review and qualitative study</article-title><source>JMIR AI</source><year>2024</year><month>04</month><day>23</day><volume>3</volume><fpage>e51834</fpage><pub-id pub-id-type="doi">10.2196/51834</pub-id><pub-id pub-id-type="medline">38875562</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Vajravelu</surname><given-names>BN</given-names> </name></person-group><article-title>Assessing the current limitations of large language models in advancing health care education</article-title><source>JMIR Form Res</source><year>2025</year><month>01</month><day>16</day><volume>9</volume><fpage>e51319</fpage><pub-id pub-id-type="doi">10.2196/51319</pub-id><pub-id pub-id-type="medline">39819585</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dagdelen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Structured 
information extraction from scientific text with large language models</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>15</day><volume>15</volume><issue>1</issue><fpage>1418</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45563-x</pub-id><pub-id pub-id-type="medline">38360817</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Hirabayashi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Barna</surname><given-names>L</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pasquale</surname><given-names>LR</given-names> </name></person-group><article-title>Assessment of a large language model&#x2019;s responses to questions and cases about glaucoma and retina management</article-title><source>JAMA Ophthalmol</source><year>2024</year><month>04</month><day>1</day><volume>142</volume><issue>4</issue><fpage>371</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.6917</pub-id><pub-id pub-id-type="medline">38386351</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><conf-name>NIPS &#x2019;22: Proceedings of the 36th International Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 
9, 2022</conf-date><conf-loc>New Orleans, LA, USA</conf-loc><pub-id pub-id-type="doi">10.5555/3600270.3602070</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roso&#x0142;</surname><given-names>M</given-names> </name><name name-style="western"><surname>G&#x0105;sior</surname><given-names>JS</given-names> </name><name name-style="western"><surname>&#x0141;aba</surname><given-names>J</given-names> </name><name name-style="western"><surname>Korzeniewski</surname><given-names>K</given-names> </name><name name-style="western"><surname>M&#x0142;y&#x0144;czak</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish Medical Final Examination</article-title><source>Sci Rep</source><year>2023</year><month>11</month><day>22</day><volume>13</volume><issue>1</issue><fpage>20512</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id><pub-id pub-id-type="medline">37993519</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;nay</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x00D6;zt&#x00FC;rk</surname><given-names>A</given-names> </name><name name-style="western"><surname>&#x00D6;zerol</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yi&#x011F;it</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Erenler</surname><given-names>AK</given-names> </name></person-group><article-title>Comparison of emergency medicine specialist, cardiologist, and chat-GPT in electrocardiography assessment</article-title><source>Am J Emerg Med</source><year>2024</year><month>06</month><volume>80</volume><fpage>51</fpage><lpage>60</lpage><pub-id 
pub-id-type="doi">10.1016/j.ajem.2024.03.017</pub-id><pub-id pub-id-type="medline">38507847</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeljkovic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Novak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lisicic</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Beyond text: the impact of clinical context on GPT-4&#x2019;s 12-lead electrocardiogram interpretation accuracy</article-title><source>Can J Cardiol</source><year>2025</year><month>07</month><volume>41</volume><issue>7</issue><fpage>1406</fpage><lpage>1414</lpage><pub-id pub-id-type="doi">10.1016/j.cjca.2025.01.036</pub-id><pub-id pub-id-type="medline">39971004</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fogo</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Kronbichler</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bajema</surname><given-names>IM</given-names> </name></person-group><article-title>AI&#x2019;s threat to the medical profession</article-title><source>JAMA</source><year>2024</year><month>02</month><day>13</day><volume>331</volume><issue>6</issue><fpage>471</fpage><lpage>472</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.0018</pub-id><pub-id pub-id-type="medline">38241042</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional information to support the study findings.</p><media xlink:href="xmed_v6i1e74899_app1.docx" xlink:title="DOCX File, 385 KB"/></supplementary-material></app-group></back></article>