<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="author-comment"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIRx Med</journal-id><journal-id journal-id-type="publisher-id">xmed</journal-id><journal-id journal-id-type="index">34</journal-id><journal-title>JMIRx Med</journal-title><abbrev-journal-title>JMIRx Med</abbrev-journal-title><issn pub-type="epub">2563-6316</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e84173</article-id><article-id pub-id-type="doi">10.2196/84173</article-id><article-categories><subj-group subj-group-type="heading"><subject>Authors&#x2019; Response To Peer Reviews</subject></subj-group></article-categories><title-group><article-title>Authors&#x2019; Response to Peer Reviews of &#x201C;Assessing the Limitations of Large Language Models in Clinical Practice Guideline&#x2013;Concordant Treatment Decision-Making on Real-World Data: Retrospective Study&#x201D;</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Roeschl</surname><given-names>Tobias</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Hoffmann</surname><given-names>Marie</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref 
ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hashemi</surname><given-names>Djawid</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rarreck</surname><given-names>Felix</given-names></name><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hinrichs</surname><given-names>Nils</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Trippel</surname><given-names>Tobias Daniel</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gr&#x00F6;schel</surname><given-names>Matthias I</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Unbehaun</surname><given-names>Axel</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Klein</surname><given-names>Christoph</given-names></name><degrees>MD, PD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Kempfert</surname><given-names>J&#x00F6;rg</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dreger</surname><given-names>Henryk</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Brien</surname><given-names>Benjamin</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hindricks</surname><given-names>Gerhard</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Balzer</surname><given-names>Felix</given-names></name><degrees>MD, PhD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Falk</surname><given-names>Volkmar</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff10">10</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meyer</surname><given-names>Alexander</given-names></name><degrees>MD, Prof Dr Med</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff11">11</xref></contrib></contrib-group><aff id="aff1"><institution>Department 
of Cardiology, Angiology and Intensive Care Medicine, Deutsches Herzzentrum der Charit&#x00E9;</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, corporate member of Freie Universit&#x00E4;t Berlin and Humboldt-Universit&#x00E4;t zu Berlin</institution><addr-line>Charit&#x00E9;platz 1</addr-line><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff3"><institution>Berlin Institute of Health at Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, BIH Biomedical Innovation Academy, BIH Charit&#x00E9; Digital Clinician Scientist Program</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff4"><institution>DZHK (German Centre for Cardiovascular Research), partner site Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff5"><institution>Department of Cardiothoracic and Vascular Surgery, Deutsches Herzzentrum der Charit&#x00E9; (DHZC)</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff6"><institution>Department of Infectious Diseases and Respiratory Medicine, Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff7"><institution>Department of Cardiac Anesthesiology and Intensive Care Medicine, Deutsches Herzzentrum der Charit&#x00E9; (DHZC)</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff8"><institution>Department of Perioperative Medicine, St Bartholomew&#x2019;s Hospital and Barts Heart Centre</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><aff id="aff9"><institution>Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, Institute of Medical Informatics</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff10"><institution>Department of 
Health Sciences and Technology, Translational Cardiovascular Technologies, Institute of Translational Medicine, Swiss Federal Institute of Technology</institution><addr-line>Z&#x00FC;rich</addr-line><country>Switzerland</country></aff><aff id="aff11"><institution>Berlin Institute for the Foundations of Learning and Data &#x2013; TU Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Grover</surname><given-names>Abhinav</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Marie Hoffmann, PhD, Charit&#x00E9; &#x2013; Universit&#x00E4;tsmedizin Berlin, corporate member of Freie Universit&#x00E4;t Berlin and Humboldt-Universit&#x00E4;t zu Berlin, Charit&#x00E9;platz 1, Berlin, 10117, Germany, 49 17632864219; <email>marie.hoffmann3@dhzc-charite.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>3</day><month>11</month><year>2025</year></pub-date><volume>6</volume><elocation-id>e84173</elocation-id><history><date date-type="received"><day>15</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>15</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; Tobias Roeschl, Marie Hoffmann, Djawid Hashemi, Felix Rarreck, Nils Hinrichs, Tobias Daniel Trippel, Matthias I Gr&#x00F6;schel, Axel Unbehaun, Christoph Klein, J&#x00F6;rg Kempfert, Henryk Dreger, Benjamin O'Brien, Gerhard Hindricks, Felix Balzer, Volkmar Falk, Alexander Meyer. Originally published in JMIRx Med (<ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org">https://med.jmirx.org</ext-link>), 3.11.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIRx Med, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org/">https://med.jmirx.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://xmed.jmir.org/2025/1/e84173"/><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/74899" xlink:title="Preprint (JMIR Preprints)" xlink:type="simple">http://preprints.jmir.org/preprint/74899</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/84175" xlink:title="Peer-Review Report by Reenu Singh (Reviewer K)" xlink:type="simple">https://med.jmirx.org/2025/1/e84175</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/84174" xlink:title="Peer-Review Report by Andrej Novak (Reviewer BI)" xlink:type="simple">https://med.jmirx.org/2025/1/e84174</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/74899" xlink:title="Published Article" xlink:type="simple">https://med.jmirx.org/2025/1/e74899</related-article><kwd-group><kwd>large language model</kwd><kwd>foundation model</kwd><kwd>reasoning model</kwd><kwd>treatment decision-making</kwd><kwd>aortic stenosis</kwd><kwd>clinical practice guidelines</kwd><kwd>medical data 
processing</kwd></kwd-group></article-meta></front><body><p><italic>This is the authors&#x2019; response to peer-review reports for &#x201C;Assessing the Limitations of Large Language Models in Clinical Practice Guideline&#x2013;Concordant Treatment Decision-Making on Real-World Data: Retrospective Study.&#x201D;</italic></p><sec id="s2"><title>Round 1 Review</title><sec id="s1-1"><title>Reviewer K [<xref ref-type="bibr" rid="ref1">1</xref>]</title><p><italic>1. To improve the discussion on bias in large language models (LLMs) for clinical decision-making, the study [<xref ref-type="bibr" rid="ref2">2</xref>] should include the following aspects:</italic></p><p><italic>If LLMs are trained predominantly on Western medical literature or specific demographic groups, their recommendations may not generalize well to diverse patient populations. If the data used to fine-tune the model lack representation from certain ethnic, gender, or socioeconomic groups, the artificial intelligence may produce recommendations that are not universally applicable. Even with a diverse dataset, biases can arise due to model architecture, reinforcement learning strategies, or human-in-the-loop feedback mechanisms that shape model responses.</italic></p><p><bold>Response:</bold> Thank you for this thoughtful and important comment. We fully agree that the generalizability and fairness of LLMs in health care are significantly influenced by the composition of their training and fine-tuning data. As you rightly note, underrepresentation of certain ethnic, gender, or socioeconomic groups can lead to biased outputs and potentially widen existing health disparities. Indeed, we have also discovered, for example, bias toward transcatheter aortic valve implantation in our experiments, as indicated through the Frequency Bias Index in Figure 2 and Table S9. 
All LLMs were taken off-the-shelf without fine-tuning as the cohort size was limited by the inherently low incidence of eligible cases and the stringent requirements for high-quality, comprehensive patient data. Each case required detailed manual review and the generation of structured case summaries, which further constrained the pool of analyzable data. As a result, stratification and investigation of bias by additional features such as ethnic, gender, or socioeconomic features was not feasible. In the Limitations section, we have added that potential biases remain unaddressed.</p><p><italic>2. What datasets were used? If real patient data were used, specify its source (eg, electronic health records, clinical trial data, or synthetic datasets). Provide the total number of cases or records used for testing the large language models. If synthetic data were generated, describe the method used to create the data. Were diverse age groups, genders, and ethnic backgrounds represented? A lack of diversity in data can affect the generalizability of results.</italic></p><p><bold>Response:</bold> Thank you for addressing this very important point. As described in the Methods section, we have used real clinical reports in PDF format from our hospital information system and extracted the content into text files. Either these text files (experiments RAW and RAW+) or manually drafted summaries (SUM and SUM+) from these text files had been used as input to the LLMs. No trial or synthetic data were used.</p><p><italic>3. What datasets were used? If real patient data were used, specify its source (eg, electronic health records, clinical trial data, or synthetic datasets). Provide the total number of cases or records used for testing the large language models. If synthetic data were generated, describe the method used to create the data. Were diverse age groups, genders, and ethnic backgrounds represented? 
A lack of diversity in data can affect the generalizability of results.</italic></p><p><bold>Response:</bold> Thank you for your comment. This comment is identical to Comment #2, which we have addressed in detail above. To summarize: we used real clinical reports extracted from our hospital information system (electronic health records), and no synthetic or trial data were used. Additional details, including data source and sample characteristics, are provided in our response to Comment #2 and in the revised Methods section under &#x201C;Study Population&#x201D; and &#x201C;Data Collection and Preprocessing.&#x201D;</p><p><italic>4. The study&#x2019;s impact can be significantly enhanced by addressing the following challenges: Raw medical reports often include free-text narratives, physician notes, abbreviations, and inconsistencies, requiring advanced natural language processing techniques such as entity recognition, text normalization, and standardization. These reports may also contain irrelevant information, redundancies, or nonessential clinical details. Effective preprocessing is essential to filter out unnecessary content while preserving critical medical insights. A key consideration is how to optimize this preprocessing to mitigate these challenges efficiently.</italic></p><p><bold>Response:</bold> Thank you for this insightful comment. The central objective of our study was to assess model performance using the same type of raw clinical data that health care professionals routinely encounter, including free-text narratives and unstructured content. The rationale behind this approach was that, for real-world clinical implementation, it would be most beneficial if LLMs could generate guideline-concordant treatment recommendations directly from routine clinical documentation&#x2014;without relying on curated or heavily preprocessed inputs. 
This would help avoid the considerable time and resource demands associated with manual or automated preprocessing pipelines. To explore this, we compared model performance on raw clinical reports with performance on highly preprocessed, structured synopses, as used in previous studies where frontier models have shown strong results. We simulated this optimized input scenario through manually drafted summaries (SUM and SUM+), which represent a best-case input condition. Replicating such preprocessing through automated means would require extensive quality control mechanisms and may still fall short of the accuracy and relevance achieved through expert curation.</p></sec><sec id="s1-2"><title>Reviewer BI [<xref ref-type="bibr" rid="ref3">3</xref>]</title><p><italic>1. The format and provenance of the SUM (&#x201C;case summary&#x201D;) reports require clearer specification. Although the authors note these summaries were &#x201C;manually generated,&#x201D; it would be helpful to state whether they followed a standardized template, who exactly drafted them (eg, experienced cardiologists, research assistants), and which elements of the Heart Team protocol they distilled into each summary.</italic></p><p><bold>Response:</bold> Thank you for pointing this out. We agree that this aspect was not sufficiently described in the original manuscript. We have revised the Methods section under &#x201C;Experiments&#x201D; to clarify that the case summaries were manually created but adhered to a structured format: all patient characteristics documented in the heart team protocol were systematically addressed by either affirming, negating, or populating them with patient-specific values. An illustrative example is provided in Table S6.</p><p><italic>2. The authors report that the original medical documents were saved as PDFs and later converted to plain text. 
It would be helpful to clarify this process to avoid confusion, since LLMs accessed via chat interfaces or application programming interfaces often struggle with PDF inputs or text embedded in images, treating them differently from pure text. A brief discussion acknowledging this limitation&#x2014;and explaining how PDF parsing was handled or validated&#x2014;would help readers assess real-world applicability.</italic></p><p><bold>Response:</bold> We appreciate the reviewer&#x2019;s helpful comment. In every case, plain text&#x2014;not PDF files&#x2014;was provided as input. To clarify this point in the Methods section, we have added a description of the process: the text content of each PDF file was programmatically extracted using the Tesseract OCR software and concatenated into a single plain-text file, which was then used as input for the models for the RAW and RAW+ experiments.</p><p><italic>3. Raw inputs (PDFs and summaries) were provided in German (except for BioGPT, which required translation to English). A comment in the Discussion about how model performance can vary by input language&#x2014;perhaps citing studies that showed different results in Polish versus English&#x2014;would contextualize the findings for non-English clinical settings:</italic></p><list list-type="bullet"><list-item><p><italic>Roso&#x0142; M, G&#x0105;sior JS, &#x0141;aba J, Korzeniewski K, M&#x0142;y&#x0144;czak M. Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish Medical Final Examination.</italic> Sci Rep. <italic>2023;13(1):20512.</italic></p></list-item></list><p><bold>Response:</bold> We appreciate the reviewer&#x2019;s thoughtful suggestion. We agree that input texts in languages other than English may pose an additional challenge for LLMs, as they are primarily trained on English-language literature. We have added a comment and the suggested citation in the Limitations section. 
The study you cited suggests that more recent GPT models may be more language-agnostic than previous generations, though it remains unclear whether this holds true for other languages and frontier models.</p><p><italic>4. The Discussion section feels comparatively weak and could be strengthened by broader literature coverage. For instance, a brief discussion of input formats&#x2014;pure text versus multimodal inputs&#x2014;would be valuable, especially given the inclusion of GPT-4o, which handles images. Preliminary studies in this area include:</italic></p><list list-type="bullet"><list-item><p><italic>G&#x00FC;nay et al. Comparison of emergency medicine specialist, cardiologist, and ChatGPT in electrocardiography assessment. Am J Emerg Med. 2024 Jun;80:51-60.</italic></p></list-item><list-item><p><italic>Zeljkovic et al. Beyond text: the impact of clinical context on GPT-4&#x2019;s 12-lead electrocardiogram interpretation accuracy. Canadian J Cardiol. 2025 Jul;41(7):1406-1414.</italic></p></list-item></list><p><italic>These compare electrocardiogram interpretation with and without accompanying clinical context and demonstrate the importance of textual input alongside images.</italic></p><p><italic>It would also be helpful to reference work showing that, despite similar hallucination tendencies, LLMs perform strongly on standardized exams, for example:</italic></p><list list-type="bullet"><list-item><p><italic>Gilson et al. How does ChatGPT perform on the USMLE? Implications for medical education and knowledge assessment. JMIR Med Educ. 2023 Feb 8;9:e45312.</italic></p></list-item><list-item><p><italic>Novak et al. The pulse of artificial intelligence in cardiology: evaluating state-of-the-art LLMs for clinical cardiology. medRxiv. 
Preprint posted online on January 30, 2024.</italic></p></list-item></list><p><italic>These additions could situate the findings within a broader context of multimodal and high-stakes assessment.</italic></p><p><bold>Response:</bold> We thank the reviewer for this valuable suggestion. We agree that the Discussion section benefits from a broader contextualization, particularly with respect to input formats and the evolving capabilities of multimodal models. At the current time, the diagnostic quality of multimodal models remains rudimentary, especially for images other than X-rays. As you suggested, we have added a paragraph to the Limitations section, where we stated that including imaging data in addition to the textual data would have most likely not led to a substantial improvement in model performance in our task&#x2014;referring to the studies by G&#x00FC;nay et al and Zeljkovic et al that you kindly mentioned.</p><p>In addition, we gladly added the references (Gilson et al, Novak et al) that you mentioned to the &#x201C;Data Representation Affects LLM Performance&#x201D; section of the Discussion to further strengthen our point that LLMs generally perform well when provided with concise and information-dense data but struggle with noisy and unprocessed clinical data.</p><p><italic>5. As an exploratory aside, it would be interesting to evaluate how the newest reasoning-focused models (eg, &#x201C;o3&#x201D; or &#x201C;o4&#x201D;) perform on this task. Although this is likely beyond the current scope, including a sentence to that effect in the manuscript&#x2019;s Limitations section could guide future research.</italic></p><p><bold>Response:</bold> We agree that in the fast-paced environment of LLM development, it is plausible that the newest reasoning-focused models might perform substantially better in our task than the reasoning models we used. We addressed this in the Limitations section.</p><p><italic>6. 
For consistency and precision, when describing model access in the &#x201C;Large Language Models&#x201D; section (and elsewhere in the text), the manuscript should explicitly cite the exact supplementary tables or materials (eg, &#x201C;see Table S1 for model details and context sizes&#x201D;) rather than referring generically to &#x201C;the Supplementary.&#x201D;</italic></p><p><bold>Response:</bold> We agree that referring to specific supplementary tables and figures improves both clarity and precision. Accordingly, we have specified which supplementary tables and figures we are referring to throughout the manuscript.</p><p><italic>7. In the Statistical Methods subsection, rather than stating that nonnormally distributed data were compared using the Mann-Whitney U test &#x201C;for nonnormally distributed continuous variables,&#x201D; the phrasing could be tightened to &#x201C;for variables departing from normality&#x201D; or &#x201C;for variables not following a normal distribution&#x201D; to align with standard statistical terminology.</italic></p><p><bold>Response:</bold> We thank the reviewer for this constructive suggestion. We have revised the phrasing in the &#x201C;Statistical Analysis&#x201D; subsection of the Methods to align with standard statistical terminology. 
Specifically, we now refer to the use of the Mann&#x2013;Whitney <italic>U</italic> test for &#x201C;variables departing from normality,&#x201D; as recommended.</p><p>Changes made to the manuscript on our end:</p><list list-type="bullet"><list-item><p>We made minor adjustments to the affiliations on the title page to align with newly introduced in-house guidelines.</p></list-item><list-item><p>In Table 2 and Table S6, we replaced the previously reported age ranges (used in accordance with medRxiv&#x2019;s data protection policy) with the actual patient ages, now presented as integer values.</p></list-item><list-item><p>We replaced the term &#x201C;non-LLM models&#x201D; with &#x201C;deterministic models&#x201D; in the final paragraph before the Limitations section, as this terminology is more commonly used in recent literature and provides a more precise characterization.</p></list-item></list></sec></sec></body><back><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>R</given-names> </name></person-group><article-title>Peer review of &#x201C;Assessing the Limitations of Large Language Models in Clinical Practice Guideline&#x2013;Concordant Treatment Decision-Making on Real-World Data: Retrospective Study&#x201D;</article-title><source>JMIRx Med</source><year>2025</year><volume>6</volume><fpage>e84175</fpage><pub-id pub-id-type="doi">10.2196/84175</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roeschl</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hoffmann</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Hashemi</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Assessing the limitations of large language models in clinical practice guideline&#x2013;concordant treatment decision-making on real-world data: retrospective study</article-title><source>JMIRx Med</source><year>2025</year><volume>6</volume><fpage>e74899</fpage><pub-id pub-id-type="doi">10.2196/74899</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Novak</surname><given-names>A</given-names> </name></person-group><article-title>Peer review of &#x201C;Assessing the Limitations of Large Language Models in Clinical Practice Guideline&#x2013;Concordant Treatment Decision-Making on Real-World Data: Retrospective Study&#x201D;</article-title><source>JMIRx Med</source><year>2025</year><volume>6</volume><fpage>e84174</fpage><pub-id pub-id-type="doi">10.2196/84174</pub-id></nlm-citation></ref></ref-list></back></article>