<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="reviewer-report"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIRx Med</journal-id><journal-id journal-id-type="publisher-id">xmed</journal-id><journal-id journal-id-type="index">34</journal-id><journal-title>JMIRx Med</journal-title><abbrev-journal-title>JMIRx Med</abbrev-journal-title><issn pub-type="epub">2563-6316</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e65727</article-id><article-id pub-id-type="doi">10.2196/65727</article-id><article-categories><subj-group subj-group-type="heading"><subject>Peer-Review Report</subject></subj-group></article-categories><title-group><article-title>Peer Review of &#x201C;All You Need Is Context: Clinician Evaluations of Various Iterations of a Large Language Model&#x2013;Based First Aid Decision Support Tool in Ghana (Preprint)&#x201D;</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Gao</surname><given-names>Yixuan</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Olatoye</surname><given-names>Toba</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mahmoud</surname><given-names>Randa Salah Gomaa</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Computer Science, Cornell University</institution>, <addr-line>616 Thurston 
Ave</addr-line><addr-line>Ithaca</addr-line><addr-line>NY</addr-line>, <country>United States</country></aff><aff id="aff2"><institution>University of Ilorin</institution>, <addr-line>Ilorin</addr-line>, <country>Nigeria</country></aff><aff id="aff3"><institution>Zagazig University</institution>, <addr-line>Zagazig</addr-line>, <country>Egypt</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Leung</surname><given-names>Tiffany</given-names></name></contrib></contrib-group><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>18</day><month>9</month><year>2024</year></pub-date><volume>5</volume><elocation-id>e65727</elocation-id><history><date date-type="received"><day>23</day><month>08</month><year>2024</year></date><date date-type="accepted"><day>23</day><month>08</month><year>2024</year></date></history><copyright-statement>&#x00A9; Yixuan Gao, Toba Olatoye, Randa Salah Gomaa Mahmoud. Originally published in JMIRx Med (<ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org">https://med.jmirx.org</ext-link>), 18.9.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIRx Med, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org/">https://med.jmirx.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://xmed.jmir.org/2024/1/e65727"/><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.1101/2024.04.03.24305276v4" xlink:title="Preprint (medRxiv)" xlink:type="simple">https://www.medrxiv.org/content/10.1101/2024.04.03.24305276v4</related-article><kwd-group><kwd>medical informatics</kwd><kwd>clinical decision support tools</kwd><kwd>AI in health care</kwd><kwd>large language models</kwd><kwd>emergency medical services</kwd><kwd>clinical evaluation</kwd><kwd>medical emergencies</kwd><kwd>resource-constrained settings</kwd></kwd-group></article-meta></front><body><p><italic>This is a peer-review report submitted for the preprint &#x201C;All You Need Is Context: Clinician Evaluations of Various Iterations of a Large Language Model&#x2013;Based First Aid Decision Support Tool in Ghana.&#x201D;</italic></p><p>This review is the result of a virtual collaborative live review discussion organized and hosted by PREreview and JMIR Publications on June 20, 2024. The discussion was joined by 15 people: 2 facilitators, 2 members of the JMIR Publications team, 2 authors, and 9 live review participants, including 4 who agreed to be named, Aswathi Surendran, Khushboo Thaker, Arya Rahgozar, and Emmanuel Adamolekun, but did not contribute to the final composition of this review. The authors of this review have dedicated additional asynchronous time over the course of 2 weeks to help compose this final report using the notes from the live review. 
We thank all participants who contributed to the discussion and made it possible for us to provide feedback on this preprint.</p><sec id="s2"><title>Summary</title><p>This study [<xref ref-type="bibr" rid="ref1">1</xref>] investigates the performance and application of large language models (LLMs) as support tools for making clinical decisions during medical emergencies in the resource-constrained settings of low- and middle-income countries (LMIC) such as Ghana. The research&#x2019;s aim is to provide a premise for future research and development of LLM-based clinical decision support tools by assessing the suitability and effectiveness of five selected generalized LLMs using context-specific prompts. A total of 13 medical experts with an average of 3 years of experience working in an environment of limited resources evaluated the outputs of these models quantitatively by using mean ranking scores and qualitatively using thematic analysis.</p><p>The authors used off-the-shelf pretrained LLMs (GPT-4 Turbo, Gemini 1.5 Pro, and Claude Sonnet) with prompt engineering and retrieval augmented generation (RAG) techniques to develop five iterations of a decision support tool. A total of 50 responses were generated and evaluated. Machine evaluations were also performed and compared with the clinicians&#x2019; evaluations, using conventional machine learning metrics like bilingual evaluation understudy and Recall-Oriented Understudy for Gisting Evaluation.</p><p>Their findings showed that Gemini 1.5 Pro with prompt engineering outperformed the other LLMs used in their research, while the adjustments of other LLMs using suitable parameters improved their overall performance. This may imply that LLM-based first aid assistants could provide useful instructions for the management and treatment of medical conditions, especially in resource-constrained settings. 
The practitioners were generally satisfied with the diagnoses and instructions from these LLMs, demonstrating their potential and importance in managing medical emergencies. Future research should involve larger datasets, additional metrics, and more detailed evaluations to refine and enhance the use of LLMs in real-world medical emergencies.</p><p>The discussion from participants of this live review is summarized below.</p></sec><sec id="s3"><title>List of Major Concerns and Feedback</title><sec id="s2-1"><title>Statistical Significance of Differences in Mean Ranking Scores</title><list list-type="bullet"><list-item><p>Concern: The paper does not assess if the difference in mean ranking scores with a change in RAG approach (result in Table 2) is statistically significant.</p></list-item><list-item><p>Feedback: Perform statistical tests such as <italic>t</italic> tests or the Kruskal-Wallis test by ranks to determine if the differences in mean ranking scores are statistically significant. This will add robustness to the findings.</p></list-item></list></sec><sec id="s2-2"><title>Incomplete Figures</title><list list-type="bullet"><list-item><p>Concern: The Figure 2 image is incomplete, with the right side cut off, and the Figure 1 legend is incomplete. In Figure 3, the data are not clear enough to assess the correlation.</p></list-item><list-item><p>Feedback: Revise the figures to ensure they are complete and clearly labeled. This will improve the clarity and comprehensibility of the visual data.</p></list-item></list></sec><sec id="s2-3"><title>Availability of Google Form Reference</title><list list-type="bullet"><list-item><p>Concern: The Google form (reference 15) is not available.</p></list-item><list-item><p>Feedback: Ensure the Google form is accessible in the supplementary files. 
This is crucial for transparency and reproducibility.</p></list-item></list></sec></sec><sec id="s4"><title>List of Minor Concerns and Feedback</title><list list-type="bullet"><list-item><p>It would be helpful for the reader to see the aim of the work, the main results, and the conclusion mentioned in the abstract.</p></list-item><list-item><p>Participants were a bit confused about reference 1 in the Authors section and wondered if that was the most appropriate place to cite the project involved with this study.</p></list-item><list-item><p>It is unclear if Claude 3.5 Sonnet or Claude 3 Opus was used. Please clarify.</p></list-item><list-item><p>It is unclear what is being referred to with &#x201C;Low-and Low-Middle-Income countries (LMICs).&#x201D; Is it low-income countries or &#x201C;Lower Middle Income Countries (LMICs),&#x201D; forms more commonly used as defined by the World Bank [<xref ref-type="bibr" rid="ref2">2</xref>]?</p></list-item><list-item><p>In section E of the Methodology, it would be helpful to mention the total number of clinicians involved in the study. In section G, the text says &#x201C;The first group of 30 responses were evaluated by all 13 physicians. The second group of 20 responses was evaluated by 8 of the physicians.&#x201D; It would be helpful to know why and how these 8 were selected out of the total 13.</p></list-item><list-item><p>In section F of the Methodology section, the text presents a quote by one of the clinicians involved. It would be helpful to understand why this quote is presented in the text.</p></list-item><list-item><p>It would be helpful to have more information about the statistical tests used for the quantitative analysis and why.</p></list-item><list-item><p>In the Results section, there seems to be inconsistency in the labeling style of tables: Roman numerals in the text versus Arabic numerals in the figure label. 
It would be helpful to choose one style and be consistent throughout the manuscript so that the reader can better follow the results.</p></list-item><list-item><p>In the Results section, under the Qualitative Analysis section, the sentence &#x201C;Table 3 shows the 8 codes and their descriptions&#x201D;: Table 3 should be corrected to Table 4.</p></list-item><list-item><p>Figure 1 is a bit hard to read and understand. A bigger font and an explanation of what is plotted in the figure legend would significantly enhance comprehension.</p></list-item><list-item><p>In the second paragraph on page 6, the abbreviation EMS is first mentioned and it should be spelled out as emergency medical services (EMS).</p></list-item><list-item><p>It was expected that the RAG-based approach would have performed better than the approach solely based on LLM. It would be helpful if the authors discussed the results in the context of these expectations, highlighting potential limitations of the study.</p></list-item></list></sec></body><back><ack><p>PREreview and JMIR Publications thank the authors of the preprint for posting their work openly for feedback. 
We also thank all participants of the live review call for their time and for engaging in the lively discussion that generated this review.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">EMS</term><def><p>emergency medical services</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">LMIC</term><def><p>low- and middle-income countries</p></def></def-item><def-item><term id="abb4">RAG</term><def><p>retrieval augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Mensah</surname><given-names>PB</given-names> </name><name name-style="western"><surname>Quao</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Dagadu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mensah</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Darkwah</surname><given-names>JD</given-names> </name><collab>Project Genie Clinician Evaluation Group</collab></person-group><article-title>All you need is context: clinician evaluations of various iterations of a large language model&#x2013;based first aid decision support tool in Ghana</article-title><source>medRxiv</source><comment>Preprint posted online on Apr 25, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.04.03.24305276</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Fantom</surname><given-names>N</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Prince</surname><given-names>WC</given-names> </name></person-group><article-title>LICs, LMICs, UMICs, and HICs: classifying economies for analytical purposes</article-title><source>World Bank Blogs</source><year>2024</year><month>06</month><day>13</day><access-date>2024-08-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://blogs.worldbank.org/en/opendata/lics-lmics-umics-and-hics-classifying-economies-analytical-purposes">https://blogs.worldbank.org/en/opendata/lics-lmics-umics-and-hics-classifying-economies-analytical-purposes</ext-link></comment></nlm-citation></ref></ref-list></back></article>