<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Dermatol</journal-id><journal-id journal-id-type="publisher-id">derma</journal-id><journal-id journal-id-type="index">29</journal-id><journal-title>JMIR Dermatology</journal-title><abbrev-journal-title>JMIR Dermatol</abbrev-journal-title><issn pub-type="epub">2562-0959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v8i1e60827</article-id><article-id pub-id-type="doi">10.2196/60827</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>The Comparative Sufficiency of ChatGPT, Google Bard, and Bing AI in Answering Diagnosis, Treatment, and Prognosis Questions About Common Dermatological Diagnoses</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Chau</surname><given-names>Courtney A</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Feng</surname><given-names>Hao</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Cobos</surname><given-names>Gabriela</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Park</surname><given-names>Joyce</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Icahn School of Medicine at Mount Sinai</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Dermatology, University of Connecticut Health Center</institution><addr-line>Farmington</addr-line><addr-line>CT</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Dermatology, Tufts Medical Center</institution><addr-line>260 Tremont St, Fl 13</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff4"><institution>Skin Refinery PLLC</institution><addr-line>Spokane</addr-line><addr-line>WA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Dellavalle</surname><given-names>Robert</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bai</surname><given-names>Enze</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mao</surname><given-names>Siqi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Cho</surname><given-names>Yung-Tsu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Gabriela Cobos, MD, Department of Dermatology, Tufts Medical Center, 260 Tremont St, Fl 13, Boston, MA, 02116, United States, 1 617-636-0156; <email>gabriela.cobos@tuftsmedicine.org</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>7</day><month>1</month><year>2025</year></pub-date><volume>8</volume><elocation-id>e60827</elocation-id><history><date 
date-type="received"><day>22</day><month>05</month><year>2024</year></date><date date-type="rev-recd"><day>16</day><month>10</month><year>2024</year></date><date date-type="accepted"><day>17</day><month>10</month><year>2024</year></date></history><copyright-statement>&#x00A9; Courtney Andrea Chau, Hao Feng, Gabriela Cobos, Joyce Park. Originally published in JMIR Dermatology (<ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>), 7.1.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Dermatology, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://derma.jmir.org/2025/1/e60827"/><abstract><p>Our team explored the utility of unpaid versions of 3 artificial intelligence chatbots in offering patient-facing responses to questions about 5 common dermatological diagnoses, and highlighted the strengths and limitations of different artificial intelligence chatbots, while demonstrating how chatbots presented the most potential in tandem with dermatologists&#x2019; diagnosis.</p></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>ChatGPT</kwd><kwd>atopic dermatitis</kwd><kwd>acne vulgaris</kwd><kwd>cyst</kwd><kwd>actinic keratosis</kwd><kwd>rosacea</kwd><kwd>diagnosis</kwd><kwd>treatment</kwd><kwd>prognosis</kwd><kwd>dermatological</kwd><kwd>patient</kwd><kwd>chatbot</kwd><kwd>dermatologist</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Artificial intelligence (AI) chatbots, such as ChatGPT, offer platforms for patients to ask medical questions, particularly with limited access to care [<xref ref-type="bibr" rid="ref1">1</xref>]. Although ChatGPT utility in dermatology has been assessed, few studies have compared the performance between chatbots [<xref ref-type="bibr" rid="ref2">2</xref>]. 
This study compared the clinical utility of the unpaid versions of ChatGPT 3.5, Google Bard, and Bing AI in generating patient-facing responses to questions about 5 common dermatological diagnoses (atopic dermatitis, acne vulgaris, actinic keratosis, cyst, and rosacea) [<xref ref-type="bibr" rid="ref3">3</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>For each condition, 2 diagnosis, 2 treatment, and 1 prognosis questions were devised. Diagnosis questions requested a diagnosis and presented the patient history including age, sex, symptoms (duration/location), treatments and outcomes, and medical history. Nineteen questions were modeled from questions on Reddit forums (&#x201C;r/AskDocs&#x201D; and &#x201C;r/dermatology&#x201D;). For topics with insufficient Reddit questions, the coauthors devised prompts reflecting common questions in their experience (6 questions).</p><p>Questions were inputted into each chatbot; the prompts used are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Three board-certified dermatologists scored the responses on appropriateness for a patient-facing platform (Yes/No), sufficiency for clinical practice (Yes/No: not specific, not concise, or inaccurate information), accuracy from 1 (completely inaccurate) to 6 (completely accurate), and overall from 1 (worst possible answer) to 10 (best possible answer) [<xref ref-type="bibr" rid="ref4">4</xref>]. The Wilcoxon rank-sum test was used for pairwise comparisons. 
<italic>P</italic>-values were adjusted using the Bonferroni correction.</p></sec><sec id="s3" sec-type="results"><title>Results</title><p>One response was omitted because Google Bard declined answering the second atopic dermatitis diagnosis question (&#x201C;I am a 19-year old&#x2026;&#x201D;), responding with, &#x201C;I&#x2019;m just a language model, so I can&#x2019;t help you with that.&#x201D; ChatGPT responses had significantly lower Flesch reading ease scores than Google Bard (<italic>P</italic>&#x003C;.001) and Bing AI (<italic>P</italic>&#x003C;.001), indicating lower comprehensibility (<xref ref-type="table" rid="table1">Table 1</xref>). ChatGPT responses received significantly higher accuracy (<italic>P</italic>=.01, <xref ref-type="fig" rid="figure1">Figure 1</xref>) and overall (<italic>P</italic>=.003) ratings than Bing AI. Considering patient-facing platform appropriateness and clinical practice sufficiency, ChatGPT received the most appropriate (95%) and sufficient (55%) ratings; Bing AI received the fewest (87% and 47%, respectively). In total, 45%, 49%, and 53% of ChatGPT, Google Bard, and Bing AI responses, respectively, had inaccurate information or were not specific. For diagnosis prompts, 9 of 10 of ChatGPT and Bing AI and 7 of 10 of Google Bard responses included the intended diagnosis. Of the 25 responses from each chatbot, 25 of Bing AI&#x2019;s, 24 of ChatGPT&#x2019;s, and 19 of Google Bard&#x2019;s responses emphasized the importance of consulting healthcare professionals. 
No fabrication or hallucination was observed for any chatbot responses.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Descriptive statistics of scores between chatbots.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">ChatGPT 3.5 (n=75)</td><td align="left" valign="top">Google Bard (n=72)</td><td align="left" valign="top">Bing AI (n=75)</td></tr></thead><tbody><tr><td align="left" valign="top">Mean Flesch reading ease score (SD)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">33.90 (8.1)</td><td align="left" valign="top">49.72 (15.4)</td><td align="left" valign="top">46.53 (9.7)</td></tr><tr><td align="left" valign="top">Mean accuracy (SD)</td><td align="left" valign="top">5.29 (0.97)</td><td align="left" valign="top">5.00 (0.98)</td><td align="left" valign="top">4.87 (1.1)</td></tr><tr><td align="left" valign="top">Mean overall rating (SD)</td><td align="left" valign="top">8.37 (1.8)</td><td align="left" valign="top">7.94 (1.9)</td><td align="left" valign="top">7.41 (2.1)</td></tr><tr><td align="left" valign="top">Number of responses appropriate for a patient-facing platform (%)</td><td align="left" valign="top">71 (95)</td><td align="left" valign="top">65 (90)</td><td align="left" valign="top">65 (87)</td></tr><tr><td align="left" valign="top" colspan="4">Sufficiency for clinical practice</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes (%)</td><td align="left" valign="top">41 (55)</td><td align="left" valign="top">35 (49)</td><td align="left" valign="top">35 (47)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No: not specific enough (%)</td><td align="left" valign="top">14 (19)</td><td align="left" valign="top">15 (21)</td><td align="left" 
valign="top">23 (31)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No: inaccurate information (%)</td><td align="left" valign="top">20 (27)</td><td align="left" valign="top">20 (28)</td><td align="left" valign="top">17 (23)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No: not concise (%)</td><td align="left" valign="top">0</td><td align="left" valign="top">2 (3)</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Out of n=25 for ChatGPT and Bing AI and n=24 for Google Bard because only 1 Flesch reading ease score was calculated for each response. The other measures in the table are based on evaluation of each chatbot response by 3 board-certified dermatologists.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Distribution of the accuracy ratings for each chatbot. The accuracy scores from the three board-certified dermatologists ranged from 1 (completely inaccurate) to 6 (completely accurate).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v8i1e60827_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>ChatGPT outputs were most accurate and appropriate for patient questions. However, ChatGPT responses had college-level readability, limiting public utility [<xref ref-type="bibr" rid="ref5">5</xref>]. Responses were deemed sufficient for clinical practice if the chatbot concisely provided completely correct information that specifically answered the patient&#x2019;s question without missing critical components. Only approximately half the responses were sufficient for clinical practice, primarily due to inaccuracies and lack of specificity. 
ChatGPT and Bing AI performed the best at diagnosis and emphasized the importance of seeking input from a healthcare professional. Google Bard did not perform well in these domains, indicating that it is less suitable for suggesting diagnoses. Despite the better diagnostic performance of ChatGPT and Bing AI, an unranked list of conditions with differing treatments is not actionable for patients. Chatbots present more potential in offering advice once a diagnosis has been established. This study is limited by exploring only 5 questions for each of the 5 conditions. Exploring a broader range of conditions with a larger set of questions would more robustly capture chatbots&#x2019; performance. However, this study lays the groundwork for future research to compare chatbots using more expansive domains.</p><p>ChatGPT 3.5 displays more promise than Google Bard and Bing AI in evaluating, diagnosing, and suggesting a treatment plan for dermatologic conditions, consistent with previous findings, in which the chatbots&#x2019; responses to questions about melanoma were evaluated [<xref ref-type="bibr" rid="ref2">2</xref>]. However, this study revealed several important improvements needed for all 3 chatbots: enhancing readability, removing inaccuracies, and improving information specificity. Dermatologists may be able to reference these AI in practice, to limited extents, by suggesting patients use AI as a reference only to obtain information about the condition after being diagnosed. This strategy is similar to paper handouts, where AI chatbots provide background knowledge that patients can later follow-up on with their dermatologist. In conclusion, while chatbot utility is most promising in tandem with a dermatologist&#x2019;s diagnosis and contributes to information dissemination, chatbots should not function as a first-line independent entity. 
As access to AI grows, dermatologists must be aware of the quality of information patients may receive from AI and how it may differ from a dermatologist&#x2019;s advice.</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>MN</given-names> </name><name name-style="western"><surname>Burruss</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Wilson</surname><given-names>CL</given-names> </name></person-group><article-title>ChatGPT: a supplemental tool for efficiency and improved communication in rural dermatology</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e43812</fpage><pub-id pub-id-type="doi">10.7759/cureus.43812</pub-id><pub-id pub-id-type="medline">37731429</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>B</given-names> </name><name name-style="western"><surname>Seth</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Comparison of large language models in management advice for melanoma: Google&#x2019;s AI BARD, BingAI and ChatGPT</article-title><source>Skin Health Dis</source><year>2024</year><month>02</month><volume>4</volume><issue>1</issue><fpage>e313</fpage><pub-id pub-id-type="doi">10.1002/ski2.313</pub-id><pub-id pub-id-type="medline">38312244</pub-id></nlm-citation></ref><ref 
id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Taheri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feldman</surname><given-names>SR</given-names> </name></person-group><article-title>Top dermatologic diagnoses by age</article-title><source>Dermatol Online J</source><year>2014</year><month>04</month><day>16</day><volume>20</volume><issue>4</issue><fpage>22368</fpage><pub-id pub-id-type="medline">24746305</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Young</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Poplausky</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The utility of ChatGPT in generating patient-facing and clinical responses for melanoma</article-title><source>J Am Acad Dermatol</source><year>2023</year><month>09</month><volume>89</volume><issue>3</issue><fpage>602</fpage><lpage>604</lpage><pub-id pub-id-type="doi">10.1016/j.jaad.2023.05.024</pub-id><pub-id pub-id-type="medline">37207953</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hutchinson</surname><given-names>N</given-names> </name><name name-style="western"><surname>Baird</surname><given-names>GL</given-names> </name><name name-style="western"><surname>Garg</surname><given-names>M</given-names> </name></person-group><article-title>Examining the reading level of internet medical information for common internal medicine diagnoses</article-title><source>Am J 
Med</source><year>2016</year><month>06</month><volume>129</volume><issue>6</issue><fpage>637</fpage><lpage>639</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2016.01.008</pub-id><pub-id pub-id-type="medline">26829438</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts inputted into ChatGPT 3.5, Google Bard, and Bing AI.</p><media xlink:href="derma_v8i1e60827_app1.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material></app-group></back></article>