<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Dermatol</journal-id><journal-id journal-id-type="publisher-id">derma</journal-id><journal-id journal-id-type="index">29</journal-id><journal-title>JMIR Dermatology</journal-title><abbrev-journal-title>JMIR Dermatol</abbrev-journal-title><issn pub-type="epub">2562-0959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v8i1e67551</article-id><article-id pub-id-type="doi">10.2196/67551</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>Evaluating the Diagnostic Accuracy of ChatGPT-4 Omni and ChatGPT-4 Turbo in Identifying Melanoma: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Sattler</surname><given-names>Samantha S.</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chetla</surname><given-names>Nitin</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Matthew</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hage</surname><given-names>Tamer Rajai</given-names></name><degrees>BS</degrees><xref ref-type="aff" 
rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Joseph</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Guo</surname><given-names>William Young</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hugh</surname><given-names>Jeremy</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Dermatology, Stony Brook University Hospital</institution><addr-line>Stony Brook</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff2"><institution>School of Medicine, University of Virginia</institution><addr-line>Charlottesville</addr-line><addr-line>VA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Renaissance School of Medicine, Stony Brook University</institution><addr-line>Stony Brook</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff4"><institution>Virginia Tech</institution><addr-line>Blacksburg</addr-line><addr-line>VA</addr-line><country>United States</country></aff><aff id="aff5"><institution>University of Passau</institution><addr-line>Passau</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Meisenheimer</surname><given-names>John</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chow</surname><given-names>James C L</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mosca</surname><given-names>Lucia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tamer Rajai 
Hage, BS, Virginia Tech, Blacksburg, VA, 24061, United States, 1 7038948362; <email>tamerwh@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>21</day><month>3</month><year>2025</year></pub-date><volume>8</volume><elocation-id>e67551</elocation-id><history><date date-type="received"><day>14</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>05</day><month>02</month><year>2025</year></date><date date-type="accepted"><day>17</day><month>02</month><year>2025</year></date></history><copyright-statement>&#x00A9; Samantha S. Sattler, Nitin Chetla, Matthew Chen, Tamer Rajai Hage, Joseph Chang, William Young Guo, Jeremy Hugh. Originally published in JMIR Dermatology (<ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>), 21.3.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Dermatology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://derma.jmir.org/2025/1/e67551"/><abstract><p>ChatGPT is increasingly used in healthcare. Fields like dermatology and radiology could benefit from ChatGPT&#x2019;s ability to help clinicians diagnose skin lesions. 
This study evaluates the accuracy of ChatGPT in diagnosing melanoma. Our analysis indicates that ChatGPT cannot be used reliably to diagnose melanoma, and further improvements are needed to reach this capability.</p></abstract><kwd-group><kwd>melanoma</kwd><kwd>skin cancer</kwd><kwd>chatGPT</kwd><kwd>chat-GPT</kwd><kwd>chatbot</kwd><kwd>dermatology</kwd><kwd>cancer</kwd><kwd>oncology</kwd><kwd>metastases</kwd><kwd>diagnostic</kwd><kwd>diagnosis</kwd><kwd>lesion</kwd><kwd>efficacy</kwd><kwd>machine learning</kwd><kwd>ML</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>algorithm</kwd><kwd>model</kwd><kwd>analytics</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Artificial intelligence (AI) is being increasingly integrated into health care [<xref ref-type="bibr" rid="ref1">1</xref>]. Multiple AI systems exist in medicine, including large language models (LLMs), neural networks, and predictive models. While studies have demonstrated AI&#x2019;s mixed precision and accuracy, the technology is poised to assist with data-driven diagnostics in dermatology [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>There has been a rapid popularization of the LLM ChatGPT for home-based medical inquiries [<xref ref-type="bibr" rid="ref3">3</xref>]. Minimal research exists on ChatGPT&#x2019;s accuracy in detecting melanoma. Given that patients are increasingly presenting internet-derived diagnostics during cancer consultations, it is imperative to understand the capabilities of commonly used AI engines, such as ChatGPT [<xref ref-type="bibr" rid="ref4">4</xref>]. In this study, we compare the capabilities of two models&#x2014;ChatGPT-4 Omni (GPT-4o) and ChatGPT-4 Turbo (GPT-4 Turbo)&#x2014;in identifying melanoma versus &#x201C;not melanoma&#x201D; skin lesions. 
These LLMs were chosen due to their accessibility and ability to answer image-based dermatology board-style questions correctly [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>OpenAI was used to query GPT-4o and GPT-4 Turbo for classifying dermatoscopic images of melanoma versus &#x201C;not melanoma&#x201D; (ie, melanocytic nevi, basal cell carcinoma, actinic keratoses, dermatofibromas, and vascular lesions) selected from the HAM10K database, which contains &#x003E;10,000 dermatoscopic images collected over 20 years from multiple populations, and verified by histopathology or confocal microscopy [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Five-hundred melanoma and &#x201C;not melanoma&#x201D; diagnoses were randomly selected with no image modifications. A standardized prompt was used: &#x201C;This is an image of the step 1 examination. The multiple-choice question is as follows: Based on the image, does the patient have (A) melanoma (B) no melanoma? Only output the answer as A or B.&#x201D; Incomplete responses were categorized as &#x201C;not a number&#x201D; and excluded.</p><p>To assess the effect of binary versus nonbinary prompting, an additional 1000 randomly selected &#x201C;not melanoma&#x201D; dermatoscopic images were classified by GPT-4o, given its higher sensitivity compared to GPT-4 Turbo. Manual classification was applied for &#x201C;not a number&#x201D; results when the response leaned towards &#x201C;melanoma&#x201D; or &#x201C;not melanoma&#x201D; but did not explicitly state &#x201C;A&#x201D; or &#x201C;B.&#x201D;</p></sec><sec id="s3" sec-type="results"><title>Results</title><p>The diagnostic accuracies of GPT-4 Turbo and GPT-4o were 0.546 (95% CI 0.515&#x2010;0.577) and 0.577 (95% CI 0.547&#x2010;0.608), respectively. There was no significant difference in accuracy between the two models (<italic>P</italic>=.10). 
GPT-4 Turbo demonstrated a sensitivity of 76.3%, specificity of 32.9%, and false-positive rate of 67.1% (<xref ref-type="table" rid="table1">Table 1</xref>). GPT-4o yielded a higher sensitivity of 96.8% (<italic>P</italic>&#x003C;.001), lower specificity of 18.4% (<italic>P</italic>=.09), and higher false-positive rate of 81.6% (<italic>P</italic>&#x003C;.001).</p><p>GPT-4o&#x2019;s additional analysis of &#x201C;not melanoma&#x201D; images using nonbinary prompting yielded an accuracy of 6.56% (95% CI 4.94%&#x2010;8.18%), correctly classifying 59 of 899 images (<xref ref-type="table" rid="table2">Table 2</xref>). Binary prompting increased GPT-4o accuracy to 25.25% (95% CI 22.55%&#x2010;27.95%), with 252 of 998 images correctly identified as &#x201C;not melanoma.&#x201D; The confusion matrices associated with the statistical measures of GPT-4o and GPT-4 Turbo are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>GPT-4 Omni and GPT-4 Turbo demonstrate low accuracy and low specificity for melanoma diagnosis.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Statistical measure</td><td align="left" valign="bottom">ChatGPT-4 Turbo</td><td align="left" valign="bottom">ChatGPT-4 Omni</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy (95% CI)</td><td align="left" valign="top">0.546 (0.515&#x2010;0.577)</td><td align="left" valign="top">0.577 (0.547&#x2010;0.608)</td></tr><tr><td align="left" valign="top">Precision</td><td align="left" valign="top">0.532</td><td align="left" valign="top">0.544</td></tr><tr><td align="left" valign="top">Specificity, % (95% CI)</td><td align="left" valign="top">32.9 (28.8&#x2010;37.0)</td><td align="left" valign="top">18.4 (15.0&#x2010;21.8)</td></tr><tr><td align="left" valign="top">Sensitivity, % (95% CI)</td><td align="left" valign="top">76.3 
(72.6&#x2010;80.1)</td><td align="left" valign="top">96.8 (95.2&#x2010;98.3)</td></tr><tr><td align="left" valign="top">F1-score</td><td align="left" valign="top">0.627</td><td align="left" valign="top">0.697</td></tr><tr><td align="left" valign="top">False-positive rate (%)</td><td align="left" valign="top">67.1</td><td align="left" valign="top">81.6</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Accuracy of ChatGPT-4o in diagnosing melanoma and &#x201C;not melanoma&#x201D; with binary versus nonbinary prompting.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Statistical measure</td><td align="left" valign="bottom">Nonbinary prompting (n=899)</td><td align="left" valign="bottom">Binary prompting (n=998)</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy, n (%)</td><td align="char" char="." valign="top">59 (6.56)</td><td align="char" char="." valign="top">252 (25.25)</td></tr><tr><td align="left" valign="top">95% CI (%)</td><td align="char" char="." valign="top">4.94&#x2010;8.18</td><td align="char" char="." valign="top">22.55&#x2010;27.95</td></tr><tr><td align="left" valign="top">False-positive rate (%)</td><td align="char" char="." valign="top">81.6</td><td align="char" char="." valign="top">67.1</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Currently, GPT engines demonstrate low accuracy for diagnosing melanoma. Higher diagnostic accuracies have been achieved using neural networks such as Moleanalyzer pro (87.7%) and ChatGPT Vision (85%); however, these studies included much smaller sample sizes of 100 and 60 images, respectively [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
Our findings represent a higher-powered analysis of ChatGPT performance.</p><p>GPT-4o&#x2019;s improved accuracy with binary versus nonbinary prompting aligns with prior AI research demonstrating that these search engines struggle without explicit direction [<xref ref-type="bibr" rid="ref8">8</xref>]. When more intricate prompts are provided, results improve [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. However, such a methodology is not generalizable to the average user. Patients using these engines to self-diagnose suspicious lesions at home are more likely to use nonbinary prompts without detailed instructions for the AI engine. Thus, our nonbinary prompting results reflect that ChatGPT would provide inaccurate outputs when used by the average patient.</p><p>The high false-positive rates of GPT-4o and GPT-4 Turbo in evaluating &#x201C;not melanoma&#x201D; suggest a conservative bias. This raises ethical concerns, as undue patient harm may result from AI&#x2019;s overdiagnosis of &#x201C;melanoma.&#x201D; Patients receiving incorrect &#x201C;melanoma&#x201D; diagnoses from ChatGPT prior to their dermatology appointments may develop mistrust if the physician accurately contradicts AI diagnoses. These patients may feel unheard if they do not receive biopsies for their &#x201C;suspicious&#x201D; moles. Increased in-office counseling may be warranted to disentangle the biases AI imparts to patients.</p><p>Limitations included using a single dataset and dermatoscopic images without broader clinical information. The models were not specifically trained before querying. ChatGPT is a generative AI that may be less suitable than specialized AI systems in dermatoscopic image diagnoses [<xref ref-type="bibr" rid="ref2">2</xref>]. Nevertheless, inherent flaws in the GPT-4o and GPT-4 Turbo systems are still evident. 
Therefore, patients should avoid ChatGPT diagnoses before evaluation of their suspected pigmented lesions by trained dermatologists.</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">GPT-4 Turbo</term><def><p>ChatGPT-4 Turbo</p></def></def-item><def-item><term id="abb3">GPT-4o</term><def><p>ChatGPT-4 Omni</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esteva</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kuprel</surname><given-names>B</given-names> </name><name name-style="western"><surname>Novoa</surname><given-names>RA</given-names> </name><etal/></person-group><article-title>Dermatologist-level classification of skin cancer with deep neural networks</article-title><source>Nature</source><year>2017</year><month>02</month><day>2</day><volume>542</volume><issue>7639</issue><fpage>115</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1038/nature21056</pub-id><pub-id pub-id-type="medline">28117445</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tejeda</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Smyth</surname><given-names>P</given-names> </name><name name-style="western"><surname>Steyvers</surname><given-names>M</given-names> </name></person-group><article-title>AI-assisted decision-making: a cognitive modeling 
approach to infer latent reliance strategies</article-title><source>Comput Brain Behav</source><year>2022</year><month>12</month><volume>5</volume><issue>4</issue><fpage>491</fpage><lpage>508</lpage><pub-id pub-id-type="doi">10.1007/s42113-022-00157-y</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>V</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name></person-group><article-title>Generative pre-trained transformer-empowered healthcare conversations: current trends, challenges, and future directions in large language model-enabled medical chatbots</article-title><source>BioMedInformatics</source><year>2024</year><volume>4</volume><issue>1</issue><fpage>837</fpage><lpage>852</lpage><pub-id pub-id-type="doi">10.3390/biomedinformatics4010047</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanders</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name></person-group><article-title>Chatbot for health care and oncology applications using artificial intelligence and machine learning: systematic review</article-title><source>JMIR Cancer</source><year>2021</year><month>11</month><day>29</day><volume>7</volume><issue>4</issue><fpage>e27850</fpage><pub-id pub-id-type="doi">10.2196/27850</pub-id><pub-id pub-id-type="medline">34847056</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hanna</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hatch</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hanna</surname><given-names>K</given-names> </name></person-group><article-title>Computer vision meets large language models: performance of ChatGPT 4.0 on dermatology boards-style practice questions</article-title><source>SKIN J Cutan Med</source><year>2024</year><volume>8</volume><issue>5</issue><fpage>1815</fpage><lpage>1821</lpage><pub-id pub-id-type="doi">10.25251/skin.8.5.5</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Alexander Scarlat</collab></person-group><article-title>Melanoma: augmented dermoscopic pigmented skin lesions from HAM10k</article-title><source>Kaggle</source><access-date>2024-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/drscarlat/melanoma">https://www.kaggle.com/datasets/drscarlat/melanoma</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Winkler</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Blum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kommoss</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Assessment of diagnostic performance of dermatologists cooperating with a convolutional neural network in a prospective clinical study: Human With Machine</article-title><source>JAMA 
Dermatol</source><year>2023</year><month>06</month><day>1</day><volume>159</volume><issue>6</issue><fpage>621</fpage><lpage>627</lpage><pub-id pub-id-type="doi">10.1001/jamadermatol.2023.0905</pub-id><pub-id pub-id-type="medline">37133847</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cirone</surname><given-names>K</given-names> </name><name name-style="western"><surname>Akrout</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abid</surname><given-names>L</given-names> </name><name name-style="western"><surname>Oakley</surname><given-names>A</given-names> </name></person-group><article-title>Assessing the utility of multimodal large language models (GPT-4 Vision and Large Language and Vision Assistant) in identifying melanoma across different skin tones</article-title><source>JMIR Dermatol</source><year>2024</year><month>03</month><day>13</day><volume>7</volume><fpage>e55508</fpage><pub-id pub-id-type="doi">10.2196/55508</pub-id><pub-id pub-id-type="medline">38477960</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Confusion matrix of ChatGPT-4 Omni performance (top) and confusion matrix of ChatGPT-4 Turbo performance (bottom).</p><media xlink:href="derma_v8i1e67551_app1.png" xlink:title="PNG File, 56 KB"/></supplementary-material></app-group></back></article>