<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Dermatol</journal-id><journal-id journal-id-type="publisher-id">derma</journal-id><journal-id journal-id-type="index">29</journal-id><journal-title>JMIR Dermatology</journal-title><abbrev-journal-title>JMIR Dermatol</abbrev-journal-title><issn pub-type="epub">2562-0959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v8i1e74085</article-id><article-id pub-id-type="doi">10.2196/74085</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>ChatGPT-4&#x2019;s Level of Dermatological Knowledge Based on Board Examination Review Questions and Bloom&#x2019;s Taxonomy</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Tai</surname><given-names>Hansen</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Kovarik</surname><given-names>Carrie</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>SUNY Upstate Medical University</institution><addr-line>Syracuse</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Dermatology, Perelman School of Medicine, University of Pennsylvania</institution><addr-line>3600 Spruce Street, 2 Maloney Building</addr-line><addr-line>Philadelphia</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Dellavalle</surname><given-names>Robert</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hitchcock</surname><given-names>Dakota</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Potla</surname><given-names>Ravi Teja</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Sunny</surname><given-names/></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Au</surname><given-names>Chi Lik</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to  Carrie Kovarik, MD, Department of Dermatology, Perelman School of Medicine, University of Pennsylvania, 3600 Spruce Street, 2 Maloney Building, Philadelphia, 19146, United States, 1 2156626597, 1 2153495615; <email>carrie.kovarik@pennmedicine.upenn.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>7</day><month>8</month><year>2025</year></pub-date><volume>8</volume><elocation-id>e74085</elocation-id><history><date date-type="received"><day>17</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>06</month><year>2025</year></date><date 
date-type="accepted"><day>15</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Hansen Tai, Carrie Kovarik. Originally published in JMIR Dermatology (<ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>), 7.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Dermatology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://derma.jmir.org/2025/1/e74085"/><abstract><p>Our study demonstrated the ability of ChatGPT-4 to answer 77.5% of all sampled text-based board review-type questions correctly. Questions requiring the recall of factual information were answered correctly most often, with slight decreases in correctness as higher-order thinking requirements increased. Improvements to ChatGPT-4&#x2019;s visual diagnostic capabilities will be required before it can be used reliably for visual interpretation and clinical decision-making.</p></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>dermatology</kwd><kwd>education</kwd><kwd>board exam</kwd><kwd>residency</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>ChatGPT, a multimodal language model capable of answering multiple-choice questions, incorporates visual inputs in its latest version, GPT-4. Lewandowski et al [<xref ref-type="bibr" rid="ref1">1</xref>] recently assessed ChatGPT-3.5 and ChatGPT-4&#x2019;s performance in dermatology examinations, finding that ChatGPT-4 significantly outperformed its predecessor, achieving over a 60% pass rate overall and &#x003E;84% accuracy on photo-based questions. Building on this, our study classified ChatGPT-4&#x2019;s correctly answered question types using Bloom&#x2019;s taxonomy for cognitive complexity [<xref ref-type="bibr" rid="ref2">2</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>We evaluated ChatGPT-4&#x2019;s capabilities on the Basic, Core, and Applied examination questions from Dermatology-In-Review, an online dermatology board review preparation course. The Basic examination is a required examination for first-year US dermatology residents and tests fundamental dermatology knowledge. The Core and Applied examinations are taken late in residency and after residency, respectively. These tests examine more advanced clinical knowledge and focus on higher-order thinking. In total, 167 Basic, 210 Core, and 166 Applied multiple-choice questions without photos were formatted and fed into ChatGPT-4 using a script written with Python&#x2019;s pandas library. ChatGPT-4&#x2019;s in-depth responses to each query were captured, reviewed, and independently confirmed and coded as correct or incorrect (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
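<p>To illustrate this type of pipeline, the following minimal Python sketch shows how formatted multiple-choice questions could be read with pandas and submitted to ChatGPT-4 through the OpenAI API. The file name, column names, and prompt wording are illustrative assumptions and do not reproduce the study&#x2019;s actual code.</p><preformat preformat-type="code">
# Minimal sketch (not the study's pipeline): read formatted board review questions
# from a spreadsheet and submit each one to ChatGPT-4, recording the full reply.
# The file name and the "question" and "choices" columns are hypothetical.
import pandas as pd
from openai import OpenAI  # OpenAI Python SDK v1.x; reads OPENAI_API_KEY from the environment

client = OpenAI()
questions = pd.read_csv("board_review_questions.csv")

responses = []
for _, row in questions.iterrows():
    prompt = (
        "Answer the following dermatology board review question and explain your reasoning.\n"
        f"Question: {row['question']}\n"
        f"Answer choices: {row['choices']}"
    )
    reply = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    responses.append(reply.choices[0].message.content)

questions["chatgpt_response"] = responses
questions.to_csv("chatgpt_responses.csv", index=False)  # reviewed and coded by hand afterward
</preformat>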
<table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>ChatGPT-4 cases correct by testing category.<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Correct</td><td align="left" valign="bottom">Incorrect</td><td align="left" valign="bottom">% Correct</td><td align="left" valign="bottom">Remember-type questions: correct/total (%), total/all questions (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Basic</td><td align="left" valign="top">139</td><td align="left" valign="top">28</td><td align="left" valign="top">83.2%</td><td align="left" valign="top">71/82 (86.6%), 82/167 (49.1%)</td></tr><tr><td align="left" valign="top">Core</td><td align="left" valign="top">158</td><td align="left" valign="top">52</td><td align="left" valign="top">75.2%</td><td align="left" valign="top">52/66 (78.8%), 66/210 (31.4%)</td></tr><tr><td align="left" valign="top">Applied</td><td align="left" valign="top">123</td><td align="left" valign="top">43</td><td align="left" valign="top">74.1%</td><td align="left" valign="top">35/46 (76.1%), 46/166 (27.7%)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup><italic>P</italic>=.0382, Pearson chi-squared test for the Basic versus Core+Applied examinations.</p></fn></table-wrap-foot></table-wrap><p>We categorized text-based questions according to Bloom&#x2019;s taxonomy using a Python function. One author (CK) and ChatGPT-4 each categorized every question into a specific Bloom&#x2019;s taxonomy category using published guidelines [<xref ref-type="bibr" rid="ref2">2</xref>]. In cases of discrepancy, ChatGPT-4&#x2019;s stated reasoning was reviewed to assist in reconciling the categorization. Bloom&#x2019;s categories included Remember (encompassing lower-level thinking such as knowledge and comprehension), Apply, Analyze, Evaluate, and Synthesize. All statistics were performed using R statistical software, including the Pearson chi-squared test (<xref ref-type="table" rid="table1">Table 1</xref>) and Fisher exact test (<xref ref-type="table" rid="table2">Table 2</xref>).</p><p>Photo-based questions were entered directly into ChatGPT-4, along with structured messages and answer choices, and responses were recorded. Fifty-three photo cases from all board categories were used.</p>
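<p>The photo-based questions in this study were entered directly into ChatGPT-4; a programmatic equivalent that attaches the image through the OpenAI chat completions API might look like the sketch below. The model name, file path, and question wording are assumptions for illustration only.</p><preformat preformat-type="code">
# Illustrative sketch: submit one photo-based question to a vision-capable GPT-4 model
# by attaching the image as a base64-encoded data URL (not the study's exact procedure).
import base64
from openai import OpenAI

client = OpenAI()

with open("case_photo.jpg", "rb") as f:  # hypothetical image file
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

question_text = (
    "What is the most likely diagnosis?\n"
    "A. choice 1  B. choice 2  C. choice 3  D. choice 4"  # placeholder answer choices
)

reply = client.chat.completions.create(
    model="gpt-4-turbo",  # assumed vision-capable GPT-4 variant
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": question_text},
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
        ],
    }],
)
print(reply.choices[0].message.content)  # response then coded as correct or incorrect
</preformat>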
<table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>ChatGPT-4 cases correct by Bloom category (all cases).<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Correct</td><td align="left" valign="bottom">Incorrect</td><td align="left" valign="bottom">Total</td><td align="left" valign="bottom">% Correct</td></tr></thead><tbody><tr><td align="left" valign="top">Remember</td><td align="left" valign="top">158</td><td align="left" valign="top">35</td><td align="left" valign="top">193</td><td align="left" valign="top">81.9%</td></tr><tr><td align="left" valign="top">Apply</td><td align="left" valign="top">168</td><td align="left" valign="top">51</td><td align="left" valign="top">219</td><td align="left" valign="top">76.7%</td></tr><tr><td align="left" valign="top">Analyze</td><td align="left" valign="top">56</td><td align="left" valign="top">19</td><td align="left" valign="top">75</td><td align="left" valign="top">74.7%</td></tr><tr><td align="left" valign="top">Evaluate</td><td align="left" valign="top">37</td><td align="left" valign="top">14</td><td align="left" valign="top">52</td><td align="left" valign="top">72.5%</td></tr><tr><td align="left" valign="top">Synthesize</td><td align="left" valign="top">1</td><td align="left" valign="top">3</td><td align="left" valign="top">4</td><td align="left" valign="top">25.0%</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">420</td><td align="left" valign="top">122</td><td align="left" valign="top">542</td><td align="left" valign="top">77.5%</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup><italic>P</italic>=.059, Fisher exact test.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3" sec-type="results"><title>Results</title><p>Overall, ChatGPT-4 answered 77.5% of all sampled text-based questions correctly. Accuracy varied across the Bloom categories of board questions. In the &#x201C;Remember&#x201D; category, the model correctly answered 158 of 193 questions (81.9%). &#x201C;Remember&#x201D; is considered the most basic level of educational understanding, reflecting the ability to recall or comprehend information without applying the concept [<xref ref-type="bibr" rid="ref3">3</xref>]. ChatGPT-4 performed best in this category; however, it performed significantly better (<italic>P</italic>=.0382) on the &#x201C;Remember&#x201D; questions from the Basic examination than on those from the Core and Applied sections combined (<xref ref-type="table" rid="table1">Table 1</xref>). As the Bloom categories progress from Apply to Analyze, Evaluate, and Synthesize, a solid foundation of knowledge and increasingly higher-order thinking are necessary. <xref ref-type="table" rid="table2">Table 2</xref> demonstrates a decreasing trend (<italic>P</italic>=.059) in the percentage of correct ChatGPT-4 answers moving from &#x201C;Remember&#x201D; to the categories requiring higher-order thinking.</p><p>Of the 53 photo-based questions, 18 (34%) were answered correctly; none of the questions with &#x201C;What is the histologic diagnosis?&#x201D; stems were answered correctly. Excluding these, 18/38 (47.3%) were answered correctly. Notably, photo-based questions with leading information in the stem were more likely to be answered correctly.</p>
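<p>For reference, the Basic versus Core+Applied comparison summarized in the <xref ref-type="table" rid="table1">Table 1</xref> footnote can be recomputed from the published counts. The sketch below uses Python&#x2019;s scipy as an illustrative stand-in for the R functions named in the Methods; it is not the study&#x2019;s analysis code.</p><preformat preformat-type="code">
# Illustrative re-computation of the Table 1 footnote comparison (Basic vs Core+Applied),
# using scipy in place of the R tests (chisq.test / fisher.test) used by the authors.
from scipy.stats import chi2_contingency, fisher_exact

# Rows: Basic, Core+Applied combined; columns: correct, incorrect (counts from Table 1).
table = [[139, 28],
         [158 + 123, 52 + 43]]

# Pearson chi-squared test; Yates continuity correction is applied by default for 2x2
# tables, matching R's chisq.test default.
chi2, p, dof, expected = chi2_contingency(table)
print(f"Pearson chi-squared: chi2={chi2:.3f}, P={p:.4f}")

# Fisher exact test on the same 2x2 table. (The Fisher test reported in Table 2 was run
# on the full 5x2 Bloom-category table, which R's fisher.test supports directly.)
odds_ratio, p_fisher = fisher_exact(table)
print(f"Fisher exact test: P={p_fisher:.4f}")
</preformat>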
</sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>ChatGPT-4 answered 77.5% of all text-based questions correctly, similar to the results of Lewandowski et al [<xref ref-type="bibr" rid="ref1">1</xref>], in which ChatGPT-4 answered 80.7%&#x2010;84% of questions correctly on English-language dermatology assessments. Our outcomes differed on photo-based questions: Lewandowski et al obtained a much higher proportion of correct responses, whereas in our study ChatGPT-4 answered only approximately one-third of the photo cases correctly. Hirosawa et al [<xref ref-type="bibr" rid="ref4">4</xref>] assessed the impact of adding image data to clinical textual data on ChatGPT-4&#x2019;s diagnostic accuracy. They found that integrating image data did not significantly enhance diagnostic accuracy and that ChatGPT-4 relies predominantly on textual data, limiting its ability to fully use the diagnostic potential of visual information [<xref ref-type="bibr" rid="ref4">4</xref>]. This corroborates our findings of poor performance on photo cases and improved correctness when leading question stems were provided.</p><p>Overall, our study demonstrates the ability of ChatGPT-4 to answer text-based questions from Dermatology-In-Review at a high level. Questions requiring the recall of factual information were answered correctly most often, with slight decreases in correctness as higher-order thinking requirements increased. Improvements to ChatGPT-4&#x2019;s visual diagnostic capabilities will be required before it can be used reliably for visual interpretation and clinical decision-making. In its current state, ChatGPT-4 may be used as an educational tool for students and trainees when exploring core factual knowledge; however, trainees and practitioners should not rely on ChatGPT for higher-level inquiries, such as analyzing clinical scenarios or interpreting images.</p><p>Our study has several limitations. Bloom&#x2019;s taxonomy is a continuum, and question classification can be complex. We used board review questions, and our findings may not be generalizable to actual board examination questions. 
The edition of ChatGPT-4 used in this study had been trained with data only up to December 2023 [<xref ref-type="bibr" rid="ref5">5</xref>].</p></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewandowski</surname><given-names>M</given-names> </name><name name-style="western"><surname>&#x0141;ukowicz</surname><given-names>P</given-names> </name><name name-style="western"><surname>&#x015A;wietlik</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bara&#x0144;ska-Rybak</surname><given-names>W</given-names> </name></person-group><article-title>ChatGPT-3.5 and ChatGPT-4 dermatological knowledge level based on the Specialty Certificate Examination in Dermatology</article-title><source>Clin Exp Dermatol</source><year>2024</year><month>06</month><day>25</day><volume>49</volume><issue>7</issue><fpage>686</fpage><lpage>691</lpage><pub-id pub-id-type="doi">10.1093/ced/llad255</pub-id><pub-id pub-id-type="medline">37540015</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krathwohl</surname><given-names>DR</given-names> </name></person-group><article-title>A revision of Bloom&#x2019;s taxonomy: an overview</article-title><source>Theory Pract</source><year>2002</year><month>11</month><day>1</day><volume>41</volume><issue>4</issue><fpage>212</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1207/s15430421tip4104_2</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zaidi</surname><given-names>N</given-names> </name></person-group><article-title>Modified bloom&#x2019;s taxonomy for evaluating multiple choice questions</article-title><year>2015</year><access-date>2024-05-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://www.bcm.edu/sites/default/files/2019/04/principles-and-guidelines-for-assessments-6.15.15.pdf">www.bcm.edu/sites/default/files/2019/04/principles-and-guidelines-for-assessments-6.15.15.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tokumasu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Evaluating ChatGPT-4&#x2019;s diagnostic accuracy: impact of visual data integration</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>9</day><volume>12</volume><fpage>e55627</fpage><pub-id pub-id-type="doi">10.2196/55627</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><source>OpenAI Platform</source><access-date>2024-04-27</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://platform.openai.com">https://platform.openai.com</ext-link></comment></nlm-citation></ref></ref-list></back></article>