<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Dermatol</journal-id><journal-id journal-id-type="publisher-id">derma</journal-id><journal-id journal-id-type="index">29</journal-id><journal-title>JMIR Dermatology</journal-title><abbrev-journal-title>JMIR Dermatol</abbrev-journal-title><issn pub-type="epub">2562-0959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e91544</article-id><article-id pub-id-type="doi">10.2196/91544</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title><bold>Harmonized Dual Deep Learning Architectures for Image-Based Diagnostics of Skin Neglected Tropical Diseases: Benchmark Study via Novel Funnel Framework</bold></article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Minyilu</surname><given-names>Yohannes</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yimer</surname><given-names>Mohammed Abebe</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Meshesha</surname><given-names>Million</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Faculty of Computing and Software Engineering, Institute of Technology, Arba Minch University</institution><addr-line>SE</addr-line><addr-line>Arba Minch</addr-line><country>Ethiopia</country></aff><aff id="aff2"><institution>School of Information Science, College of Natural and Computational Sciences, Addis Ababa University</institution><addr-line>Addis Ababa</addr-line><country>Ethiopia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Subhadarshani</surname><given-names>Sweta</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wu</surname><given-names>Ruoyu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Banerjee</surname><given-names>Tathagat</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yohannes Minyilu, MSc, Faculty of Computing and Software Engineering, Institute of Technology, Arba Minch University, SE, Arba Minch, Ethiopia, 251 0911434681; <email>yohannes.minyilu@amu.edu.et</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>23</day><month>6</month><year>2026</year></pub-date><volume>9</volume><elocation-id>e91544</elocation-id><history><date date-type="received"><day>15</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>11</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>20</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yohannes Minyilu, Mohammed Abebe Yimer, Million Meshesha. 
Originally published in JMIR Dermatology (<ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>), 23.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Dermatology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="http://derma.jmir.org">http://derma.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://derma.jmir.org/2026/1/e91544"/><abstract><sec><title>Background</title><p>While deep learning&#x2013;based methods are the potential technological solutions for the diagnosis of skin Neglected Tropical Diseases (skin NTDs), limited efforts were seen toward the use of such tools in Ethiopia. 
Data scarcity, together with method and model selection issues, created further challenges in attempts to close this gap.</p></sec><sec><title>Objective</title><p>This study attempts to design a benchmark image-based diagnostic model for skin NTDs through a synergistic combination of feature extraction pretrained models, a custom-designed convolutional neural network (CNN) model trained on the extracted features, and an integrated data augmentation method applied dynamically.</p></sec><sec sec-type="methods"><title>Methods</title><p>For this study, a new skin images dataset was created using skin photographs collected by a team of researchers from the NTDs research center of Arba Minch University Medical College. The new dataset contains 1495 images in 3 classes having severe class imbalance. Extensive experiments were conducted to find the optimal deep learning approach by designing a new CNN model, applying transfer learning, and designing the 2-stage approach that uses pretrained models for feature extraction and trains the new CNN model using the extracted features from the pretrained models and applying data augmentation based on the integrated 2-stage approach. For model selection, the study proposed a novel approach, the funnel framework with cascaded selection of methods and models.</p></sec><sec sec-type="results"><title>Results</title><p>After hyperparameter tuning, the model trained using the DenseNet121 feature extractor scored the highest accuracy of 96.6%, <italic>F</italic><sub>1</sub>-score of 95%, and sensitivity of 95%, while the MNv2-based model scored comparable results of 95.6% accuracy, 90% <italic>F</italic><sub>1</sub>-score, and 90% sensitivity. 
This study finally selected the DenseNet121 and MNv2 models for feature extraction to build the final model for skin NTDs classification.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The 2-stage approach significantly boosted the models&#x2019; performance compared with other methods, while the data augmentation method further enhanced the performance of the selected models. Finally, this study suggests further studies using advanced class-balancing methods with more data and a possible integration of other clinical data types.</p></sec></abstract><kwd-group><kwd>skin NTDs classification</kwd><kwd>2-stage approach</kwd><kwd>feature extraction</kwd><kwd>funnel framework</kwd><kwd>hyperparameter optimization</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Neglected Tropical Diseases (NTDs) represent 21 different diseases, including podoconiosis, scabies, and tungiasis, affecting more than 1 billion people globally among underserved communities in the tropical areas [<xref ref-type="bibr" rid="ref1">1</xref>]. Because Ethiopia is a tropical country, NTDs are highly prevalent there, with the majority of NTDs identified by the World Health Organization present except for Chagas disease and yaws [<xref ref-type="bibr" rid="ref2">2</xref>], particularly in the remote areas of the country [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. As estimated by the federal ministry of health [<xref ref-type="bibr" rid="ref5">5</xref>], more than 75 million people are at risk of contracting at least 1 NTD. Of the 21 NTDs recognized by the World Health Organization, more than half (about 18 of them) have skin manifestations and are called skin NTDs [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. 
Accordingly, the diagnosis of skin NTDs primarily involves examination of patient skin, presenting opportunities for integrated diagnosis [<xref ref-type="bibr" rid="ref8">8</xref>], involving the use of artificial intelligence&#x2013;based skin NTDs diagnostic tools [<xref ref-type="bibr" rid="ref9">9</xref>]. Although it is not fully explored, previous studies [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>] have shown that deep learning (DL) methods can be used for the diagnosis of skin NTDs.</p><p>This study proposes a DL-based diagnostic model for skin NTDs based on skin images of patients using a new skin images dataset we created for this study. However, apart from the limited previous efforts in DL-based skin NTDs diagnostics, dataset-related issues created challenges in building our proposed model, as the dataset used is characterized by small-sized image samples with a severe class imbalance. Generally, as a computer vision task, the development of a DL-based diagnostic model using skin images requires a large-scale higher-quality skin images dataset [<xref ref-type="bibr" rid="ref11">11</xref>]. In the case of skin NTDs, data scarcity and dataset-related issues are the major challenges in building DL-based intelligent diagnostic tools for skin NTDs [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. These challenges mainly arise from several factors that include poor record keeping, management, and reporting practices [<xref ref-type="bibr" rid="ref13">13</xref>]. Additionally, model characteristics regarding usability and efficiency issues are also major challenges in building diagnostic models for skin NTDs. 
Beyond prediction accuracy, efficiency parameters, such as model complexity, inference speed, response time, and deployment platform options (web-based and mobile-based), are also major factors that determine the selection of DL tools, techniques, and models.</p><p>Accordingly, given the data-related issues of the dataset we used for this study and the expected operational platforms, which DL method is appropriate to develop an image-based diagnostic model for skin NTDs? While answering this question requires properly devised strategies based on carefully designed experiments, analysis, and interpretation of results, it establishes a foundational benchmarking effort that helps in identifying suitable DL methods to overcome the mentioned challenges. Therefore, this study conducts extensive experimentations to find the optimal DL solution based on the following guiding questions: (1) Which DL approach (baseline model design, transfer learning, or hybrid) would be a feasible strategy to develop the proposed model? (2) Which DL model architecture would help in developing a high-performance diagnostic model for skin NTDs, given the nature of the dataset used? (3) Which approach helps in creating the robust model development pipeline based on the experimental screening of both DL methods and architectures that collectively address high predictive performance with lower architectural and computational complexity?</p><p>While addressing these questions, this study develops a benchmark image-based diagnostic DL model for skin NTDs based on experimentally identified suitable methods and approaches that involve designing a new convolutional neural network (CNN) model and applying transfer learning. 
Evidently, the data scarcity created higher difficulty in capturing relevant features from input images using baseline models, including the custom-designed model, given the diversity and nature of the manifestations of skin NTDs that include &#x201C;mossy&#x201D; limbs in podoconiosis [<xref ref-type="bibr" rid="ref14">14</xref>]. While the use of transfer learning is ultimately the recommended DL strategy, several factors related to pretrained models create challenges that include huge data requirements, domain incompatibility, tendency of capturing noise features, and lack of a standardized robust diagnostic model development pipeline including skin NTDs. To address such challenges, we designed and implemented the 2-stage approach that presents a robust and extensible architectural pipeline integrating the feature mapping (extraction) models and applying domain adaptation. Furthermore, as the 2-stage approach integrates the 10-layer classification head with different regularization methods, it provides deeper feature filtering architectures.</p><p>Overall, this study presents several contributions to the problem domain (skin NTDs diagnostics) and to the field through multiple achievements, which include identification of optimal DL methods for skin NTDs that have higher data scarcity problems; establishment of a robust DL model development pipeline, which incorporates designing the 2-stage approach; methodological rigor that includes robust experimental setup and systematic architectural screening by adopting the funnel framework; and, ultimately, development of DL diagnostic models for skin NTDs, which can serve as an architectural benchmark for skin NTDs (skin-related diseases in general).</p></sec><sec id="s1-2"><title>Related Works</title><p>Previous studies showed the potential of the DL-based methods for skin NTDs. 
Accordingly, Steyve et al [<xref ref-type="bibr" rid="ref10">10</xref>] proposed an optimized real-time diagnostic approach for 3 skin NTDs (Buruli ulcer, leishmaniasis, and leprosy) using a support vector machine classifier optimized by a black hole optimization algorithm. Yotsu et al [<xref ref-type="bibr" rid="ref15">15</xref>] also presented DL methods using major CNN architectures (ResNet50 and VGG16 models) for the diagnosis of 5 skin NTDs (Buruli ulcer, leprosy, mycetoma, scabies, and yaws). Another study by Pattnayak et al [<xref ref-type="bibr" rid="ref16">16</xref>] proposed a DL method for 5 skin NTDs (Buruli ulcer, leprosy, mycetoma, scabies, and yaws). Beesetty et al [<xref ref-type="bibr" rid="ref17">17</xref>] applied a Siamese-based Few Shot Learning model, trained it on an extremely small dataset with fewer disease classes (368 clinically diagnosed leprosy and 28 nonleprosy skin lesions), and reported higher accuracy.</p></sec><sec id="s1-3"><title>Challenges Toward Applying DL Methods for Skin NTDs</title><p>Multiple factors created challenges toward digitizing the skin NTDs diagnostic procedures using intelligent digital diagnostic tools. Some of the challenges are insufficient infrastructure, data security issues, and limited efforts toward the integration of digital diagnostic tools [<xref ref-type="bibr" rid="ref18">18</xref>], including the general DL model development challenge, data scarcity, and class imbalance. Multiple large-scale skin image datasets, such as the HAM10000 (Human Against Machine with 10,000 training images) [<xref ref-type="bibr" rid="ref19">19</xref>] and ISIC (International Skin Imaging Collaboration) [<xref ref-type="bibr" rid="ref20">20</xref>], are available to train DL models for non-NTD skin diseases. However, it is difficult to find such massive skin image datasets that are publicly available to train DL models for skin NTDs. 
The other major issues related to data scarcity are the completeness and class distribution imbalance.</p></sec><sec id="s1-4"><title>Potential DL Solutions for Intelligent Skin NTDs Diagnosis</title><p>Data augmentation, mostly for image classification tasks, is the most widely used machine learning operation to address the problem of data scarcity and distribution imbalance [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>] through artificially generating images. There are 2 major approaches to data augmentation. The traditional augmentation method uses basic general geometric transformations [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], such as cropping, padding, flipping (horizontal or vertical), rotations, permutations, scaling, translations, and addition of noise [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. The cropping, flipping, rotations, and scaling transformations are applied to simulate imaginary variations that might occur in reality. The other data augmentation approach is the class-based conditional augmentation, which is conducted based on predefined conditions either by applying basic geometric transformations or by using generative adversarial network (GAN) models [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s1-5"><title>Gaps Identified</title><p>Overall, the vast literature exploration confirmed that very few efforts were seen toward the use of DL-based diagnostic tools for skin NTDs, specifically in the Ethiopian context. Additionally, the data scarcity issues are the subsequent challenges creating the other major research gaps. 
These major research gaps clearly suggest that further efforts in the area are mandatory, with a clear indication of having an initially established DL-based diagnostic framework that can serve as a benchmark for current and future studies. Therefore, we conducted this foundational study to develop an image-based skin NTDs diagnostic model using a novel skin NTDs image dataset created by using skin photographs of patients collected from one of the remote and highly affected areas in Ethiopia. We conducted this benchmarking study to identify optimal methods and DL model architectures based on properly designed experimental settings, given the nature of the dataset used for the study. Regarding the dataset-related problems, the study demonstrates the dynamic (online) data augmentation method based on the standard general geometric transformations to initially address the data scarcity problem.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>For this study, we created a new dataset using skin photographs of patients with skin NTDs obtained during clinical data collection from a remote affected area in the southwest of Ethiopia. While the data were initially collected by trained health care professionals based on strict ethical procedures using an ethical clearance letter obtained from the institutional review board (IRB) in Arba Minch University (AMU), we acquired the data through institutional research collaboration (&#x201C;Acknowledgments&#x201D; section) that warrants full access and use of the firsthand data. To acquire and use the skin images data for this study, proper ethical approval procedures were followed, starting with the acquisition of the ethical clearance letter from the IRB in AMU. 
Hence, for this study, the authors obtained a specific ethical clearance letter from the concerned institutional review board in AMU (approval protocol number YM23161).</p><p>The data collection process was conducted with the full respect of participants&#x2019; privacy, where data were acquired from each individual participant based on their will confirmed by a written consent. This was achieved using a dedicated checklist prepared and used to systematically monitor the data collection, ensuring that they were collected based on the free will of each participant. This, specifically, involved a mandatory checklist to ensure that each participant has read and signed the consent form using targeted questions, which include the following: &#x201C;Does the participant read information statement and willing to sign the consent form?&#x201D; and the consideration of underaged participants&#x2014;&#x201C;if the age is below 18, does the guardian sign the consent form?&#x201D; Overall, all involved participants provided written consent for their participation and authorized the use of their clinical data for academic purposes, including publications.</p><p>As the data collection was performed by a team of trained health care professionals, including public health officers, the entire data collection process was carried out in a professional and responsible manner, authorized by the IRB. 
Regarding this study, we have been authorized by the same institutional ethical review committee before acquiring and using the data based on a series of verification procedures that include careful analysis of the required data for our study, analysis of participant privacy vulnerability issues as a result of using the data, anonymizing all participant-level data in the images to remove all information in the images that could be used to identify patients (study participants), and strictly complying with ethical requirements for using human patient data&#x2014;which was confirmed by the ethical clearance letter from the IRB. Therefore, before using the images for this study, all images have been anonymized to remove any personal data to ensure that all participant-identifiable features in any images of the manuscript or supplementary material are not visible.</p><p>Finally, as the data were collected from one of the highly affected areas during a mass drug administration (MDA) campaign, the ultimate goal of the data collection was to assess the burden of the skin NTDs that will be used for immediate public health decisions. In this process, the patients living in the affected remote community primarily benefited from the MDA-based data collection, with the patients being diagnosed at the MDA site. However, no special compensations were implemented for the participants as a result of using the data for further study. Additionally, the acquired data are used to build a diagnostic model that intends to serve for diagnosing skin NTDs in the same resource-limited areas.</p></sec><sec id="s2-2"><title>Data Collection and Dataset Description</title><p>In this study, we used a new handcrafted dataset containing skin photographs of patients with skin NTDs that were captured to show skin areas affected by the skin NTDs. 
Initially, the data were collected by a team of researchers from the Collaborative Research and Training Center for Neglected Tropical Disease, College of Medicine Health Sciences of AMU. Data were collected in a project-based research for the assessment of skin NTDs burden through community screening during the scabies MDA campaign from Gacho Baba District, Gamo Zone, southwest of Ethiopia. The dataset contains skin photographs (images) of 3 different skin NTDs, namely, podoconiosis, scabies, and tungiasis. These 3 diseases were included in the dataset since they were identified as the most prevalent skin NTDs in the specified affected area. The entire data collection process was conducted in a professional and ethical manner, where the whole process was initiated after all legal and ethical issues were addressed, and an ethical clearance was obtained to collect the data. For this study, we acquired the collected data through institutional research collaboration between the NTDs research center of the medical college and computing faculty of the technology institute of AMU. Using the acquired data, we created a new skin NTDs image dataset and used it for this study to develop the proposed DL-based skin NTDs diagnostic model.</p></sec><sec id="s2-3"><title>Exploratory Data Analysis</title><p>The images were obtained for dermatologist verification and present skin lesions, scratches, excoriations, and other infestations that are typical clinical signs of podoconiosis, scabies, and tungiasis. After acquiring the data, we created 3 separate initial datasets containing the skin images, the unique image IDs, and their corresponding labels for each disease. The final dataset contains 1495 images, as shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
As shown in the statistical distribution, scabies has the largest proportion among the 3 disease classes with a total of 955 instances (955/1495, 63.88%), while tungiasis represents the second largest size with 474 instances (474/1495, 31.71%) and podoconiosis has only 66 instances (66/1495, 4.41%).</p></sec><sec id="s2-4"><title>Study Design</title><p>This study intends to conduct a DL architectural benchmarking research that requires a systematic approach to select optimal DL methods and algorithms, and we proposed a mixed research strategy based on the newly proposed funnel framework we adopted for this study, as summarized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Overall, our study needs a systematic approach to select optimal DL methods and models through multistaged experimental filtering. Specifically, the selection of DL pretrained models requires multiple experimentations with systematically devised screening and filtering methods based on comparative analysis of model performance results. Initially, we propose and experiment with 3 different DL methods: first, training a new custom-designed CNN model; second, applying transfer learning using the 21 selected pretrained DL models; and third, demonstrating the 2-stage approach.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Architecture of the proposed funnel framework based on the cascaded model selection mechanism. CNN: convolutional neural network.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig01.png"/></fig><p>After the completion of these 3 experiments, the best approach and top 5 pretrained DL models will be selected to apply further enhancements to finally select the best 2 models. All these tasks required a systematically designed approach that can be used as a framework to guide the training, screening, and analysis processes. 
Therefore, we propose a new approach, the funnel framework with cascaded (phased) selection of models and DL methods, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The funnel framework, adapted from the business-related fields, is used to screen out top-performing models and methods initially identified based on a comparative and phased approach. Accordingly, the experiments are conducted in different training settings to select the optimal DL approach that produces comparatively maximum model performance and screen out the high-performing pretrained models in transfer learning and feature extraction.</p></sec><sec id="s2-5"><title>Dataset Preparation and Preprocessing</title><p>The entire data-splitting process is conducted using the stratified splitting approach with a ratio of 80:20 train-test split, followed by data preprocessing operations. Accordingly, for image resizing, we applied the standard image resolution to all images in the dataset, which required resizing the images to 224 &#x00D7; 224 &#x00D7; 3 pixels for 17 models, while 240 &#x00D7; 240 &#x00D7; 3 pixels and above were used to resize the images for higher EfficientNet models (B1, B3, B5, and V2S). As a next task in data preprocessing, image normalization was programmatically applied to all images.</p></sec><sec id="s2-6"><title>Model Development: Model Design and Selection</title><p>In this study, we use 3 strategies regarding the development of the proposed DL model, which include (1) training a new custom-designed CNN model, (2) applying transfer learning using selected pretrained DL model architectures, and (3) applying the proposed 2-stage approach&#x2014;a hybrid approach that uses pretrained DL architectures for feature extraction and training the new CNN model for classification. 
We applied this set of strategies to experimentally demonstrate possible methods to select the optimal strategy resulting in overall higher performance of models.</p></sec><sec id="s2-7"><title>The New CNN Model</title><p>As a first study, we begin our experiments by designing a new custom-designed CNN model that will be used for baseline training and evaluation as well as for the classification of the skin NTDs in the 2-stage approach. Accordingly, we designed a new CNN model consisting of 3 major components (blocks) that represent the 3 different stages of the entire pipeline: feature extraction, the dense layers (including the flattening layer), and the classification head. Based on this general architectural layout, the new model is designed to have 30 layers, containing 8 weight-bearing layers from 6 convolutional and 2 dense layers, 11 regularization layers properly applied across all blocks, 7 activation layers, and 4 spatial refinement layers&#x2014;3 pooling and 1 flatten layer. <xref ref-type="fig" rid="figure2">Figure 2</xref> visualizes the overall architecture of our custom-designed CNN model.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Architecture of the proposed custom-designed convolutional neural network model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig02.png"/></fig><p>The model is designed in a consistent manner, implementing a hierarchical structure to capture complex and relevant visual patterns from each input sample image using 3 repeated convolutional blocks having variable filter sizes. Accordingly, the output depth of each block (number of filters) is sequentially doubled across the 3 feature extraction blocks (64 filters in block 1 to 256 filters in block 3). 
Internally, each feature extraction block is designed to have 2 convolutional layers (Conv2D with 3 &#x00D7; 3 kernels), including properly applied activation, regularization, and dimensionality reduction techniques. Accordingly, the model uses the &#x201C;relu&#x201D; activation function, along with robustly implemented regularization methods, which include (1) normalization (BatchNormalization)&#x2014;applied across all blocks including the dense layers, (2) dropout&#x2014;applied both in the feature extraction blocks (using Dropout [0.25]) and in the dense block (using Dropout [0.5]), and (3) pooling layers that apply the maximum pooling method&#x2014;serving dual purposes including regularization, while the maximum pooling method is primarily used for dimensionality reduction [<xref ref-type="bibr" rid="ref28">28</xref>]. This architectural setup is used to ensure that each feature extraction block extracts and refines features and applies normalization, all before dimensionality reduction (Dropout), to ensure that relevant features and patterns are captured. 
The overall feature dimension transformations and output shapes of each extraction block are presented in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Architectural summary of the new custom-designed CNN<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> model.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model block</td><td align="left" valign="bottom">Input shape</td><td align="left" valign="bottom">Convolutional layers</td><td align="left" valign="bottom">After pooling (2 &#x00D7; 2)</td><td align="left" valign="bottom">Output</td></tr></thead><tbody><tr><td align="left" valign="top">Input layer</td><td align="left" valign="top">224 &#x00D7; 224 &#x00D7; 3</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Block 1 (convolutional block 1)</td><td align="left" valign="top">224 &#x00D7; 224</td><td align="left" valign="top">220 &#x00D7; 220</td><td align="left" valign="top">110 &#x00D7; 110</td><td align="left" valign="top">64</td></tr><tr><td align="left" valign="top">Block 2 (convolutional block 2)</td><td align="left" valign="top">110 &#x00D7; 110</td><td align="left" valign="top">106 &#x00D7; 106</td><td align="left" valign="top">53 &#x00D7; 53</td><td align="left" valign="top">128</td></tr><tr><td align="left" valign="top">Block 3 (convolutional block 3)</td><td align="left" valign="top">53 &#x00D7; 53</td><td align="left" valign="top">49 &#x00D7; 49</td><td align="left" valign="top">24 &#x00D7; 24</td><td align="left" valign="top">256</td></tr><tr><td align="left" valign="top">Dense block</td><td align="left" valign="top">Flattened vector</td><td align="left" 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">512 units</td><td align="left" valign="top">3: number of classes</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn><fn id="table1fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>On the final block (classification head), the model applies feature map transformation and final skin NTDs classification using 6 layers: flatten&#x2014;the layer used to transform the final spatial feature map into a 1D feature vector, fully connected (Dense)&#x2014;the largest layer in the model having 512 units, regularization&#x2014;2 independent layers applying BatchNormalization and heavier dropout (Dropout [0.5]), activation layer using &#x201C;relu,&#x201D; and output layer&#x2014;the final layer that predicts the probability distribution among the 3 disease classes (podoconiosis, scabies, and tungiasis) using the SoftMax activation function. Overall, the Adam optimizer and &#x201C;categorical cross entropy&#x201D; loss function are used for the final model compilation. All these strategies are properly applied along with a synchronized implementation of early stopping, all to prevent overfitting.</p><p>Given the nature of our new dataset, having only 1495 images in the dataset, training a DL model from scratch appeared to be a bit challenging, as the limited size of the feature maps would potentially force the models to learn all the details (including noise pixels) resulting in difficulty to generalize well on new skin NTDs images due to overfitting [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. 
To overcome this challenge, we conducted further experimental inquiries to identify and use the optimal DL method based on our newly designed CNN model, demonstrating the transfer learning method entirely based on pretrained DL models, followed by the 2-stage approach.</p></sec><sec id="s2-8"><title>Transfer Learning</title><p>To improve the performance of our baseline CNN model and demonstrate the other potential DL methods, we deployed the transfer learning method by using a diverse set of pretrained DL architectures (CNN and transformer-based). To achieve this goal, we applied a systematic DL architectural selection procedure using a predefined set of model selection parameters to validate, comparatively analyze, and finally select the best model and method for the proposed skin NTDs diagnostic model. Hence, we identified 21 pretrained model architectures, selected based on 4 major selection parameters, which include architectural distribution&#x2014;defining model family and operational principles (CNN and transformer-based models), model complexity&#x2014;including both architectural (model size) and computational (efficiency) complexity, and novelty (recency) of models. 
<xref ref-type="table" rid="table2">Table 2</xref> presents a summary of the 21 selected pretrained models.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of DL<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> model architectures considered during initial screening.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Architectural family</td><td align="left" valign="bottom">Core architectural principle(s)</td><td align="left" valign="bottom" colspan="2">Model complexity</td><td align="left" valign="bottom" colspan="2">Model efficiency</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Total parameters</td><td align="left" valign="bottom">Model size</td><td align="left" valign="bottom">GFLOPs<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">Efficiency class</td></tr></thead><tbody><tr><td align="left" valign="top">ResNet50</td><td align="left" valign="top">Residual Networks</td><td align="left" valign="top">Skip connections [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">25.6 M</td><td align="left" valign="top">Large</td><td align="left" valign="top">4.1</td><td align="left" valign="top">Heavy</td></tr><tr><td align="left" valign="top">ResNet18</td><td align="left" valign="top">Residual Networks</td><td align="left" valign="top">Skip connections [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">11.3 M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">1.8</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">ConvNext-Small</td><td align="left" valign="top">Modern CNN<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> Architectures</td><td align="left" 
valign="top">Transformer-like CNN components using standard ConvNet modules [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">49.7 M</td><td align="left" valign="top">Large</td><td align="left" valign="top">4.5</td><td align="left" valign="top">Heavy</td></tr><tr><td align="left" valign="top">ConvNext-Tiny</td><td align="left" valign="top">Modern CNN<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> Architectures</td><td align="left" valign="top">Transformer-like CNN components using standard ConvNet modules [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">28M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">4.5</td><td align="left" valign="top">Heavy</td></tr><tr><td align="left" valign="top">ConvNextV2-Tiny</td><td align="left" valign="top">Modern Pure CNN Architectures</td><td align="left" valign="top">Fully convolutional masked autoencoder framework with a global response normalization layer [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">28.1M</td><td align="left" valign="top">Large</td><td align="left" valign="top">4.47</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">ConvNextV2-Atto</td><td align="left" valign="top">Modern Pure CNN Architectures</td><td align="left" valign="top">Fully convolutional masked autoencoder framework with a global response normalization layer [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">3.7M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.55</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">DenseNet121</td><td align="left" valign="top">Densely Connected Networks</td><td align="left" valign="top">Dense CNN (DenseNet) block architecture with feature reuse [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">7.3M</td><td align="left" 
valign="top">Moderate</td><td align="left" valign="top">2.9</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">Xception</td><td align="left" valign="top">Depth-wise Separable CNNs</td><td align="left" valign="top">Depth-wise separable convolutions [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">21.4M</td><td align="left" valign="top">Large</td><td align="left" valign="top">8.4</td><td align="left" valign="top">Heavy</td></tr><tr><td align="left" valign="top">EfficientNetB5</td><td align="left" valign="top">Compound-Scaled CNNs</td><td align="left" valign="top">Compound scaling&#x2014;uniform scale of all dimensions (depth, width, and resolution), built on MBConv<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> blocks [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">29M</td><td align="left" valign="top">Large</td><td align="left" valign="top">9.9</td><td align="left" valign="top">Heavy</td></tr><tr><td align="left" valign="top">EfficientNetB3</td><td align="left" valign="top">Compound-Scaled CNNs</td><td align="left" valign="top">Compound scaling&#x2014;uniform scale of all dimensions (depth, width, and resolution), built on MBConv<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> blocks [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">11.2M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">1.8</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">EfficientNetB1</td><td align="left" valign="top">Compound-Scaled CNNs</td><td align="left" valign="top">Compound scaling&#x2014;uniform scale of all dimensions (depth, width, and resolution), built on MBConv<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> blocks [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">6.9M</td><td align="left" valign="top">Moderate</td><td 
align="left" valign="top">0.7</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">EfficientNetB0</td><td align="left" valign="top">Compound-Scaled CNNs</td><td align="left" valign="top">Compound scaling&#x2014;uniform scale of all dimensions (depth, width, and resolution), built on MBConv<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> blocks [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">4.4M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.39</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">MobileNetV2</td><td align="left" valign="top">Mobile CNNs</td><td align="left" valign="top">Inverted residual blocks with linear bottlenecks [<xref ref-type="bibr" rid="ref36">36</xref>]</td><td align="left" valign="top">2.6M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.3</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">MobileNetV3-Large</td><td align="left" valign="top">Mobile CNNs</td><td align="left" valign="top">Hardware-aware NAS<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> along with the NetAdapt algorithm (platform-aware adaptation) [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">3.2M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.22</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">MobileNetV3-Small</td><td align="left" valign="top">Mobile CNNs</td><td align="left" valign="top">Hardware-aware NAS<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> along with the NetAdapt algorithm (platform-aware adaptation) [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">1.1M</td><td align="left" 
valign="top">Lightweight</td><td align="left" valign="top">0.06</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">EfficientNetV2B0</td><td align="left" valign="top">Advanced compound-scaled CNNs</td><td align="left" valign="top">Training-aware NAS and scaling&#x2014;joint optimization of training speed and parameter efficiency [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">6.2M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">0.72</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">EfficientNetV2S</td><td align="left" valign="top">Advanced compound-scaled CNNs</td><td align="left" valign="top">Training-aware NAS and scaling&#x2014;joint optimization of training speed and parameter efficiency [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">20.7M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">2.9</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top" colspan="7">Recent CNN and transformer-based models</td></tr><tr><td align="left" valign="top">RepViT</td><td align="left" valign="top">ViT<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>-Inspired Pure lightweight CNN</td><td align="left" valign="top">Reparameterization convolutions in ViT-like Meta-Former structure [<xref ref-type="bibr" rid="ref40">40</xref>]</td><td align="left" valign="top">5.1M</td><td align="left" valign="top">Moderate</td><td align="left" valign="top">0.80</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">FasterViT-0</td><td align="left" valign="top">Hybrid (CNN + ViT)</td><td align="left" valign="top">HAT<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup> using window-based self-attention, carrier tokens for local-global representation learning [<xref ref-type="bibr" rid="ref41">41</xref>]</td><td align="left" 
valign="top">31.4M</td><td align="left" valign="top">Large</td><td align="left" valign="top">3.34</td><td align="left" valign="top">Moderate</td></tr><tr><td align="left" valign="top">FastViT</td><td align="left" valign="top">Hybrid (CNN + ViT)</td><td align="left" valign="top">RepMixer (structural reparameterization for token mixing) and skip-connection elimination [<xref ref-type="bibr" rid="ref42">42</xref>]</td><td align="left" valign="top">3.6M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.70</td><td align="left" valign="top">Lightweight</td></tr><tr><td align="left" valign="top">EfficientViTB0</td><td align="left" valign="top">Hybrid (CNN + ViT)</td><td align="left" valign="top">Lightweight multiscale attention (for context extraction) and MBConv (for local information extraction) [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">0.7M</td><td align="left" valign="top">Lightweight</td><td align="left" valign="top">0.07</td><td align="left" valign="top">Lightweight</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>DL: deep learning.</p></fn><fn id="table2fn2"><p><sup>b</sup>GFLOPs: Giga floating point operations.</p></fn><fn id="table2fn3"><p><sup>c</sup>CNN: convolutional neural network.</p></fn><fn id="table2fn4"><p><sup>d</sup>MBConv: mobile-inverted bottleneck convolution.</p></fn><fn id="table2fn5"><p><sup>e</sup>NAS: neural architecture search.</p></fn><fn id="table2fn6"><p><sup>f</sup>ViT: vision transformer.</p></fn><fn id="table2fn7"><p><sup>g</sup>HAT: hierarchical attention.</p></fn></table-wrap-foot></table-wrap><p>All the 21 selected pretrained models were initially trained on the ImageNet-1K dataset, initialized with weights from this standard, large, and diversified dataset having around 3.2 million images [<xref ref-type="bibr" rid="ref44">44</xref>], helping to create general feature extraction baselines. 
Furthermore, as shown in <xref ref-type="table" rid="table2">Table 2</xref>, maintaining architectural distribution, we selected the 21 representative models from 9 architectural families, which include (1) models of residual networks (ResNet50 and ResNet18) that apply residual learning (or skip connections) method, (2) modern CNN architectures (ConvNextV1 Small/Tiny and ConvNextV2 Tiny/Atto) that have transformer-like CNN components, (3) model with densely connected CNNs (DenseNet121), (4) model with depth-wise separable CNNs (Xception), (5) the EfficientNet family (B0-B5), (6) mobile CNNs (MobileNet V2 and V3), (7) models that apply advanced compound scaling method (EfficientNetV2 B0 and S), (8) recent state-of-the-art lightweight CNN architecture (RepViT), and (9) hybrid transformer-based architectures (FastViT, FasterViT-0, and EfficientViTB0). Regarding architectural and computational complexities, 6 of these models are heavier models having parameters between 30 and 50 million (M) (30 M &#x003C; parameters &#x003C; 50 M), while having computational complexities that range between 4.0 and 9.9 Giga floating point operations (GFLOPs). The other 8 models have moderate levels of complexities (6.2 M &#x003C; parameters &#x003C; 28.1 M, and 0.7 billion [B] &#x003C; FLOP &#x003C; 4.0 B), while 7 of the 21 models are lightweight models (0.7 M &#x003C; parameters &#x003C; 5.1 M, and 0.06 B &#x003C; FLOP &#x003C; 0.55 B).</p></sec><sec id="s2-9"><title>The 2-Stage Approach: Feature Extraction With Integrated CNN Model</title><p>On the third experimental setting, our proposed 2-stage approach is demonstrated. In this approach, we crafted a robust hybrid model development pipeline that incorporates 2 different DL model architectures, the selected pretrained and our custom-designed CNN models, integrated to the utility modules (data loading, preprocessing, and evaluation). 
These 2 groups of models are used independently in 2 phases (stages), operating sequentially to achieve 2 exclusive DL operations, feature extraction (feature mapping) and disease classification (inference), representing the 2 fundamentally isolated but highly interdependent modules in the pipeline. The selected pretrained models are used only for the purpose of feature extraction. Given that these 2 fundamental modules operate sequentially, with the disease classification model operating on the output of the feature extraction model, we named the overall hybrid pipeline the 2-stage approach. <xref ref-type="fig" rid="figure3">Figure 3</xref> presents the overall architecture of our 2-stage approach, depicting the 2 major stages as modules in the DL architecture.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Architecture of the new 2-stage approach. BN: batch normalization; DL: deep learning; skin NTDs: skin Neglected Tropical Diseases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig03.png"/></fig><p>In the first stage (feature extractor module), the selected 21 DL architectures are solely used for feature extraction, which we named extractor(s) or feature map(s), and we created 21 specific extractors (maps) by freezing all the trainable weights in each of these 21 models. We use this method to effectively extract features using the mapping logic of each pretrained model and derive feature representations within the high-dimensional feature space (R<sup>2048</sup>) for each input image in our dataset. This method helped us in creating the full feature representations by avoiding pooling layers, as in the case of our custom-designed CNN model, helping us in preserving important spatial representations of features that would potentially be dropped as a result of using pooling method (MaxPooling). 
After mapping all the input images using the pretrained models, feature matrices are created for each of the separate train and test sets and are prepared to be used by our custom-designed model. This completes the first phase where the feature extraction models completed their only purposes of feature extraction, which are no longer used.</p><p>In the second stage (baseline classification module; <xref ref-type="fig" rid="figure3">Figure 3</xref>), the feature matrices created using the previously extracted features by each extractor are used to train the new CNN model. However, since our new model was initially designed as a full-fledged CNN architecture, its use in this phase required restructuring. Hence, we restructured our newly designed CNN model by excluding the extraneous feature extraction layers, as these operations are performed by different models in the separate previous module. Guided by our initially designed 30-layer deep and highly regularized CNN model, we specifically redesigned the final 10-layer architecture to enhance the robustness of the classification head, improve classification performance, and add architectural novelty. This approach totally eliminates the 3 feature extraction blocks of the new CNN model, leading to the exclusion of 20 layers. Specifically, this approach directly eliminates the 6 convolutional layers and their corresponding 10 layers (7 activation and 3 maximum pooling), while minimizing the normalization layers to only 3. 
This creates a final classification model having 10 layers with 3 dense blocks and a final output layer, where each dense block in turn includes regularization methods (normalization and dropout).</p><p>Overall, unlike most standard transfer learning pipelines where the pretrained models serve complex tasks with only 1 or 2 classification layers finally added [<xref ref-type="bibr" rid="ref45">45</xref>], the classification head in our 10-layer customized (redesigned) CNN model has deeper architectural layers. Through its 9 regularization layers, the new CNN head also serves a separate feature filtering purpose, where the final classification layer makes the classification decision using highly relevant skin lesion features.</p></sec><sec id="s2-10"><title>Model Training and Evaluation</title><p>We used a 5-fold cross-validation method to train and evaluate the models. The metrics used to evaluate the models are selected to assist performance analyses from different perspectives. The macro <italic>F</italic><sub>1</sub>-score and the class-specific metrics (using sensitivity and specificity) are highly used, as accuracy was found to be a misleading metric due to the highly skewed nature of our new dataset. Hence, we prioritize the macro <italic>F</italic><sub>1</sub>-score of models on both the train and test sets to effectively evaluate the models&#x2019; generalizability and learning ability. The overall sensitivity (macro recall), the area under the precision-recall curve (AUPRC), and the area under the receiver operating characteristic curve (AUROC) scores are also highly used to measure how well each model performs in identifying positive disease classes. 
Furthermore, visual tools using tables, confusion matrices, ROC curves, including performance plots such as the slope charts and radar plots, are also used to analyze models&#x2019; performance.</p></sec><sec id="s2-11"><title>Final Model Selection</title><p>Ultimately, this study selects the final best-performing skin NTDs classification models with the highest classification performance. To achieve this, we applied systematic model selection procedures in 2 phases: first-level selection&#x2014;applied for the initial feature extraction model screening based on baseline performance score (end of phase 1), and final model selection&#x2014;applied at the end of model screening experiments (end of phase 2). Overall, the selection of top-performing models involved analysis of performance scores that include macro <italic>F</italic><sub>1</sub>-score, sensitivity, AUPRC, and 4 class-specific performance metrics (podo-recall, tungiasis-recall, scabies-recall, and podo-<italic>F</italic><sub>1</sub>-score). Hence, at the end of phase 1, using these extended screening parameters, top-performing models that achieve stable and outperforming scores across the 2 experiments are selected. Accordingly, selected models are used in the next experimental training that applies performance optimization using the dynamic (online) data augmentation method.</p><p>Finally, at the end of phase 2, the final 2 best-performing models are selected based on the results achieved during the experiment with the optimization method, and we applied a robust selection procedure based on weighted comparison of performance scores achieved during this last experiment. Accordingly, 6 evaluation metrics (<italic>F</italic><sub>1</sub>-score [macro], podo-<italic>F</italic><sub>1</sub>, macro recall [mean], podo-recall, inference speed [samples per second (sps)], and number of parameters) are used, and we applied the weighted scores comparison method with each metric given different weight. 
To achieve this strategic comparison, we conducted 4 procedures. First, the metrics are categorized as performance (<italic>F</italic><sub>1</sub>-score [macro], podo-<italic>F</italic><sub>1</sub>, macro recall [mean], and podo-recall) and efficiency metrics (inference speed and number of parameters). Out of all these metrics, the model complexity (number of parameters) is a metric that is mostly desired to be lower (&#x201C;lower is good&#x201D;), while the other 5 metrics represent best model performance when their values are higher (&#x201C;higher is better&#x201D;), having opposite directional symmetry with model size. Second, we normalized all values of the 6 comparison metrics using the minimum-maximum normalization method to ensure that all values fall between 0 and 1 and facilitate the combined scores comparison, which includes normalizing the values that already have values between 0 and 1, mainly for (1) avoiding range dilution&#x2014;minor differences in performance mostly lead to larger ranges that determine comparison; (2) baseline value definition, combined scores mostly perform well with the least score defined as &#x201C;0.0&#x201D;; and (3) simplifying directional symmetry. Therefore, to address all these, we applied the minimum-maximum normalization method by defining 2 normalization formulas. 
First, we normalized the &#x201C;higher is better&#x201D; metrics (the metrics that represent best values when their values are higher) using the formula:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">z</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">min</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">min</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>x</italic> is a value in any 1 of the 5 metrics (eg, macro <italic>F</italic><sub>1</sub>-score) representing a single value for a specific model that is being normalized; <italic>x</italic><sub>normalized</sub> is the resulting normalized value; <italic>x</italic><sub>min</sub> is the least value in that specific metric; and <italic>x</italic><sub>max</sub> is the maximum score in the same metric.</p><p>Next, for the &#x201C;lower is good&#x201D; metric (model complexity), we applied the normalization using the formula:</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">n</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">z</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mo movablelimits="true" form="prefix">min</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Third, we assigned weights <italic>w</italic> for all evaluation metrics, and as a diagnostic model, disease prediction performance scores are the primary requirements. Hence, we assigned higher weights (0.2) for performance metrics and a relatively lower weight (0.1) for the 2 efficiency metrics (inference speed and number of parameters). 
Finally, we define a unified objective function that computes the weighted sum of score (WSS) for each model using the formula:</p><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mrow><mml:mi mathvariant="normal">W</mml:mi><mml:mi mathvariant="normal">S</mml:mi><mml:mi mathvariant="normal">S</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>6</mml:mn></mml:mrow></mml:munderover><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where WSS<sub><italic>m</italic></sub> is the weighted sum of score of any given model evaluated, <italic>x</italic><sub><italic>i</italic></sub> is a specific evaluation metric, and <italic>w</italic><sub><italic>i</italic></sub> is the weight of a given metric. After computing the weighted scores, models are ranked accordingly to identify the top 2 models on the overall performance. 
Accordingly, the first selected model would be a model with higher complexity, with a consideration of using the model as a back-end (web-based) classification model, while the second selected model would be a lightweight model with the potential of being embedded in edge (mobile) devices for the actual and real-time diagnosis of skin NTDs.</p></sec><sec id="s2-12"><title>Experimental Setup</title><sec id="s2-12-1"><title>Overview</title><p>As the study is guided by the funnel framework based on extended experiments, the whole training experiments are conducted in a phased approach having 3 phases. The first phase deals with the establishment of baseline skin NTDs classification models&#x2014;using the new CNN model, transfer learning, and the 2-stage approach; the second phase applies performance optimization (data augmentation); and the third phase applies the final performance optimization through hyperparameter tuning.</p></sec><sec id="s2-12-2"><title>Phase 1: Baseline Model Training With Cascaded Model Selection</title><p>In this first phase, only baseline skin NTDs classification models are trained in 3 separate experimental setups: the custom-designed CNN model, transfer learning, and the 2-stage approach. This phase is intended for the overall evaluation of baseline models&#x2019; performance, which includes comparative analysis and first-level model screening.</p><sec id="s2-12-2-1"><title>Custom-Designed CNN Model</title><p>The first experimental setting involves training the custom-designed 30-layered CNN model, where the new model is trained and evaluated under 2 different methods. The first training involves the baseline training, where the model is trained as is, without applying any advanced machine learning methods for tweaking performance. On the next experiment, the same model is trained by applying data augmentation techniques. 
These experiments are conducted to evaluate and analyze the performance of this newly crafted model on our new small-sized skin NTDs dataset, as the results from these experiments determine subsequent strategies.</p></sec><sec id="s2-12-2-2"><title>Transfer Learning: Baseline Models Using Pretrained Models</title><p>On the second experimental setting, we demonstrated the transfer learning method using the selected pretrained DL architectures to build baseline models. Hence, 21 baseline DL models are fully trained on our new skin NTDs image dataset, with proper evaluation of each model on the classification of the skin NTDs, where the results are used for the overall analysis of models&#x2019; performance.</p></sec><sec id="s2-12-2-3"><title>Two-Stage Approach: Feature Extraction With Integrated CNN Model</title><p>On the third experimental setting, the 2-staged approach is demonstrated. To apply this robust training pipeline, we used the 21 systematically selected DL architectures for feature extraction and the restructured new 10-layered CNN model architecture. Hence, 21 different skin NTDs classification models are trained using this approach, where each model is evaluated using the predefined appropriate evaluation methods.</p></sec></sec></sec><sec id="s2-13"><title>Phase 2: Performance Optimization</title><sec id="s2-13-1"><title>Data Augmentation</title><p>Evidently, DL models are highly data-intensive and require a large amount of data to achieve excellent performance [<xref ref-type="bibr" rid="ref46">46</xref>]. Hence, using a relatively small-sized dataset, as the case of this study with only 1495 samples, developing a DL-based diagnostic model with higher classification accuracy is a real challenge; it might not be beneficial at all, compared with the use of large-sized datasets [<xref ref-type="bibr" rid="ref47">47</xref>]. 
Methodologically, several DL-based data augmentation methods are available, including the advanced standard and conditional augmentation using GAN models, which are also highly suitable for class imbalance handling. However, their execution requires further experiments, analysis, and selection, including higher computational requirements, which in turn extends this study by deviating from the intended benchmarking objective. Therefore, as a benchmarking study, the dynamic or online (on-the-fly) data augmentation approach [<xref ref-type="bibr" rid="ref29">29</xref>] is used by applying transformations on the training set to increase the number of training samples by a factor of 5 (including the original training images) and create a total of 4785 training images. This method is primarily used to alleviate the data scarcity issue, analyze performance changes, and increase both the size and the diversity of the training set without the permanent creation of the images. Accordingly, we applied selected geometric transformations such as rotation (0.2), scaling (0.2), and horizontal flipping to the input images at the time of model training to mathematically simulate the real-world diagnostics of skin NTDs. Therefore, using this method, 5 different models are trained based on the 2-stage approach using the final 5 selected extractors.</p></sec></sec><sec id="s2-14"><title>Phase 3: The Final Optimization Methods and Hyperparameter Tuning</title><p>As the last experimental training, the hyperparameter tuning operation is applied to the final 2 selected models. To achieve this, the &#x201C;Hyperband&#x201D; algorithm was used, which is a faster and more resource-efficient hyperparameter optimization algorithm than other hyperparameter searching algorithms. 
Hyperband is an efficient bandit-based Keras algorithm for hyperparameter optimization that uses early stopping with a successive halving algorithm to quickly find good configurations for models [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Therefore, the hyperparameter tuning operation is applied by running the Hyperband algorithm to the maximum of 30 epochs, with the maximum number of trials being 60. The optimization was carried out based on the validation accuracy, which was set to be the objective metric. Finally, the final best hyperparameters were saved to use for the final optimized training of the skin NTDs classification model.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>This section presents model evaluation results of the 5 experimental settings (baseline training, transfer learning, and the 2-stage approach, including the training with data augmentation and hyperparameter tuning).</p><sec id="s3-1"><title>Phase 1: Baseline Model Training With Cascaded Screening</title><sec id="s3-1-1"><title>The New CNN Model</title><p>The new CNN baseline model is trained on the new skin NTDs dataset in 2 different experimental settings, the baseline training that applies no advanced DL method and applying the data augmentation method, as summarized by the overall results in <xref ref-type="table" rid="table3">Table 3</xref>. Accordingly, on the baseline training, the model achieved an accuracy of 0.674 and <italic>F</italic><sub>1</sub>-score of 0.42 (with AUROC=0.676 and AUPRC=0.444), with a very high loss (0.978). 
However, during the second training with data augmentation, the new CNN model achieved improved performance with an <italic>F</italic><sub>1</sub>-score of 0.446 and an increased loss of 1.458.</p><p>However, class-wise, the new model showed the worst sensitivity for podoconiosis (scoring all 0.0 in precision, recall, and <italic>F</italic><sub>1</sub>-scores), with a macro recall of 0.43 (having a recall of 0.457 and 0.827 for tungiasis and scabies, respectively), as shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Overall performance scores of the new CNN<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> model across the 2 experimental settings.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment</td><td align="left" valign="bottom">Model/method</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Loss</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (macro)</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (mean)</td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> (mean)</td></tr></thead><tbody><tr><td align="left" valign="top">First</td><td align="left" valign="top">Baseline</td><td align="left" valign="top">0.674</td><td align="left" valign="top">0.978</td><td align="left" valign="top">0.422</td><td align="left" valign="top">0.676</td><td align="left" valign="top">0.444</td></tr><tr><td align="left" valign="top">Second</td><td align="left" valign="top">Dynamic data augmentation</td><td align="left" valign="top">0.691</td><td align="left" valign="top">1.458</td><td align="left" valign="top">0.446</td><td align="left" valign="top">0.790</td><td align="left" valign="top">0.608</td></tr></tbody></table><table-wrap-foot><fn 
id="table3fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn><fn id="table3fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn3"><p><sup>c</sup>AUPRC: area under the precision-recall curve.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Class-specific performance of the new CNN<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> model across the 2 experiments.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment</td><td align="left" valign="bottom">Model/Method</td><td align="left" valign="bottom" colspan="4">Recall</td><td align="left" valign="bottom" colspan="4"><italic>F</italic><sub>1</sub>-score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Podoconiosis</td><td align="left" valign="bottom">Tungiasis</td><td align="left" valign="bottom">Scabies</td><td align="left" valign="bottom">Macro recall</td><td align="left" valign="bottom">Podoconiosis</td><td align="left" valign="bottom">Tungiasis</td><td align="left" valign="bottom">Scabies</td><td align="left" valign="bottom">Macro <italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">First</td><td align="left" valign="top">Baseline</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.457</td><td align="left" valign="top">0.827</td><td align="left" valign="top">0.428</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.489</td><td align="left" valign="top">0.776</td><td align="left" valign="top">0.422</td></tr><tr><td align="left" valign="top">Second</td><td align="left" valign="top">Dynamic data augmentation</td><td align="left" valign="top">0.154</td><td align="left" valign="top">0.181</td><td align="left" valign="top">0.979</td><td align="left" 
valign="top">0.438</td><td align="left" valign="top">0.235</td><td align="left" valign="top">0.296</td><td align="left" valign="top">0.806</td><td align="left" valign="top">0.446</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>CNN: convolutional neural network.</p></fn></table-wrap-foot></table-wrap><p>With data augmentation, however, the model achieved a macro recall of 0.438 and macro <italic>F</italic><sub>1</sub>-score of 0.446, with improved class-specific scores in recall (podoconiosis=0.154, tungiasis=0.181, and scabies=0.979) and <italic>F</italic><sub>1</sub>-score (podoconiosis=0.235, tungiasis=0.296, and scabies=0.806). As confirmed by the results, the use of the standard data augmentation method significantly improved the sensitivity and macro <italic>F</italic><sub>1</sub>-score of the model for the podoconiosis class (SD +0.109) and (SD +0.017).</p></sec><sec id="s3-1-2"><title>Transfer Learning: Baseline Performance</title><p>On the second and third training settings of the first phase, baseline models are trained using similar pretrained DL model architectures demonstrating (1) the transfer learning method, and (2) the 2-stage approach. 
<xref ref-type="table" rid="table5">Table 5</xref> presents the overall performance of models scored during these 2 experimental settings.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Performance of the models across the 2 experiments of phase 1 model screening experiments<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="5">Experiment 1: transfer learning (baseline models)</td><td align="left" valign="bottom" colspan="5">Experiment 2: 2-stage approach (baseline models)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Log-loss</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<break/>(macro)</td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup><break/>(mean)</td><td align="left" valign="bottom">Sensitivity<break/>(mean)</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Log-loss</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<break/>(macro)</td><td align="left" valign="bottom">AUPRC<break/>(mean)</td><td align="left" valign="bottom">Sensitivity<break/>(mean)</td></tr></thead><tbody><tr><td align="left" valign="top">ResNet50</td><td align="left" valign="top">0.695</td><td align="left" valign="top">0.733</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.516</td><td align="left" valign="top">0.39</td><td align="left" valign="top">0.641</td><td align="left" valign="top">1.027</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.430</td><td align="left" valign="top">0.33</td></tr><tr><td align="left" valign="top">ConvNext-Small</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.751</td><td 
align="left" valign="top">0.26</td><td align="left" valign="top">0.552</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.805</td><td align="left" valign="top">0.532</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.747</td><td align="left" valign="top">0.62</td></tr><tr><td align="left" valign="top">Xception</td><td align="left" valign="top"><italic>0.973</italic></td><td align="left" valign="top"><italic>0.106</italic></td><td align="left" valign="top"><italic>0.94</italic></td><td align="left" valign="top"><italic>0.975</italic></td><td align="left" valign="top"><italic>0.91</italic></td><td align="left" valign="top"><italic>0.94</italic></td><td align="left" valign="top"><italic>0.219</italic></td><td align="left" valign="top"><italic>0.9</italic></td><td align="left" valign="top"><italic>0.937</italic></td><td align="left" valign="top"><italic>0.88</italic></td></tr><tr><td align="left" valign="top">EfficientNetB5</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.785</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.378</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.638</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.34</td><td align="left" valign="top">0.368</td><td align="left" valign="top">0.36</td></tr><tr><td align="left" valign="top">ConvNext-Tiny</td><td align="left" valign="top">0.671</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.573</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.718</td><td align="left" valign="top">0.693</td><td align="left" valign="top">0.43</td><td align="left" valign="top">0.568</td><td align="left" valign="top">0.43</td></tr><tr><td align="left" valign="top">DenseNet121</td><td align="left" valign="top"><italic>0.95</italic></td><td align="left" 
valign="top"><italic>0.119</italic></td><td align="left" valign="top"><italic>0.89</italic></td><td align="left" valign="top"><italic>0.973</italic></td><td align="left" valign="top"><italic>0.84</italic></td><td align="left" valign="top"><italic>0.96</italic></td><td align="left" valign="top"><italic>0.133</italic></td><td align="left" valign="top"><italic>0.91</italic></td><td align="left" valign="top"><italic>0.974</italic></td><td align="left" valign="top"><italic>0.9</italic></td></tr><tr><td align="left" valign="top">EfficientNetB3</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.782</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.370</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.668</td><td align="left" valign="top">0.767</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.429</td><td align="left" valign="top">0.39</td></tr><tr><td align="left" valign="top">MNv3-Large</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.494</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.681</td><td align="left" valign="top">0.995</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.625</td><td align="left" valign="top">0.57</td></tr><tr><td align="left" valign="top">MNv3-Small</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.777</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.388</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.708</td><td align="left" valign="top">0.687</td><td align="left" valign="top">0.4</td><td align="left" valign="top">0.640</td><td align="left" valign="top">0.41</td></tr><tr><td align="left" valign="top">EfficientNetB0</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.786</td><td align="left" 
valign="top">0.26</td><td align="left" valign="top">0.360</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.641</td><td align="left" valign="top">1.012</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.396</td><td align="left" valign="top">0.33</td></tr><tr><td align="left" valign="top">EfficientNetB1</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.785</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.363</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.628</td><td align="left" valign="top">0.828</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.437</td><td align="left" valign="top">0.37</td></tr><tr><td align="left" valign="top">MNv2</td><td align="left" valign="top"><italic>0.956</italic></td><td align="left" valign="top"><italic>0.109</italic></td><td align="left" valign="top"><italic>0.91</italic></td><td align="left" valign="top"><italic>0.973</italic></td><td align="left" valign="top"><italic>0.87</italic></td><td align="left" valign="top"><italic>0.943</italic></td><td align="left" valign="top"><italic>0.181</italic></td><td align="left" valign="top"><italic>0.88</italic></td><td align="left" valign="top"><italic>0.915</italic></td><td align="left" valign="top"><italic>0.86</italic></td></tr><tr><td align="left" valign="top">ResNet18</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.805</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.352</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.735</td><td align="left" valign="top">0.737</td><td align="left" valign="top">0.48</td><td align="left" valign="top">0.545</td><td align="left" valign="top">0.49</td></tr><tr><td align="left" valign="top">EfficientNetV2B0</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.786</td><td align="left" valign="top">0.26</td><td 
align="left" valign="top">0.349</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.641</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.393</td><td align="left" valign="top">0.33</td></tr><tr><td align="left" valign="top">EfficientNetV2S</td><td align="left" valign="top">0.674</td><td align="left" valign="top">0.694</td><td align="left" valign="top">0.39</td><td align="left" valign="top">0.501</td><td align="left" valign="top">0.4</td><td align="left" valign="top">0.654</td><td align="left" valign="top">0.867</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.487</td><td align="left" valign="top">0.45</td></tr><tr><td align="left" valign="top">CovNeXtv2-Tiny</td><td align="left" valign="top"><italic>0.95</italic></td><td align="left" valign="top"><italic>0.124</italic></td><td align="left" valign="top"><italic>0.919</italic></td><td align="left" valign="top"><italic>0.983</italic></td><td align="left" valign="top"><italic>0.886</italic></td><td align="left" valign="top"><italic>0.94</italic></td><td align="left" valign="top"><italic>0.222</italic></td><td align="left" valign="top"><italic>0.876</italic></td><td align="left" valign="top"><italic>0.935</italic></td><td align="left" valign="top"><italic>0.862</italic></td></tr><tr><td align="left" valign="top">CovNeXtv2-Atto</td><td align="left" valign="top"><italic>0.94</italic></td><td align="left" valign="top"><italic>0.226</italic></td><td align="left" valign="top"><italic>0.878</italic></td><td align="left" valign="top"><italic>0.921</italic></td><td align="left" valign="top"><italic>0.837</italic></td><td align="left" valign="top"><italic>0.936</italic></td><td align="left" valign="top"><italic>0.25</italic></td><td align="left" valign="top"><italic>0.882</italic></td><td align="left" valign="top"><italic>0.909</italic></td><td align="left" valign="top"><italic>0.857</italic></td></tr><tr><td 
align="left" valign="top">EfficientViTB0</td><td align="left" valign="top"><italic>0.963</italic></td><td align="left" valign="top"><italic>0.145</italic></td><td align="left" valign="top"><italic>0.93</italic></td><td align="left" valign="top"><italic>0.967</italic></td><td align="left" valign="top"><italic>0.898</italic></td><td align="left" valign="top"><italic>0.963</italic></td><td align="left" valign="top"><italic>0.124</italic></td><td align="left" valign="top"><italic>0.903</italic></td><td align="left" valign="top"><italic>0.967</italic></td><td align="left" valign="top"><italic>0.876</italic></td></tr><tr><td align="left" valign="top">FasterViT0-T8</td><td align="left" valign="top">0.936</td><td align="left" valign="top">0.165</td><td align="left" valign="top">0.883</td><td align="left" valign="top">0.969</td><td align="left" valign="top">0.855</td><td align="left" valign="top">0.671</td><td align="left" valign="top">0.814</td><td align="left" valign="top">0.382</td><td align="left" valign="top">0.469</td><td align="left" valign="top">0.392</td></tr><tr><td align="left" valign="top">FastViT-T8</td><td align="left" valign="top">0.896</td><td align="left" valign="top">0.37</td><td align="left" valign="top">0.602</td><td align="left" valign="top">0.903</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.223</td><td align="left" valign="top">0.862</td><td align="left" valign="top">0.939</td><td align="left" valign="top">0.833</td></tr><tr><td align="left" valign="top">RepViT-M0.9</td><td align="left" valign="top"><italic>0.943</italic></td><td align="left" valign="top"><italic>0.188</italic></td><td align="left" valign="top"><italic>0.915</italic></td><td align="left" valign="top"><italic>0.959</italic></td><td align="left" valign="top"><italic>0.888</italic></td><td align="left" valign="top"><italic>0.956</italic></td><td align="left" valign="top"><italic>0.166</italic></td><td align="left" 
valign="top"><italic>0.925</italic></td><td align="left" valign="top"><italic>0.959</italic></td><td align="left" valign="top"><italic>0.893</italic></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Values presented in italics represent high-performance models.</p></fn><fn id="table5fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn></table-wrap-foot></table-wrap><p>On the training with only transfer learning (experiment 1, <xref ref-type="table" rid="table5">Table 5</xref>), only 8 models scored top results, where Xception outperformed all models with the top accuracy (97.3%), macro <italic>F</italic><sub>1</sub>-score (0.94), and sensitivity (0.91), followed by EfficientViTB0 (accuracy=96.3%, <italic>F</italic><sub>1</sub>-score=0.930, and sensitivity=0.898) and MNv2 (accuracy=95.6%, <italic>F</italic><sub>1</sub>-score=0.91, and sensitivity=0.87). DenseNet121 and ConvNeXtV2-Tiny are the next high-performing models scoring the same accuracy (95%) and sensitivity (0.84), with ConvNeXtV2-Tiny scoring a better <italic>F</italic><sub>1</sub>-score (0.89). The RepViT, FasterViT, and ConvNeXtV2-Atto models are the other top-performing models with macro <italic>F</italic><sub>1</sub>-scores of 0.92, 0.88, and 0.84. FastViT, ResNet50, EfficientNetV2S, and ConvNext-Tiny scored macro <italic>F</italic><sub>1</sub>-scores of &#x003E;0.33 and average sensitivity &#x003E;0.37. However, the remaining 9 models similarly scored the worst class-specific sensitivity (recall=0.0 for podoconiosis and tungiasis classes), leading to the lowest macro <italic>F</italic><sub>1</sub>-score (0.26) and average sensitivity (0.33). 
Using the 2-stage approach as shown in experiment 2 (<xref ref-type="table" rid="table5">Table 5</xref>), only 4 models trained using RepViT, DenseNet, EfficientViTB0, and Xception extractors helped their corresponding trained classification models to achieve macro <italic>F</italic><sub>1</sub>-scores of 0.90 and above, while 8 models scored macro <italic>F</italic><sub>1</sub>-scores of &#x003E;0.86. As shown, the model trained on RepViT-extracted features scored the highest macro <italic>F</italic><sub>1</sub>-score (0.93, mean sensitivity=0.89), where the model using DenseNet-extracted features had the second highest <italic>F</italic><sub>1</sub>-score (0.91) with the maximum mean sensitivity (0.90). The model using EfficientViTB0-extracted features scored the next top <italic>F</italic><sub>1</sub>-score (0.90) during this experiment, followed by the models using the Xception feature extractor (<italic>F</italic><sub>1</sub>-score=0.90 and sensitivity=0.88), CovNeXtv2-Atto feature extractor (<italic>F</italic><sub>1</sub>-score=0.88 and sensitivity=0.86), MNv2 feature extractor (<italic>F</italic><sub>1</sub>-score=0.88 and sensitivity=0.86), CovNeXtv2-Tiny feature extractor (<italic>F</italic><sub>1</sub>-score=0.88 and sensitivity=0.86), and FastViT feature extractor (<italic>F</italic><sub>1</sub>-score=0.86 and sensitivity=0.83).</p><p>Method-wise, the use of the 2-stage approach improved the overall performance of the majority of the models, where 62% of the models (13 models) exhibited improved macro <italic>F</italic><sub>1</sub>-scores, with 2 models showing no variations, while the remaining models exhibited minor declines. 
Class-wise, the 2-stage approach improved the podo sensitivity (podo-recall) of 5 models (using DenseNet121, MNv3-Large, ConvNeXt-Small, ConvNeXtV2-Atto, and FastViT) by the SD of +0.109, +0.218, +0.218, +0.054, and +0.435, respectively, with 3 models (using ConvNeXtV2-Tiny, EfficientViTB0, and FasterViT) showing declining podo-recall, while the other 13 models showed no changes.</p></sec></sec><sec id="s3-2"><title>Best Models Selection</title><p>Overall, the hybrid 2-stage approach applied using the 21 pretrained models (both CNN and transformer-based) yielded superior performance compared with other methods (baseline and transfer learning). Hence, as a DL architectural benchmarking study, we selected this hybrid architecture for further experiments applying optimization methods. However, to screen out short-listed top feature mapping models, we conducted deep and multidimensional analysis that includes performance stability analyses, as shown in <xref ref-type="fig" rid="figure4">Figure 4A and B</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Analysis of models&#x2019; performance stability across experiments. (A) Stability of the macro <italic>F</italic><sub>1</sub>-scores of each of the 21 models across the 2 experiments. (B) Stability of the mean sensitivity scores of the 21 models across the 2 experiments<bold>.</bold></p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig04.png"/></fig><p>As shown, the box plots clearly depict the stability of the models&#x2019; performance primarily in macro <italic>F</italic><sub>1</sub>-score and overall sensitivity (macro recall) across the 2 experiments. 
Overall, the models fall into three major categories: (1) stable and least-performing models&#x2014;models trained using features extracted by EfficientNetB0 and EfficientNetV2B0, where the use of these models for feature extraction resulted in the worst macro <italic>F</italic><sub>1</sub>-score and sensitivity, while the other 6 feature extractors, EfficientNetB5, EfficientNetB3, MNv3-Small, EfficientNetB1, ResNet18, and EfficientNetV2S, resulted in comparatively better performance scores; (2) the unstable models&#x2014;models trained on ResNet50, ConvNeXt-Small, ConvNeXt-Tiny, MNv3-Large, FasterViT0, and FastViT-T8, for feature extraction, where the models using these extractors scored highly unpredictable performance scores (macro <italic>F</italic><sub>1</sub> and sensitivity), as shown by their wide-range scores (tall boxes); and (3) the stable and top-performing models&#x2014;representing 7 models trained using the features extracted by Xception, DenseNet, MNv2, ConvNeXtV2-Tiny, ConvNeXtV2-Atto, EfficientViTB0, and RepViT. The models using these 7 pretrained models exhibited minimum variability in their macro <italic>F</italic><sub>1</sub>-score and sensitivity, while achieving the highest median macro <italic>F</italic><sub>1</sub>-score (0.88) and median mean sensitivity (0.86) across the 2 experiments, as shown in <xref ref-type="fig" rid="figure4">Figure 4A and B</xref>.</p><p>The next analysis includes performance comparison using 7 parameters, as shown by the radar plot in <xref ref-type="fig" rid="figure5">Figure 5</xref>. As shown, only 7 models using EfficientViTB0, RepViT, MNv2, Xception, DenseNet, ConvNeXtV2-Tiny, and ConvNeXtV2-Atto feature maps showed exceptional performance, collectively having a median <italic>F</italic><sub>1</sub>-score (macro) exceeding 0.95, AUPRC (macro) approaching 1.0, with a comparatively lower median podo-<italic>F</italic><sub>1</sub>-score approaching 0.90. 
However, the other 14 models (other than the 7 top-performing models) scored average macro <italic>F</italic><sub>1</sub>-scores &#x003C;0.89, average AUPRC&#x003C;0.92, while showing the worst sensitivity for the minority class with median podo-<italic>F</italic><sub>1</sub>-score below 0.35.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Overall model performance comparison for best model selection. The radar plot illustrates analysis of the models&#x2019; performance based on the average scores of the 7 evaluation metrics across the 2 experimental settings, depicting the exceptionally outstanding performance of the 7 top-performing models out of the 21. AUPRC: area under the precision-recall curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig05.png"/></fig><p>All these results underlined that the 7 models identified as top-performing models (<xref ref-type="fig" rid="figure5">Figure 5</xref>) provided higher feature representation capabilities that led to better overall model performance. Hence, the EfficientViTB0, RepViT, MNv2, Xception, DenseNet, ConvNeXtV2-Tiny, and ConvNeXtV2-Atto models are selected for further experiments being used for feature mapping. However, the remaining 14 models struggled to extract useful features, which led to poor overall performance, and are screened out from being used in further experiments.</p></sec><sec id="s3-3"><title>Phase 2: Performance Optimization</title><p>On the second phase of our experimental framework, the selected 7 models are retrained by applying the data augmentation method, and the results in <xref ref-type="table" rid="table6">Table 6</xref> are achieved. 
In this experiment, the model trained using the MNv2 feature mapping significantly outperformed all other models with macro <italic>F</italic><sub>1</sub>-score of 0.95 and sensitivity of 0.93, while scoring the fourth highest AUPRC (0.97) and the fourth least errors (loss=0.137). As shown, the models using DenseNet121 and EfficientViTB0 extractors scored the next closer performance, where the model trained using DenseNet121 mapping achieved the second highest macro <italic>F</italic><sub>1</sub>-score and mean sensitivity of 0.928, while the model using EfficientViTB0 for feature mapping scored a macro <italic>F</italic><sub>1</sub>-score of 0.926 and mean sensitivity of 0.905.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Performance of the 7 screened models in the second phase experiment with data augmentation<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Log-loss</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (macro)</td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> (mean)</td><td align="left" valign="bottom">Sensitivity (mean)</td></tr></thead><tbody><tr><td align="left" valign="top">Xception</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.158</td><td align="left" valign="top">0.901</td><td align="left" valign="top">0.982</td><td align="left" valign="top">0.884</td></tr><tr><td align="left" valign="top">DenseNet121</td><td align="left" valign="top">0.966</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.928</td><td align="left" valign="top">0.974</td><td align="left" valign="top">0.928</td></tr><tr><td align="left" valign="top">MNv2</td><td align="left" 
valign="top"><italic>0.973</italic></td><td align="left" valign="top"><italic>0.137</italic></td><td align="left" valign="top"><italic>0.953</italic></td><td align="left" valign="top"><italic>0.967</italic></td><td align="left" valign="top"><italic>0.933</italic></td></tr><tr><td align="left" valign="top">ConvNeXtV2-Tiny</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.128</td><td align="left" valign="top">0.908</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.869</td></tr><tr><td align="left" valign="top">ConvNeXtV2-Atto</td><td align="left" valign="top">0.953</td><td align="left" valign="top">0.209</td><td align="left" valign="top">0.922</td><td align="left" valign="top">0.944</td><td align="left" valign="top">0.891</td></tr><tr><td align="left" valign="top">EfficientViTB0</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.098</td><td align="left" valign="top">0.926</td><td align="left" valign="top">0.985</td><td align="left" valign="top">0.905</td></tr><tr><td align="left" valign="top">RepViT-M0.9</td><td align="left" valign="top">0.956</td><td align="left" valign="top">0.168</td><td align="left" valign="top">0.914</td><td align="left" valign="top">0.959</td><td align="left" valign="top">0.897</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Values presented in italics represent high-performance models. 
</p></fn><fn id="table6fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn></table-wrap-foot></table-wrap><p>Overall, during this experiment, the 7 feature extraction pretrained models (MNv2, DenseNet121, EfficientViTB0, ConvNeXtV2-Atto, RepViT, ConvNeXtV2-Tiny, and Xception) helped their corresponding skin NTDs classification models to score macro <italic>F</italic><sub>1</sub>-scores &#x003E;0.90, AUPRC &#x003E;0.94, and mean sensitivity &#x003E;0.86, with overall loss &#x2264;0.21.</p></sec><sec id="s3-4"><title>Final Models Selection</title><p>For the systematic selection of the final 2 best feature extraction pretrained models that provide the best feature mapping for the skin NTDs classification models, we applied the robust weighted score comparison method defined in the &#x201C;Methods&#x201D; section of the study. Hence, applying our predefined WSS formula, we computed the WSS for the scores achieved on the third experiment that applies data augmentation and for the overall average performance of the models across the 3 experiments. For each group of evaluation, we applied two different sets of weights for the metrics: (1) weights given only to the predictive performance metrics&#x2014;equal weights of 0.25 are given to only the 4 predictive metrics (macro <italic>F</italic><sub>1</sub>, recall, podo-recall, and podo-<italic>F</italic><sub>1</sub>), while the speed and number of parameters are not considered to evaluate only predictive performance; and (2) weights given to all the 6 metrics&#x2014;the predefined weights of 0.2 given for the 4 predictive performance metrics, while the efficiency metrics are given the weights of 0.1. 
<xref ref-type="table" rid="table7">Table 7</xref> summarizes the overall results of the combined weighted evaluation method.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Combined weighted method results for the 7 prescreened models (final model selection)<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup>.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="4">Performance metrics</td><td align="left" valign="bottom" colspan="2">Efficiency metrics</td><td align="left" valign="bottom" colspan="2">Weighted scores (optimization)</td><td align="left" valign="bottom" colspan="2">Weighted scores (average)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<break/>(macro)</td><td align="left" valign="bottom">Podo-F1</td><td align="left" valign="bottom">Macro recall<break/>(mean)</td><td align="left" valign="bottom">Podo-recall</td><td align="left" valign="bottom">Speed (sps)</td><td align="left" valign="bottom">Parameters</td><td align="left" valign="bottom">WSS<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup><break/>(predictive)</td><td align="left" valign="bottom">WSS<break/>(overall)</td><td align="left" valign="bottom">WSS<break/>(predictive)</td><td align="left" valign="bottom">WSS<break/>(overall)</td></tr></thead><tbody><tr><td align="left" valign="top">RepViT-M0.9</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.09</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.335</td><td align="left" valign="top">0.459</td><td align="left" valign="top"><italic>0.974</italic></td><td align="left" valign="top"><italic>0.975</italic></td></tr><tr><td align="left" valign="top">DenseNet121</td><td 
align="left" valign="top"><italic>0.10</italic></td><td align="left" valign="top"><italic>0.06</italic></td><td align="left" valign="top"><italic>0.18</italic></td><td align="left" valign="top"><italic>0.20</italic></td><td align="left" valign="top"><italic>0.07</italic></td><td align="left" valign="top"><italic>0.06</italic></td><td align="left" valign="top"><italic>0.681</italic></td><td align="left" valign="top"><italic>0.671</italic></td><td align="left" valign="top">0.511</td><td align="left" valign="top">0.528</td></tr><tr><td align="left" valign="top">EfficientViTB0</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.11</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.06</td><td align="left" valign="top">0.424</td><td align="left" valign="top">0.399</td><td align="left" valign="top"><italic>0.784</italic></td><td align="left" valign="top"><italic>0.772</italic></td></tr><tr><td align="left" valign="top">Xception</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.222</td><td align="left" valign="top">0.211</td><td align="left" valign="top">0.855</td><td align="left" valign="top">0.684</td></tr><tr><td align="left" valign="top">ConvNeXtV2-Atto</td><td align="left" valign="top">0.08</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.07</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.442</td><td align="left" valign="top">0.554</td><td align="left" valign="top">0.009</td><td align="left" valign="top">0.204</td></tr><tr><td align="left" valign="top">MNv2</td><td align="left" 
valign="top"><italic>0.20</italic></td><td align="left" valign="top"><italic>0.20</italic></td><td align="left" valign="top"><italic>0.20</italic></td><td align="left" valign="top"><italic>0.20</italic></td><td align="left" valign="top"><italic>0.07</italic></td><td align="left" valign="top"><italic>0.04</italic></td><td align="left" valign="top"><italic>1.0</italic></td><td align="left" valign="top"><italic>0.916</italic></td><td align="left" valign="top">0.688</td><td align="left" valign="top">0.685</td></tr><tr><td align="left" valign="top">ConvNeXtV2-Tiny</td><td align="left" valign="top">0.03</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.00</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.07</td><td align="left" valign="top">0.034</td><td align="left" valign="top">0.155</td><td align="left" valign="top">0.241</td><td align="left" valign="top">0.367</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>Values presented in italics represent high-performance models.</p></fn><fn id="table7fn2"><p><sup>b</sup>WSS: weighted sum of scores.</p></fn></table-wrap-foot></table-wrap><p>As shown, the model using the MNv2 feature mapping showed the highest combined weighted score, with an overall WSS of 0.92 on the aggregate weighted scores. This model also showed the best combined weighted score on the predictive performance metrics, with a WSS of 1.0. The other model, using the DenseNet121 mapping, had the second highest weighted scores, with an overall WSS of 0.67 and a WSS of 0.68 on the predictive performance. In the same experiment, the model using ConvNeXtV2-Atto (0.55) showed the next highest overall WSS, followed by the model using RepViT-M0.9 extractors (0.46). In this last experiment, the hybrid transformer-based architecture EfficientViTB0 showed lower weighted scores (0.399 and 0.424) on the overall and predictive WSS, respectively. 
Conversely, using the aggregated performance derived by computing the average scores of each metric for all models across 3 experiments, the combined weighted scores present a completely different set of scores. Based on these results, the model using the RepViT-M0.9 extractors showed exceptionally high combined weighted scores on both the overall WSS (0.98) and the predictive WSS (0.97), followed by the model using EfficientViTB0 with an overall WSS (0.77) and predictive WSS (0.78). The models based on MNv2 and DenseNet121 showed the next higher weighted scores (overall WSS of 0.69 and 0.53, respectively) on the aggregated scores of the models across the 3 experiments.</p><p>Overall, the data augmentation method is applied to optimize the models&#x2019; performance, given the study faced the data scarcity problem. However, the models using the RepViT-M0.9 and EfficientViTB0 extractors unexpectedly reacted negatively to this optimization method that expands the sample size and data variance, while the same method significantly boosted the performance of the models based on the MNv2 and DenseNet121 feature mapping. These results reveal a generalization paradox problem, showing (1) overspecialization&#x2014;the models based on the RepViT-M0.9 and EfficientViTB0 extractors are experiencing performance instability and a potential overfitting due to memorization of the initial small-sized training data, and (2) latent generalization&#x2014;the models using the MNv2 and DenseNet121 extractors showed their hidden generalization abilities that are exposed as the result of the data augmentation method. 
Therefore, based on these major driving facts, we selected the MNv2 and DenseNet121 pretrained models for feature mapping in our final skin NTDs diagnostic model.</p></sec><sec id="s3-5"><title>Phase 3: The Final Optimization</title><p>On the final experiment, hyperparameter tuning is applied on the last 2 models selected, which helped the model using the DenseNet121 feature extractor achieve improved performance, as shown in <xref ref-type="table" rid="table8">Table 8</xref>.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Performance scores of the final 2 selected models after hyperparameter tuning<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup>.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Macro<break/><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Loss</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup> (average)</td><td align="left" valign="bottom">AUPRC<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> (average)</td><td align="left" valign="bottom" colspan="4">Recall</td><td align="left" valign="bottom" colspan="4"><italic>F</italic><sub>1</sub>-score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Podoconiosis</td><td align="left" valign="bottom">Tungiasis</td><td align="left" valign="bottom">Scabies</td><td align="left" valign="bottom">Macro recall</td><td align="left" valign="bottom">Podoconiosis</td><td align="left" valign="bottom">Tungiasis</td><td align="left" valign="bottom">Scabies</td><td align="left" 
valign="bottom">Macro<break/><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">DenseNet121</td><td align="left" valign="top"><italic>0.966</italic></td><td align="left" valign="top"><italic>0.946</italic></td><td align="left" valign="top"><italic>0.181</italic></td><td align="left" valign="top"><italic>0.996</italic></td><td align="left" valign="top"><italic>0.974</italic></td><td align="left" valign="top"><italic>0.923</italic></td><td align="left" valign="top"><italic>0.979</italic></td><td align="left" valign="top"><italic>0.974</italic></td><td align="left" valign="top"><italic>0.959</italic></td><td align="left" valign="top"><italic>0.889</italic></td><td align="left" valign="top"><italic>0.968</italic></td><td align="left" valign="top"><italic>0.982</italic></td><td align="left" valign="top"><italic>0.946</italic></td></tr><tr><td align="left" valign="top">MNv2</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.906</td><td align="left" valign="top">0.150</td><td align="left" valign="top">0.987</td><td align="left" valign="top">0.955</td><td align="left" valign="top">0.769</td><td align="left" valign="top">0.947</td><td align="left" valign="top">0.979</td><td align="left" valign="top">0.898</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.937</td><td align="left" valign="top">0.982</td><td align="left" valign="top">0.906</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>Values presented in italics represent high-performance models.</p></fn><fn id="table8fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table8fn3"><p><sup>c</sup>AUPRC: area under the precision-recall curve.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="fig" rid="figure6">Figures 6A-6D</xref> summarizes the overall performance of the final 2 models trained using the DenseNet121 and MNv2 feature extractors after 
hyperparameter tuning.</p><p>After hyperparameter tuning, the DenseNet121-based model scored improved performance, compared with the previous experiment with data augmentation, achieving an accuracy of 0.966, <italic>F</italic><sub>1</sub>-score of 0.95 (&#x0394;=+0.018), and mean sensitivity of 0.96 (&#x0394;=+0.031). However, the MNv2-based model exhibited declining performance in <italic>F</italic><sub>1</sub>-score (&#x0394;=&#x2212;0.047) and macro recall (&#x0394;=&#x2212;0.035), including podo-specific recall and <italic>F</italic><sub>1</sub>-score.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Comprehensive classification evaluation of the final 2 models. (A) Confusion matrix showing class-wise results for the model trained using DenseNet121 extractor, (B) AUROC and area under the precision-recall curve (AUPRC) of the model using DenseNet121 feature extractor, (C) confusion matrix showing class-wise results for the model trained using MNv2 extractor, and (D) AUROC and AUPRC of the model using MNv2 feature map. AUC: area under the curve; AUROC: area under the receiver operating characteristic curve; PR: precision-recall; ROC: receiver operating characteristic.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="derma_v9i1e91544_fig06.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>As a DL architectural benchmarking effort, this study developed a diagnostic model for skin NTDs based on the funnel framework with an extensive experiment-based cascaded architectural screening. A significant portion of the experimental design involved identifying robust pretrained DL architectures for feature mapping to be used with our custom-designed CNN model. 
On average, across the first 2 phases (5 experiments), the custom-designed CNN model and 10 other models trained using the feature mapping architectures yielded poor overall performance with macro <italic>F</italic><sub>1</sub>-scores below 0.50, 5 models (using MNv3-Large, ConvNeXt-Small, FasterViT, FastViT, and ConvNeXtV2-Atto extractors) showed moderate performance with macro <italic>F</italic><sub>1</sub>-scores above 0.50, and 6 models using EfficientViTB0, RepViT, MNv2, Xception, DenseNet121, and ConvNeXtV2-Tiny extractors achieved exceptionally high performance with <italic>F</italic><sub>1</sub>-scores exceeding 0.90 and sensitivity above 0.87. This exceptionally high performance of the top 6 models is attributed to their robust feature mapping logic that allowed deriving high-dimensional representative features, given our small-sized skin NTDs dataset. Method-wise, the 2-stage approach applying data augmentation resulted in performance improvements for the majority of the models, which underscores the success of the 2-stage approach for skin NTDs classification, given the dataset problems.</p><p>The other critical finding of this study highlighted that modern and state-of-the-art DL architectures such as EfficientNetB5 and EfficientNetB3, even the highly anticipated lightweight models of EfficientNetB0 variants, MNv3-Large and MNv3-Small, were unable to extract useful features from the dataset used. All these models proved to be highly data-intensive and extremely sensitive to class imbalance. Conversely, DenseNet121, MNv2, Xception, including the comparatively recent DL architectures of ConvNeXtV2 (Tiny/Atto), and RepViT provided highly robust feature mapping capability, while the transformer-based hybrid architecture EfficientViTB0 showed its potential. Ultimately, the DenseNet121 and MNv2 demonstrated their robustness across the 5 different experimental trainings, including the final optimization. 
Hence, these 2 models are selected due to their overall feature mapping capability to be integrated with our custom-designed CNN model. Overall, the success of the DenseNet121 model is architecturally linked to its dense connectivity patterns and inductive biases. These factors allowed the trained classifier to reuse feature maps from previous layers through dense blocks, which expands the feature maps into more representative data. Likewise, the success of the MNv2 models is attributed to its internal information bottleneck method. To combat the dataset-related problems such as high variance and possible data memorization, several regularization strategies were synergistically applied. The strategies included data augmentation, batch normalization, dropout, and early stopping with a synchronized implementation of callbacks. While these methods successfully improved model generalization on unseen test sets, regularization alone was insufficient to fully address the overfitting issues caused by the severely skewed distribution of disease classes. Specifically, for the 12 models excluded after the first phase experiments, all these regularization methods were unable to improve performance even with the application of data augmentation. This clearly proves that the severe class imbalance, especially between scabies and podoconiosis, greatly affected the models&#x2019; performance. This clearly requires experimental investigation by applying the different methods for class balancing, both at the data and algorithm levels.</p></sec><sec id="s4-2"><title>The Synergistic Hybrid Approach</title><p>Overall, the most effective strategy identified was the 2-stage hybrid approach that combined the high-performing feature extractors with the properly designed CNN classification module. 
This final harmonized integration involved 3 key strategies: the robust feature extraction architectures, the optimal CNN classification head configuration, and the dynamic data augmentation methods. The harmonized use of all these 3 strategies, along with hyperparameter tuning, provided high-performance classification models by ensuring that each model operated on the optimized output of its preceding module. Ultimately, this study selected the final 2 skin NTD diagnosis models that achieved an <italic>F</italic><sub>1</sub>-score &#x003E; 0.95 using MNv2, and an <italic>F</italic><sub>1</sub>-score &#x003E; 0.93 using DenseNet121 after data augmentation. All these higher-performance scores are attributed to the combined use of all DL methods, and feature mapping models we use based on our 30-layered very deep CNN model have brought tangible disease-predictive performance improvements across all models.</p><p>The integrated 2-stage approach demonstrated its potential for skin NTDs diagnosis, given the study used a dataset with severe class imbalance and small sample images. While most of the DL methods and pretrained models used in the 2-stage approach are preexisting, the harmonization of the feature mapping with our 10-layered classification head created a different DL architecture. This architecture allowed us to establish information bottlenecks using the compact, highly focused, and properly regularized 10-layer classification head that trains on the extracted features. This new architecture added robustness to our model development pipeline, allowing us to build top-performance skin NTDs diagnostic models using the highly constrained dataset, compared with the standard transfer learning method using the same pretrained models. 
Accordingly, the improved sensitivity of models for the minority class (podo-recall) confirmed the achievements of the architecture, where 5 models showed their sensitivity for podoconiosis, while 62% of the models (13 models) have shown lower overall performance and no variation in podo-recall. These facts clearly underline that the 2-stage approach provided methodological solutions to the severe class imbalance problem. Furthermore, the approach also demonstrated domain adaptation through its two separate modules: (1) feature extraction module&#x2014;the feature mapping logics of the models initially trained on ImageNet-1K adapted for our skin NTDs diagnostic model, where these diseases have diverse lesion types and textures such as &#x201C;mossy limbs&#x201D; that require symmetric analysis of limbs to detect podo based on swellings; and (2) feature refinement module&#x2014;the highly regularized 10-layered classification head that applies further refinements on the feature matrices extracted by the mapping models.</p></sec><sec id="s4-3"><title>Clinical Relevance and Further Considerations</title><p>This study developed benchmark image-based diagnostic model(s) for skin NTDs using the traditional augmentation method to initially alleviate data scarcity issues (limited sample images with severe class imbalance). We applied this method on our sample input image using our predefined basic geometric transformations at the time of model training to mathematically simulate the real-world diagnostics of skin NTDs. In the actual real-world scenarios, skin NTDs are highly prevalent in remote underserved areas, where the diagnostics of the diseases are mostly undertaken by middle-level health care workers under low-resource settings. Hence, our model design assumed these scenarios that our model is used by middle-level health care workers without the need for highly sophisticated devices, where moderate-quality handheld smartphones can properly serve the diagnostics. 
Accordingly, we added the basic transformation techniques to our augmentation pipeline that includes rotation (20 degrees), shearing (range=0.1), and zooming (range=0.1) to avoid variations in our models&#x2019; predictions that can be caused by variable camera angles, inconsistent framing, and varying focal distances in photographs of the skin NTDs. We also added the horizontal flipping transformation to maintain anatomical symmetry, and random brightness adjustments are also applied to simulate the diagnostic practices that can be conducted under different lighting adjustments.</p><p>While the study achieved its objective, the data scarcity issues remained the major challenges. However, the top performance scores achieved by the final 2 optimized models (the models using the MNv2 and DenseNet121 feature maps) underlined that further experiments using advanced DL-based data augmentations and balancing methods have the potential of further improving the predictive performance of the models. Hence, using each of the selected benchmark models as a basis, advanced DL-based methods such as the use of GAN-generated samples and weighted balancing methods are expected to significantly boost the models&#x2019; performance, as confirmed by the performance improvements achieved by applying the traditional dynamic data augmentation.</p><p>Additionally, as a clinical diagnostic model, the other important aspect expected from such systems is the treatment facility, once the diseases are correctly identified with higher diagnostic accuracy. Generally, the incorporation of treatment recommendations represents the deployment-ready (final) stage of a diagnostic tool, mostly indicating that the model passed several improvement and validation stages. It also requires further system validation and authorization. 
Therefore, marking the current gaps, incorporation of treatment recommendations, and performance improvements using advanced DL methods are the immediate research tasks.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Ultimately, this study developed a benchmark image-based diagnostic model for skin NTDs through a robust hybrid DL pipeline using a novel skin NTDs dataset collected from a remote area representing an underserved community in the Southwest of Ethiopia. The study developed a new baseline model, applied transfer learning, designed a 2-stage approach, and applied dynamic data augmentation, where all experiments were conducted based on a novel research framework proposed for this study, the funnel framework. Using the funnel framework, optimal methods and high-performing models were selected in a phased approach. With the 2-stage approach being the best model-building approach, the DenseNet121 and MobileNetV2 were top-performing feature extractors. Finally, after the last training applying hyperparameter tuning, the models trained using these extractors still showed performance improvements, DenseNet121 (accuracy=96.6% and macro <italic>F</italic><sub>1</sub>-score=0.946) and MobileNetV2 (accuracy=96.0% and macro <italic>F</italic><sub>1</sub>-score=0.906). Therefore, the study ultimately selected the DenseNet121 and MobileNetV2 models for feature mapping (extraction) for the final DL-based skin NTDs diagnostic models.</p><p>While we developed the intended model for the diagnosis of skin NTDs using the novel skin images dataset, this study also exhibits some downsides that limit the study from contributing to its potential. Data scarcity and severe class imbalance are the primary challenges. The number of diseases represented in this study is also limited to only 3 diseases while there are more than 8 skin NTDs prevalent in Ethiopia. 
Additionally, the study does not demonstrate data-balancing methods due to resource-related constraints and the extended experiments required. Furthermore, only image data were considered to develop the proposed diagnostic model. Therefore, we recommend further efforts to address the mentioned limitations. First, it is recommended that additional data must be collected from different affected areas (if applicable) with the data being representative of all disease classes. Additionally, DL-based class-balancing methods, such as conditional augmentation based on generative models, are recommended to be experimented. We also suggest the inclusion of multiple types of data, such as text-based patients&#x2019; data, to expand the dimensionality of the dataset used.</p></sec></sec></body><back><ack><p>The authors of this study gratefully acknowledge Mr Alemayehu Bekele and his team for providing the data used to complete this study, and the technical support he provided us. This study used the data collected from patients with skin Neglected Tropical Diseases (skin NTDs) living in Gacho Baba District of the Gamo Zone, Southwest Ethiopia. The data were initially collected by Mr. Alemayehu Bekele for a project-based research intended to assess the burden of skin NTDs through community screening, led by Mr Alemayehu Bekele from the Collaborative Research and Training Center for Neglected Tropical Disease, College of Medicine Health Sciences of AMU. The authors acquired the dataset for this study through collaboration after having the required ethical clearance letter. All authors declared that they had insufficient funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee support for the publication of this paper. 
Finally, the authors would like to declare that no generative tools and generated contents are used for the preparation of any part of this study.</p></ack><notes><sec><title>Funding</title><p>The authors declared that no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>Currently, the new skin NTDs dataset used for this study is not publicly available, as it requires the final verification and consensus regarding dissemination.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AMU</term><def><p>Arba Minch University</p></def></def-item><def-item><term id="abb2">AUPRC</term><def><p>area under the precision-recall curve</p></def></def-item><def-item><term id="abb3">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb4">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb5">DL</term><def><p>deep learning</p></def></def-item><def-item><term id="abb6">GAN</term><def><p>generative adversarial network</p></def></def-item><def-item><term id="abb7">GFLOP</term><def><p>Giga floating point operation</p></def></def-item><def-item><term id="abb8">IRB</term><def><p>institutional review board</p></def></def-item><def-item><term id="abb9">ISIC</term><def><p>International Skin Imaging Collaboration</p></def></def-item><def-item><term id="abb10">MDA</term><def><p>mass drug administration</p></def></def-item><def-item><term id="abb11">NTD</term><def><p>Neglected Tropical Disease</p></def></def-item><def-item><term id="abb12">skin NTD</term><def><p>skin Neglected Tropical Disease</p></def></def-item><def-item><term id="abb13">WSS</term><def><p>weighted sum of scores</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Yohannes Minyilu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yimer</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Meshesha</surname><given-names>M</given-names> </name></person-group><article-title>Application of deep learning-based multimodal data fusion for the diagnosis of skin neglected tropical diseases: systematic review</article-title><source>JMIR AI</source><year>2025</year><month>12</month><day>4</day><volume>4</volume><fpage>e67584</fpage><pub-id pub-id-type="doi">10.2196/67584</pub-id><pub-id pub-id-type="medline">41344666</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deribe</surname><given-names>K</given-names> </name><name name-style="western"><surname>Meribo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gebre</surname><given-names>T</given-names> </name><etal/></person-group><article-title>The burden of neglected tropical diseases in Ethiopia, and opportunities for integrated control and elimination</article-title><source>Parasit Vectors</source><year>2012</year><month>10</month><day>24</day><volume>5</volume><issue>1</issue><fpage>240</fpage><pub-id pub-id-type="doi">10.1186/1756-3305-5-240</pub-id><pub-id pub-id-type="medline">23095679</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="report"><article-title>The third national neglected tropical diseases strategic plan 2021-2025</article-title><year>2021</year><access-date>2026-06-16</access-date><publisher-name>Ministry of Health</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://espen.afro.who.int/sites/default/files/content/document/Third%20NTD%20national%20Strategic%20Plan%202021-2025_0.pdf">https://espen.afro.who.int/sites/default/files/content/document/Third%20NTD%20national%20Strategic%20Plan%202021-2025_0.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Ethiopia: country disease outlook</article-title><source>World Health Organization, Regional Office for Africa, Country Disease Outlook</source><year>2023</year><month>08</month><access-date>2026-06-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.afro.who.int/sites/default/files/2023-08/Ethiopia.pdf">https://www.afro.who.int/sites/default/files/2023-08/Ethiopia.pdf</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="report"><article-title>Elimination of neglected tropical diseases (NTDS) in Ethiopia, Woreda level coordination toolkit</article-title><year>2019</year><access-date>2026-06-16</access-date><publisher-name>Federal Ministry of Health</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.susana.org/_resources/documents/default/3-3709-7-1571741955.pdf">https://www.susana.org/_resources/documents/default/3-3709-7-1571741955.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>Skin NTDS laboratory network (Skin NTDS LABNET)</article-title><source>World Health Organization</source><access-date>2026-06-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/groups/skin-ntds-laboratory-network">https://www.who.int/groups/skin-ntds-laboratory-network</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdela</surname><given-names>SG</given-names> 
</name><name name-style="western"><surname>Diro</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zewdu</surname><given-names>FT</given-names> </name><etal/></person-group><article-title>Looking for NTDs in the skin; an entry door for offering patient centered holistic care</article-title><source>J Infect Dev Ctries</source><year>2020</year><month>06</month><day>29</day><volume>14</volume><issue>6.1</issue><fpage>16S</fpage><lpage>21S</lpage><pub-id pub-id-type="doi">10.3855/jidc.11707</pub-id><pub-id pub-id-type="medline">32614791</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Postigo</surname><given-names>JAR</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>P</given-names> </name><name name-style="western"><surname>Asiedu</surname><given-names>K</given-names> </name></person-group><article-title>Skin health for all: update on skin neglected tropical diseases with a focus on buruli ulcer and yaws</article-title><access-date>2026-06-16</access-date><publisher-name>Mectizan Donation Program</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://iris.who.int/server/api/core/bitstreams/54e59c06-67c7-4c86-8ca9-f210448dab18/content">https://iris.who.int/server/api/core/bitstreams/54e59c06-67c7-4c86-8ca9-f210448dab18/content</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quilter</surname><given-names>EEV</given-names> </name><name name-style="western"><surname>Butlin</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Carrion</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ruiz-Postigo</surname><given-names>JA</given-names> 
</name></person-group><article-title>The WHO Skin NTD mobile application&#x2014;a paradigm shift in leprosy diagnosis through Artificial Intelligence?</article-title><source>Lepr Rev</source><year>2024</year><month>06</month><volume>95</volume><issue>2</issue><fpage>e2024030</fpage><pub-id pub-id-type="doi">10.47276/lr.95.2.2024030</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steyve</surname><given-names>N</given-names> </name><name name-style="western"><surname>Steve</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ghislain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ndjakomo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pierre</surname><given-names>E</given-names> </name></person-group><article-title>Optimized real-time diagnosis of neglected tropical diseases by automatic recognition of skin lesions</article-title><source>Inform Med Unlocked</source><year>2022</year><volume>33</volume><fpage>101078</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2022.101078</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brigato</surname><given-names>L</given-names> </name><name name-style="western"><surname>Barz</surname><given-names>B</given-names> </name><name name-style="western"><surname>Iocchi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Denzler</surname><given-names>J</given-names> </name></person-group><article-title>Image classification with small datasets: overview and benchmark</article-title><source>IEEE Access</source><year>2022</year><volume>10</volume><fpage>49233</fpage><lpage>49250</lpage><pub-id 
pub-id-type="doi">10.1109/ACCESS.2022.3172939</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sales</surname><given-names>C</given-names> </name><name name-style="western"><surname>Coates</surname><given-names>SJ</given-names> </name></person-group><article-title>Applications of artificial intelligence for high-burden, underserved skin diseases in global settings: a review</article-title><source>Curr Derm Rep</source><year>2025</year><month>06</month><volume>14</volume><issue>1</issue><fpage>14</fpage><pub-id pub-id-type="doi">10.1007/s13671-025-00469-9</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hailu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gebre</surname><given-names>T</given-names> </name><name name-style="western"><surname>Seife</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Challenges and strategies for mainstreaming neglected tropical diseases campaign interventions in Ethiopia</article-title><source>Am J Trop Med Hyg</source><year>2025</year><month>02</month><day>5</day><volume>112</volume><issue>2</issue><fpage>467</fpage><lpage>478</lpage><pub-id pub-id-type="doi">10.4269/ajtmh.24-0261</pub-id><pub-id pub-id-type="medline">39591648</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tekola Ayele</surname><given-names>F</given-names> </name><name name-style="western"><surname>Adeyemo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Finan</surname><given-names>C</given-names> </name><etal/></person-group><article-title>HLA class II locus and susceptibility to 
podoconiosis</article-title><source>N Engl J Med</source><year>2012</year><month>03</month><day>29</day><volume>366</volume><issue>13</issue><fpage>1200</fpage><lpage>1208</lpage><pub-id pub-id-type="doi">10.1056/NEJMoa1108448</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yotsu</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hamm</surname><given-names>J</given-names> </name><name name-style="western"><surname>Blanton</surname><given-names>RE</given-names> </name></person-group><article-title>Deep learning for AI-based diagnosis of skin-related neglected tropical diseases: a pilot study</article-title><source>PLoS Negl Trop Dis</source><year>2023</year><month>08</month><volume>17</volume><issue>8</issue><fpage>e0011230</fpage><pub-id pub-id-type="doi">10.1371/journal.pntd.0011230</pub-id><pub-id pub-id-type="medline">37578966</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pattnayak</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mohanty</surname><given-names>A</given-names> </name><name name-style="western"><surname>Das</surname><given-names>T</given-names> </name><name name-style="western"><surname>Patnaik</surname><given-names>S</given-names> </name></person-group><article-title>Applying artificial intelligence and deep learning to identify neglected tropical skin disorders</article-title><conf-name>2024 3rd International Conference for Innovation in Technology (INOCON)</conf-name><conf-date>Mar 1-3, 2024</conf-date><conf-loc>Bangalore, India</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id 
pub-id-type="doi">10.1109/INOCON60754.2024.10511323</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beesetty</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Leprosy skin lesion detection: an AI approach using few shot learning in a small clinical dataset</article-title><source>Indian J Lepr</source><year>2023</year><access-date>2026-06-16</access-date><volume>95</volume><fpage>89</fpage><lpage>102</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.ijl.org.in/published-articles/29062023223603/1-R-Beesetty-et-al-Final.pdf">https://www.ijl.org.in/published-articles/29062023223603/1-R-Beesetty-et-al-Final.pdf</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barnowska</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Fastenau</surname><given-names>A</given-names> </name><name name-style="western"><surname>Penna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bonkass</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Stuetzle</surname><given-names>S</given-names> </name><name name-style="western"><surname>Janssen</surname><given-names>R</given-names> </name></person-group><article-title>Diagnosing skin neglected tropical diseases with the aid of digital health tools: a scoping review</article-title><source>PLoS Digit Health</source><year>2024</year><month>10</month><volume>3</volume><issue>10</issue><fpage>e0000629</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000629</pub-id><pub-id pub-id-type="medline">39374195</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Tschandl</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rosendahl</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kittler</surname><given-names>H</given-names> </name></person-group><article-title>The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions</article-title><source>Sci Data</source><year>2018</year><month>08</month><volume>5</volume><issue>1</issue><fpage>180161</fpage><pub-id pub-id-type="doi">10.1038/sdata.2018.161</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cassidy</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kendrick</surname><given-names>C</given-names> </name><name name-style="western"><surname>Brodzicki</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jaworek-Korjakowska</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yap</surname><given-names>MH</given-names> </name></person-group><article-title>Analysis of the ISIC image datasets: usage, benchmarks and recommendations</article-title><source>Med Image Anal</source><year>2022</year><month>01</month><volume>75</volume><fpage>102305</fpage><pub-id pub-id-type="doi">10.1016/j.media.2021.102305</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>Survey on deep learning with class imbalance</article-title><source>J Big 
Data</source><year>2019</year><month>12</month><volume>6</volume><issue>1</issue><fpage>27</fpage><pub-id pub-id-type="doi">10.1186/s40537-019-0192-5</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A comprehensive survey on data augmentation</article-title><source>arXiv</source><comment>Preprint posted online on  Oct, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.09591</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shorten</surname><given-names>C</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>A survey on image data augmentation for deep learning</article-title><source>J Big Data</source><year>2019</year><month>12</month><volume>6</volume><issue>1</issue><fpage>60</fpage><pub-id pub-id-type="doi">10.1186/s40537-019-0197-0</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brennan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mileo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bendechache</surname><given-names>M</given-names> </name></person-group><article-title>Image data augmentation approaches: a comprehensive survey and future directions</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>187536</fpage><lpage>187571</lpage><pub-id 
pub-id-type="doi">10.1109/ACCESS.2024.3470122</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Islam</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hafiz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jim</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Kabir</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mridha</surname><given-names>MF</given-names> </name></person-group><article-title>A systematic review of deep learning data augmentation in medical imaging: recent advances and future research directions</article-title><source>Healthc Analytics</source><year>2024</year><month>06</month><volume>5</volume><fpage>100340</fpage><pub-id pub-id-type="doi">10.1016/j.health.2024.100340</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ribas</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Casaca</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fares</surname><given-names>RT</given-names> </name></person-group><article-title>Conditional generative adversarial networks and deep learning data augmentation: a multi-perspective data-driven survey across multiple application fields and classification architectures</article-title><source>AI</source><year>2025</year><month>02</month><volume>6</volume><issue>2</issue><fpage>32</fpage><pub-id pub-id-type="doi">10.3390/ai6020032</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wickramaratne</surname><given-names>SD</given-names> </name><name 
name-style="western"><surname>Mahmud</surname><given-names>M</given-names> </name></person-group><article-title>Conditional-GAN based data augmentation for deep learning task classifier improvement using fNIRS data</article-title><source>Front Big Data</source><year>2021</year><volume>4</volume><fpage>659146</fpage><pub-id pub-id-type="doi">10.3389/fdata.2021.659146</pub-id><pub-id pub-id-type="medline">34396092</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>X</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Arik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>WK</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name></person-group><article-title>Max-pooling dropout for regularization of convolutional neural networks</article-title><source>Neural Information Processing</source><year>2015</year><volume>9489</volume><publisher-name>Springer International Publishing</publisher-name><fpage>46</fpage><lpage>54</lpage><pub-id pub-id-type="doi">10.1007/978-3-319-26532-2_6</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cerqueira</surname><given-names>V</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Roque</surname><given-names>L</given-names> </name><name name-style="western"><surname>Baghoussi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Soares</surname><given-names>C</given-names> </name></person-group><article-title>Online data augmentation for forecasting with deep learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.16918</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Deep residual learning for image recognition</article-title><conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 27-30, 2016</conf-date><conf-loc>Las Vegas, NV, USA</conf-loc><fpage>770</fpage><lpage>778</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Feichtenhofer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Darrell</surname><given-names>T</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>S</given-names> 
</name></person-group><article-title>A convnet for the 2020s</article-title><conf-name>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-24, 2022</conf-date><pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01167</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Woo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Debnath</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>R</given-names> </name><etal/></person-group><article-title>ConvNeXt V2: co-designing and scaling convnets with masked autoencoders</article-title><conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 17-24, 2023</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>16133</fpage><lpage>16142</lpage><pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01548</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Van Der Maaten</surname><given-names>L</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name></person-group><article-title>Densely connected convolutional networks</article-title><conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jul 21-26, 2017</conf-date><conf-loc>Honolulu, HI</conf-loc><fpage>2261</fpage><lpage>2269</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2017.243</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chollet</surname><given-names>F</given-names> </name></person-group><article-title>Xception: deep learning with depthwise separable convolutions</article-title><conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jul 21-26, 2017</conf-date><conf-loc>Honolulu, HI</conf-loc><fpage>1800</fpage><lpage>1807</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2017.195</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Le</surname><given-names>QV</given-names> </name></person-group><article-title>EfficientNet: rethinking model scaling for convolutional neural networks</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1905.11946</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sandler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Howard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhmoginov</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>LC</given-names> </name></person-group><article-title>MobileNetV2: inverted residuals and linear bottlenecks</article-title><conf-name>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-23, 2018</conf-date><conf-loc>Salt Lake City, UT</conf-loc><fpage>4510</fpage><lpage>4520</lpage><pub-id 
pub-id-type="doi">10.1109/CVPR.2018.00474</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>TJ</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ferrari</surname><given-names>V</given-names> </name><name name-style="western"><surname>Hebert</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sminchisescu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Weiss</surname><given-names>Y</given-names> </name></person-group><article-title>NetAdapt: platform-aware neural network adaptation for mobile applications</article-title><source>Computer Vision ECCV</source><year>2018</year><volume>11214</volume><publisher-name>Springer International Publishing</publisher-name><fpage>289</fpage><lpage>304</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-01249-6_18</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Howard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sandler</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Searching for MobileNetV3</article-title><conf-name>2019 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 27 to Nov 2, 2019</conf-date><conf-loc>Seoul, South Korea</conf-loc><fpage>1314</fpage><lpage>1324</lpage><pub-id pub-id-type="doi">10.1109/ICCV.2019.00140</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Le</surname><given-names>QV</given-names> </name></person-group><article-title>EfficientNetV2: smaller models and faster training</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 1, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2104.00298</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Han</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>G</given-names> </name></person-group><article-title>Rep ViT: revisiting mobile CNN from ViT perspective</article-title><conf-name>2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 16-22, 2024</conf-date><conf-loc>Seattle, WA, USA</conf-loc><fpage>15909</fpage><lpage>15920</lpage><pub-id pub-id-type="doi">10.1109/CVPR52733.2024.01506</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hatamizadeh</surname><given-names>A</given-names> </name><etal/></person-group><article-title>FasterViT: fast vision transformers with hierarchical attention</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 9, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.06189</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Anasosalu Vasu</surname><given-names>PK</given-names> </name><name 
name-style="western"><surname>Gabriel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tuzel</surname><given-names>O</given-names> </name><name name-style="western"><surname>Ranjan</surname><given-names>A</given-names> </name></person-group><article-title>FastViT: a fast hybrid vision transformer using structural reparameterization</article-title><conf-name>2023 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 1-6, 2023</conf-date><conf-loc>Paris, France</conf-loc><fpage>5762</fpage><lpage>5772</lpage><pub-id pub-id-type="doi">10.1109/ICCV51070.2023.00532</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Han</surname><given-names>S</given-names> </name></person-group><article-title>EfficientViT: lightweight multi-scale attention for high-resolution dense prediction</article-title><conf-name>2023 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name><conf-date>Oct 1-6, 2023</conf-date><conf-loc>Paris, France</conf-loc><fpage>17256</fpage><lpage>17267</lpage><pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01587</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Socher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>LJ</given-names> </name></person-group><article-title>ImageNet: a large-scale hierarchical image database</article-title><conf-name>2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops)</conf-name><conf-date>Jun 20-25, 2009</conf-date><pub-id pub-id-type="doi">10.1109/CVPR.2009.5206848</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sahito</surname><given-names>A</given-names> </name><name name-style="western"><surname>Frank</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pfahringer</surname><given-names>B</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Gallagher</surname><given-names>M</given-names> </name><name name-style="western"><surname>Moustafa</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lakshika</surname><given-names>E</given-names> </name></person-group><article-title>Transfer of pretrained model weights substantially improves semi-supervised image classification</article-title><source>AI 2020: Advances in Artificial Intelligence</source><year>2020</year><volume>12576</volume><publisher-name>Springer International Publishing</publisher-name><fpage>433</fpage><lpage>444</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-64984-5_34</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alzubaidi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Al-Sabaawi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A survey on deep learning tools dealing with data scarcity: definitions, challenges, solutions, tips, and applications</article-title><source>J Big Data</source><year>2023</year><month>04</month><volume>10</volume><issue>1</issue><fpage>46</fpage><pub-id pub-id-type="doi">10.1186/s40537-023-00727-2</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rukundo</surname><given-names>O</given-names> </name></person-group><article-title>Effects of image size on deep learning</article-title><source>Electronics (Basel)</source><year>2023</year><month>02</month><volume>12</volume><issue>4</issue><fpage>985</fpage><pub-id pub-id-type="doi">10.3390/electronics12040985</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jamieson</surname><given-names>K</given-names> </name><name name-style="western"><surname>DeSalvo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Rostamizadeh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talwalkar</surname><given-names>A</given-names> </name></person-group><article-title>Hyperband: a novel bandit-based approach to hyperparameter optimization</article-title><source>arXiv</source><comment>Preprint posted online on Jun 18, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1603.06560</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Chiangpradit</surname><given-names>M</given-names> </name><name name-style="western"><surname>Busababodhin</surname><given-names>P</given-names> </name></person-group><article-title>Hyperband-optimized CNN-BiLSTM with attention mechanism for corporate financial distress prediction</article-title><source>Appl Sci (Basel)</source><year>2025</year><month>05</month><volume>15</volume><issue>11</issue><fpage>5934</fpage><pub-id pub-id-type="doi">10.3390/app15115934</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Distribution of data samples in the new skin neglected tropical diseases image dataset. The figure illustrates the class imbalance among the 3 disease classes: scabies (dark blue, 63.9%), tungiasis (teal, 31.7%), and podoconiosis (light green, 4.4%).</p><media xlink:href="derma_v9i1e91544_app1.png" xlink:title="PNG File, 28 KB"/></supplementary-material></app-group></back></article>