<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Nursing</journal-id><journal-id journal-id-type="publisher-id">nursing</journal-id><journal-id journal-id-type="index">33</journal-id><journal-title>JMIR Nursing</journal-title><abbrev-journal-title>JMIR Nursing</abbrev-journal-title><issn pub-type="epub">2562-7600</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v9i1e88567</article-id><article-id pub-id-type="doi">10.2196/88567</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Voice-Based Structured Nursing Documentation Using Automatic Speech Recognition and Large Language Models: Development and Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Su</surname><given-names>Meng-Han</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Wei-Chun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hsu</surname><given-names>Yi-Min</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hou</surname><given-names>Shih-Yen</given-names></name><degrees>MS</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chuang</surname><given-names>Su-Jung</given-names></name><degrees>MHA</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Shih-Sheng</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Artificial Intelligence and Robotics Innovation Center, China Medical University Hospital, China Medical University</institution><addr-line>No. 2, Yude Rd, North Dist</addr-line><addr-line>Taichung</addr-line><country>Taiwan</country></aff><aff id="aff2"><institution>China Medical University Hospital</institution><addr-line>Taichung</addr-line><country>Taiwan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Risling</surname><given-names>Tracie</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mcdonald</surname><given-names>Margaret</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ng</surname><given-names>Qin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Wei-Chun Wang, MD, Artificial Intelligence and Robotics Innovation Center, China Medical University Hospital, China Medical University, No. 
2, Yude Rd, North Dist, Taichung, 404327, Taiwan, 886 0422052121 ext 12584; <email>017141@tool.caaumed.org.tw</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>5</day><month>6</month><year>2026</year></pub-date><volume>9</volume><elocation-id>e88567</elocation-id><history><date date-type="received"><day>27</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>02</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>04</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Meng-Han Su, Wei-Chun Wang, Yi-Min Hsu, Shih-Yen Hou, Su-Jung Chuang, Shih-Sheng Chang. Originally published in JMIR Nursing (<ext-link ext-link-type="uri" xlink:href="https://nursing.jmir.org">https://nursing.jmir.org</ext-link>), 5.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Nursing, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://nursing.jmir.org/">https://nursing.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://nursing.jmir.org/2026/1/e88567"/><abstract><sec><title>Background</title><p>For clinical nurses, manually entering information into hospital information systems (HISs) remains time-consuming and prone to omissions. 
Although speech recognition can reduce the need for manual entry, its use in clinical settings has historically been limited by code-switching, medical terminology, and noisy ward environments. Recent advances in customized automatic speech recognition (ASR) and large language models (LLMs) now make speech-based, structured documentation aligned with nursing frameworks such as DART (data, action, response, and teaching) increasingly feasible.</p></sec><sec><title>Objective</title><p>This study developed an integrated ASR and LLM system that transforms spoken nursing input into structured DART notes and evaluated its accuracy, usability, and clinical feasibility within HIS workflows.</p></sec><sec sec-type="methods"><title>Methods</title><p>A code-switching nursing speech corpus from emergency and ward settings was used to fine-tune the Whisper large-v2 model with parameter-efficient adaptation. The LLM generated schema-constrained DART records from ASR transcripts, which were verified by nurses before being uploaded to the corresponding HIS fields. Evaluation included mixed error rate for ASR accuracy; <italic>F</italic><sub>1</sub>-scores and agreement statistics for DART classification; hallucination assessments based on factual correctness; and analysis of nurse feedback on system use.</p></sec><sec sec-type="results"><title>Results</title><p>The fine-tuned ASR model reduced the mixed error rate from 44.79% to 6.67%. DART generation achieved a macroaveraged <italic>F</italic><sub>1</sub>-score of 0.82 (95% CI 0.80&#x2010;0.84) and met the noninferiority margin relative to human transcripts (&#x03B4;=&#x2212;0.04). The hallucination rate was 2.51%. During deployment, the monthly volume of valid nursing notes generated through voluntary use of the ASR system increased from 32,724 to 65,417, where each note represented a single documentation entry generated per patient care episode. 
Among 120 participating nurses, 91 (75.8%) reported reduced workload and improved completeness.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The integrated ASR and LLM system was feasible and showed strong performance, with good acceptance among clinical nurses. It reduced the manual documentation burden, improved record completeness, and demonstrated the value of an ASR- and LLM-supported workflow for nursing documentation.</p></sec></abstract><kwd-group><kwd>automatic speech recognition</kwd><kwd>code-switching</kwd><kwd>large language model</kwd><kwd>nursing</kwd><kwd>documentation</kwd><kwd>nursing records</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Taiwan has faced a sustained shortage of clinical nurses, driven largely by low retention among licensed professionals rather than limited training capacity. The national nursing practice rate&#x2014;representing the percentage of licensed nurses actively working in the profession&#x2014;remains at only 59.1% [<xref ref-type="bibr" rid="ref1">1</xref>]. This low retention of qualified staff exacerbates the clinical shortage, and excessive workloads and long shifts are strongly associated with burnout and turnover intention [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. Documentation occupies a substantial portion of clinical workflow time, and existing digital systems have not fully reduced the need for manual input. As hospitals have expanded the use of hospital information systems (HISs), additional documentation requirements have emerged, prompting efforts to standardize formats and improve system integration [<xref ref-type="bibr" rid="ref5">5</xref>]. Structured and interoperable records support continuity of care, and structured handoff protocols improve communication quality [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. 
Moreover, higher perceived nursing information system quality is linked to greater use, satisfaction, and retention, while electronic record use embedded in routine workflows reduces documentation workload and intention to leave [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. These findings highlight the need for documentation tools that are structured, interoperable, and well-integrated into daily clinical practice.</p><p>Many hospitals have introduced structured documentation frameworks to improve consistency and communication. Among these, the Focus Charting method has been particularly influential in Taiwan. Originating in the late 1980s [<xref ref-type="bibr" rid="ref10">10</xref>] and adopted locally in the early 1990s [<xref ref-type="bibr" rid="ref11">11</xref>], it organizes each nursing note around a defined patient focus and follows the DART (data, action, response, and teaching) pattern to support concise and standardized recording of observations, interventions, patient responses, and teaching activities. Implementation studies have reported clearer note organization, improved execution of care plans, and enhanced interdisciplinary collaboration in intensive care settings [<xref ref-type="bibr" rid="ref12">12</xref>]. However, structured formats alone do not resolve the documentation burden. Even with electronic templates, nurses must still enter most information manually, and flowsheets&#x2014;standardized tabular records used for tracking routine, time-sequenced clinical parameters such as vital signs&#x2014;may occupy a substantial portion of each shift [<xref ref-type="bibr" rid="ref13">13</xref>]. 
These limitations highlight the need for intelligent support that preserves the clarity of DART records while reducing repetitive manual effort.</p><p>Automatic speech recognition (ASR) has been explored as a way to reduce manual documentation effort, with multisite evaluations reporting higher efficiency and satisfaction [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. In Taiwan, a longitudinal study involving 21 nurses (using a corpus of 30,112 words) found that mean accuracy improved from 87.06% to 95.07% across 4 evaluation sessions as users developed more stable speaking speeds and volumes [<xref ref-type="bibr" rid="ref16">16</xref>]. Furthermore, an ASR system designed for code-switching&#x2014;the practice of alternating between languages, such as Mandarin and English, within a single utterance&#x2014;achieved a word error rate (a standard metric representing the percentage of transcription errors) of 12.3% in intensive care settings [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>Recent reviews suggest that clinical speech technologies are rapidly evolving toward large language model (LLM)&#x2013;driven ambient clinical intelligence, with mature commercial solutions (such as Heidi, Tortus, and Rebrief.ai [Rebrief Inc]) being successfully deployed in various settings to provide summarization, repurposing, and assistive autonomy [<xref ref-type="bibr" rid="ref18">18</xref>]. However, although these ambient scribes excel in standard clinical encounters, their direct application to Taiwanese nursing workflows presents significant challenges. First, commercial systems are primarily optimized for monolingual or standard bilingual speech and often struggle with the dense, specialized Mandarin-English code-switching and local abbreviations prevalent in Taiwanese hospital wards. 
Second, existing tools typically generate generalized clinical summaries (eg, subjective, objective, assessment, and plan notes), lacking the fine-grained, schema-constrained precision required for specialized nursing frameworks such as DART and its direct interoperability with local HISs. Therefore, a bespoke ASR and LLM pipeline is necessary to address these specific linguistic and structural demands. To bridge this gap, this study developed and evaluated an integrated voice-to-report system that combines a domain-adapted ASR model with an LLM to generate structured DART nursing documentation aligned with HIS fields. The system was assessed using transcription accuracy, DART classification performance, and noninferiority testing, and its feasibility and workflow impact were examined during real-world deployment.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and System Overview</title><p>This development and evaluation study implemented a mobile-based nursing documentation system designed to convert voice input into structured clinical records (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The system was deployed at China Medical University Hospital (CMUH) to evaluate its feasibility, transcription accuracy, and impact on documentation workflow. During routine care, nurses recorded observations and interventions using a mobile device. Audio input was processed by a domain-adapted Whisper ASR model trained on a Mandarin-English code-switching corpus and adapted to the medical terminology common in nursing communication. Transcribed text was subsequently processed by GPT-4o (OpenAI) using a schema-constrained prompt to generate standardized DART notes primarily in Mandarin while preserving English medical terminology. 
Nurses reviewed the generated content and triggered its upload to predefined HIS fields.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of the voice-to-DART (data, action, response, and teaching) documentation system. ASR: automatic speech recognition; HIS: hospital information system; LLM: large language model; LoRA: low-rank adaptation; MER: mixed error rate.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="nursing_v9i1e88567_fig01.png"/></fig><p>The system was developed in-house by the Artificial Intelligence and Robotics Innovation Center at CMUH. The codebase and intellectual property are proprietary. Due to strict patient privacy regulations and direct HIS integration, the software and raw datasets are not publicly available. However, to ensure methodological reproducibility, all model hyperparameters, evaluation frameworks, and complete schema-constrained prompt templates are fully detailed in this paper and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-2"><title>Dataset and Ground-Truth Preparation</title><p>The ASR model was trained and evaluated on nursing speech data from 3 complementary sources. The primary corpus, CMaiSpeech, comprised spontaneous Mandarin-English code-switched recordings from 525 nurses across multiple clinical units, characterized by frequent English medical terms and abbreviations. <xref ref-type="table" rid="table1">Table 1</xref> summarizes the counts and durations of CMaiSpeech by category. Two synthetic corpora (approximately 9 hours) were generated using neural text-to-speech models from 1838 drug names and 1566 clinical terms to enrich underrepresented medical vocabulary. A real-world clinical dataset (totaling approximately 6 hours, 1608 utterances) captured authentic ward conversations, including vital-sign reporting and handovers. 
All recordings were normalized and resampled to 16-kHz pulse code modulation to ensure acoustic consistency. These datasets collectively served as the domain-adapted speech material used for training and evaluating the Whisper-based ASR model in nursing contexts.</p><p>In addition, a validation dataset of 327 nurse-authored documentation samples was constructed. Each sample included an audio recording collected during routine care, a manually transcribed reference serving as the ASR gold standard, and structured DART annotations verified by clinical nurses. This dataset was used to evaluate ASR transcription accuracy, LLM-based DART structuring, and agreement between fine-tuned and zero-shot conditions under a consistent reference standard.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Counts and audio durations by category in the CMaiSpeech dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Item count (N=7055), n (%)</td><td align="left" valign="bottom">Audio length (seconds)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Diastolic BP<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">20 (0.28)</td><td align="left" valign="top">36.57</td></tr><tr><td align="left" valign="top">Glasgow Coma Scale eye response component</td><td align="left" valign="top">20 (0.28)</td><td align="left" valign="top">99.61</td></tr><tr><td align="left" valign="top">Pain index</td><td align="left" valign="top">20 (0.28)</td><td align="left" valign="top">46.05</td></tr><tr><td align="left" valign="top">Pulse</td><td align="left" valign="top">20 (0.28)</td><td align="left" valign="top">34.27</td></tr><tr><td align="left" valign="top">Systolic BP</td><td align="left" valign="top">20 (0.28)</td><td align="left" 
valign="top">39.34</td></tr><tr><td align="left" valign="top">Body temperature</td><td align="left" valign="top">20 (0.28)</td><td align="left" valign="top">39.10</td></tr><tr><td align="left" valign="top">Vital signs<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">1035 (14.67)</td><td align="left" valign="top">12,350.10</td></tr><tr><td align="left" valign="top">Nursing notes<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">5900 (83.65)</td><td align="left" valign="top">75,149.10</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Total duration of all audio samples within each category.</p></fn><fn id="table1fn2"><p><sup>b</sup>BP: blood pressure.</p></fn><fn id="table1fn3"><p><sup>c</sup>Categories representing full-length clinical recordings captured during routine nursing documentation activities.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>ASR Model Adaptation and Evaluation</title><p>Whisper is a multilingual Transformer-based ASR model pretrained on 680,000 hours of diverse speech data [<xref ref-type="bibr" rid="ref19">19</xref>]. Although its large-scale pretraining supports robust zero-shot generalization, accuracy declines in nursing-specific communication, where frequent Mandarin-English code-switching, dense medical terminology, and numerous drug abbreviations remain underrepresented. These discrepancies limit its direct applicability to structured nursing documentation.</p><p>Prior studies have demonstrated that domain-targeted fine-tuning markedly improves Whisper&#x2019;s performance in medical and mixed-language contexts. 
Previous studies reported gains in ASR and named entity recognition on Mandarin speech [<xref ref-type="bibr" rid="ref20">20</xref>], and further improvements were achieved using domain-adapted fine-tuning and prompting on Mandarin-English medical data [<xref ref-type="bibr" rid="ref21">21</xref>]. Building on these findings, Whisper (large-v2; OpenAI) was adapted to the nursing domain through low-rank adaptation&#x2013;based parameter-efficient fine-tuning, which introduced trainable low-rank matrices into frozen weights to reduce computational cost while preserving performance [<xref ref-type="bibr" rid="ref22">22</xref>]. The model was trained on a domain-specific nursing speech corpus and evaluated using mixed error rate (MER) to assess bilingual transcription accuracy relative to the baseline model.</p><p>Prior to MER calculation, a rigorous text normalization pipeline was applied to both reference and hypothesis transcripts to ensure consistent scoring. All full-width alphanumeric characters were converted to half-width equivalents, and punctuation marks were removed. Original casing for English text and medical abbreviations (eg, &#x201C;SpO<sub>2</sub>&#x201D; and &#x201C;BP&#x201D;) was strictly preserved to evaluate the model&#x2019;s ability to output correctly capitalized clinical terms. Crucially, numeric values, including decimals, were evaluated as independent, contiguous tokens rather than as split characters to accurately reflect their semantic weight in clinical parameters such as vital signs.</p></sec><sec id="s2-4"><title>LLM Integration and Evaluation</title><p>The LLM was integrated to convert ASR transcripts into structured nursing documentation following the DART schema. 
GPT-4o (accessed via Azure OpenAI, application programming interface version 2024-12-01-preview), a multimodal variant of the GPT-4 family optimized for instruction following and multilingual text generation, was used for schema-based text structuring [<xref ref-type="bibr" rid="ref23">23</xref>]. The model hyperparameters were consistently set to a temperature of 0.7, a top_p of 0.95, and a maximum of 16,000 tokens. A schema-constrained prompting strategy was applied to ensure consistent field separation and syntactic completeness, informed by prior evidence that structured prompting improves validity in clinical text generation [<xref ref-type="bibr" rid="ref24">24</xref>]. The generation process used a dual-prompt structure: a system prompt defining the explicit task instructions and schema constraints, and a user prompt containing the ASR transcript. Two prompting configurations (a minimal version and a schema-constrained version) were compared during preliminary testing, and the schema-constrained configuration was selected for deployment. The full system prompt template is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Model outputs were postprocessed to verify structural integrity and mapped to predefined HIS fields. LLM performance was evaluated on the 327-case validation dataset using field-level and macroaveraged <italic>F</italic><sub>1</sub>-score values across DART categories. The 95% CIs were estimated using 1000 paired bootstrap resamples at the case level. Noninferiority was tested using a predefined margin of &#x03B4;=0.05, with noninferiority established when the lower bound of the &#x0394;<italic>F</italic><sub>1</sub>-score CI exceeded &#x2212;0.05. 
This 5% margin aligns with recent clinical evaluations of LLM-generated medical documentation, where a &#x003C;5% variance is considered clinically acceptable for draft generation [<xref ref-type="bibr" rid="ref25">25</xref>], and serves as a standard threshold in medical artificial intelligence performance evaluations [<xref ref-type="bibr" rid="ref26">26</xref>]. From a clinical safety perspective, because the system captures data directly at the bedside via mobile devices, it inherently mitigates the high, undocumented risk of human memory decay and omission associated with traditional delayed documentation at the nursing station. Furthermore, because the system strictly requires human-in-the-loop verification prior to HIS submission, this minor structural variance during the drafting stage was deemed an acceptable trade-off for reduced documentation burden, while human oversight securely mitigates the risk of high-severity clinical errors.</p></sec><sec id="s2-5"><title>Agreement Method</title><p>Three input conditions were compared while holding the structuring LLM constant: (1) human transcripts (reference), (2) Whisper large-v2 (zero-shot), and (3) Whisper large-v2 (fine-tuned). The primary end point was the <italic>F</italic><sub>1</sub>-score for DART slot classification against adjudicated references. A predefined noninferiority margin of &#x03B4;=0.05 for &#x0394;<italic>F</italic><sub>1</sub>-score was applied, and 95% CIs were estimated using paired bootstrap resampling. Noninferiority was established when the lower bound of the &#x0394;<italic>F</italic><sub>1</sub>-score CI exceeded &#x2212;0.05.</p><p>Field-level consistency was assessed because the &#x201C;data,&#x201D; &#x201C;action,&#x201D; &#x201C;response,&#x201D; and &#x201C;teaching&#x201D; components of the DART framework represent distinct documentation intents. 
Labels were binarized (present-or-absent) after whitespace trimming, and both percent agreement and Cohen &#x03BA; were calculated to adjust for chance agreement under skewed prevalence distributions [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>It should be noted that the binarized <italic>F</italic><sub>1</sub>-score and agreement metrics were specifically chosen to evaluate the system&#x2019;s capability in initial structural triaging, ensuring that the LLM correctly maps spoken intents to the corresponding DART fields. Because the system operates within a strict human-in-the-loop workflow in which nurses must review and adjust the drafted text prior to HIS submission, achieving high structural accuracy significantly reduces manual documentation burden, even if minor within-field reallocations are occasionally required. To address content accuracy and penalize clinically incorrect information within these fields, the FactualCorrectness metric was used separately during the hallucination assessment.</p></sec><sec id="s2-6"><title>Hallucination Assessment</title><p>Hallucination was defined as any generated content that lacked support from the corresponding reference transcript for the same case. Evaluation followed the FactualCorrectness metric from the Retrieval-Augmented Generation Assessment (RAGAS) framework (precision mode), which decomposes model output into atomic claims and verifies each claim against the reference transcript [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. A fixed Azure OpenAI model (GPT-4o, application programming interface version 2024-12-01-preview; temperature=0.7, top_p=0.95) was used as the evaluator via a LangChain wrapper to ensure consistent claim-level scoring. 
For each case, the 4 DART fields were concatenated into a single response, and both generated and reference texts were normalized prior to evaluation.</p><p>Mean factual-correctness precision and its complement (hallucination rate) were reported, along with the proportion of notes exhibiting hallucination under strict (&#x003C;1.00) and relaxed (&#x003C;0.95) thresholds. All computations were performed in Python (version 3.12; Python Software Foundation) using the RAGAS library [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s2-7"><title>System Use and User Feedback</title><p>To evaluate real-world feasibility and system adoption, use metrics were monitored during the initial deployment period at CMUH. Monthly use was quantified by extracting the total number of system-generated records from the application logs between March 2025 and August 2025.</p><p>Additionally, user feedback was collected to assess system acceptability and its impact on clinical workflow. Nursing staff from diverse clinical units, including wards and intensive care departments, voluntarily provided evaluations after integrating the system into their routine practice. The evaluation mechanism captured both quantitative satisfaction ratings (categorized as favorable or dissatisfied) and qualitative comments regarding user experience. Descriptive statistics were used to summarize the quantitative use and satisfaction data, while qualitative feedback underwent thematic analysis to identify common themes related to documentation efficiency, manual workload, and system accuracy.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study was approved by the Institutional Review Board of CMUH (CMUH110-REC2-181 and CMUH110-REC2-187). Since this was a retrospective study utilizing deidentified data, the requirement for informed consent was waived by the institutional review board. 
Patient privacy and data confidentiality were strictly maintained throughout the study, and no compensation was involved.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Evaluation Dataset Characteristics</title><p>The evaluation dataset comprised 327 annotated nursing documentation samples, as described in the Methods section. Across transcripts, the corpus contained 12,136 Chinese characters, 1361 English words, and 1130 numeric tokens, equivalent to approximately 11.2 English words per 100 Chinese characters (<xref ref-type="table" rid="table2">Table 2</xref>, panel A). A total of 540 DART annotations were identified: 257 (47.6%) for &#x201C;data,&#x201D; 123 (22.8%) for &#x201C;action,&#x201D; 123 (22.8%) for &#x201C;response,&#x201D; and 37 (6.9%) for &#x201C;teaching&#x201D; (<xref ref-type="table" rid="table2">Table 2</xref>, panel B). The lower frequency of &#x201C;teaching&#x201D; entries reflects routine documentation patterns and was accounted for when estimating field-specific CIs. 
This dataset served as the common benchmark for evaluating both ASR and LLM outputs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Token composition and DART (data, action, response, and teaching) annotation distribution in the evaluation dataset.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="2">Panels and categories</td><td align="left" valign="top" colspan="2">Distribution, n (%)</td><td align="left" valign="top">Count per sample<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Panel A: token<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> composition (n=14,627)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chinese characters</td><td align="char" char="." valign="top" colspan="2">12,136 (83.0)</td><td align="char" char="." valign="top">37.11 (24.31)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>English words</td><td align="char" char="." valign="top" colspan="2">1361 (9.3)</td><td align="char" char="." valign="top">4.16 (5.56)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Numeric tokens</td><td align="char" char="." valign="top" colspan="2">1130 (7.7)</td><td align="char" char="." valign="top">3.46 (4.97)</td></tr><tr><td align="left" valign="top" colspan="5">Panel B: DART annotation distribution (n=540)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Data</td><td align="char" char="." valign="top" colspan="2">257 (47.6)</td><td align="char" char="." 
valign="top">0.79 (0.41)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Action</td><td align="char" char="." valign="top" colspan="2">123 (22.8)</td><td align="char" char="." valign="top">0.38 (0.49)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Response</td><td align="char" char="." valign="top" colspan="2">123 (22.8)</td><td align="char" char="." valign="top">0.38 (0.49)</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Teaching</td><td align="char" char="." valign="top" colspan="2">37 (6.9)</td><td align="char" char="." valign="top">0.11 (0.32)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Token counts were computed from manual transcripts after text normalization; Chinese tokens represent individual characters, English tokens represent space-delimited words, and numeric tokens represent digit sequences.</p></fn><fn id="table2fn2"><p><sup>b</sup>Values were calculated by dividing counts by the total number of samples (n=327).</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>ASR Model Performance</title><p>Whisper large-v2 was evaluated using MER, computed at the character level for Chinese and at the word level for English. In the zero-shot condition, MER was 44.79%. 
After low-rank adaptation&#x2013;based fine-tuning on the nursing corpus, MER decreased to 6.67%, corresponding to an 85.11% relative error reduction (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance of Whisper large-v2 on the evaluation dataset.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Models</td><td align="left" valign="bottom">Mixed error rate (%)</td><td align="left" valign="bottom">Relative error reduction (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Zero-shot model<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">44.79</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">Fine-tuned model<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">6.67</td><td align="left" valign="top">85.11</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Zero-shot: Whisper-Large-v2 inference without domain adaptation.</p></fn><fn id="table3fn2"><p><sup>b</sup>Not applicable.</p></fn><fn id="table3fn3"><p><sup>c</sup>Fine-tuned: Whisper-Large-v2 adapted using low-rank adaptation (LoRA) on the nursing speech corpus.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>DART Classification Performance</title><p>DART classification performance was evaluated under 3 input conditions: human-transcribed references, fine-tuned ASR, and zero-shot ASR (<xref ref-type="table" rid="table4">Table 4</xref>). 
A schema-constrained prompting strategy was used for all analyses after preliminary comparison showed higher slot-level completeness and <italic>F</italic><sub>1</sub>-score performance than a minimal prompt.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Per-field DART (data, action, response, and teaching) <italic>F</italic><sub>1</sub>-score performance with 95% CIs under 3 input conditions.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Fields</td><td align="left" valign="bottom">Zero-shot<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>, <italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="bottom">Fine-tuned<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>, <italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="bottom">Human transcript<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup>, <italic>F</italic><sub>1</sub>-score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Data</td><td align="left" valign="top">0.89 (0.85&#x2010;0.92)</td><td align="left" valign="top">0.90 (0.87&#x2010;0.93)</td><td align="left" valign="top">0.91 (0.89&#x2010;0.94)</td></tr><tr><td align="left" valign="top">Action</td><td align="left" valign="top">0.59 (0.53&#x2010;0.66)</td><td align="left" valign="top">0.77 (0.73&#x2010;0.81)</td><td align="left" valign="top">0.83 (0.79&#x2010;0.86)</td></tr><tr><td align="left" valign="top">Response</td><td align="left" valign="top">0.72 (0.67&#x2010;0.77)</td><td align="left" valign="top">0.82 (0.79&#x2010;0.85)</td><td align="left" valign="top">0.85 (0.82&#x2010;0.88)</td></tr><tr><td align="left" valign="top">Teaching</td><td align="left" valign="top">0.62 (0.56&#x2010;0.68)</td><td align="left" valign="top">0.78 (0.73&#x2010;0.82)</td><td align="left" valign="top">0.83 (0.79&#x2010;0.86)</td></tr><tr><td align="left" valign="top">Macroaverage<sup><xref 
ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.70 (0.68&#x2010;0.72)</td><td align="left" valign="top">0.82 (0.80&#x2010;0.84)</td><td align="left" valign="top">0.85 (0.83&#x2010;0.87)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Zero-shot inference without domain adaptation.</p></fn><fn id="table4fn2"><p><sup>b</sup>Fine-tuned using low-rank adaptation on the nursing speech corpus.</p></fn><fn id="table4fn3"><p><sup>c</sup>Manually transcribed references used as the gold standard for automatic speech recognition evaluation.</p></fn><fn id="table4fn4"><p><sup>d</sup>Unweighted mean <italic>F</italic><sub>1</sub>-score across data, action, response, and teaching fields.</p></fn></table-wrap-foot></table-wrap><p>With this configuration, overall <italic>F</italic><sub>1</sub>-score values were 0.85 for human transcripts, 0.82 for fine-tuned ASR, and 0.70 for zero-shot ASR. The difference between fine-tuned ASR and human input (&#x0394;<italic>F</italic><sub>1</sub>-score=&#x2212;0.03) fell within the predefined noninferiority margin (&#x03B4;=0.05). Field-level &#x0394;<italic>F</italic><sub>1</sub>-score values ranged from &#x2212;0.01 (for &#x201C;data&#x201D;) to &#x2212;0.06 (for &#x201C;action&#x201D;). Zero-shot ASR showed uniformly lower performance across all fields.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> presents field-level &#x0394;<italic>F</italic><sub>1</sub>-score estimates with 95% CIs. 
All field means remained within the noninferiority boundary, supporting noninferiority at the macro level.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Field-level &#x0394;<italic>F</italic><sub>1</sub>-score values (automatic speech recognition [ASR]&#x2013;human reference) with 95% CIs and the noninferiority boundary (&#x03B4;=&#x2212;0.05).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="nursing_v9i1e88567_fig02.png"/></fig></sec><sec id="s3-4"><title>Agreement Evaluation</title><p>Concordance was assessed across the 3 input conditions while holding the LLM constant. Agreement between classifications derived from the fine-tuned ASR and human transcripts ranged from 88.38% to 96.94%, with Cohen &#x03BA; ranging from 0.66 to 0.86 across the &#x201C;data,&#x201D; &#x201C;action,&#x201D; &#x201C;response,&#x201D; and &#x201C;teaching&#x201D; fields (<xref ref-type="table" rid="table5">Table 5</xref>), corresponding to substantial to almost perfect agreement [<xref ref-type="bibr" rid="ref28">28</xref>]. For the &#x201C;data&#x201D; field, raw agreement was 95.41% with &#x03BA;=0.66, a pattern consistent with imbalanced class prevalence. 
Overall, slot-level results were consistent with the macrolevel noninferiority findings.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Agreement and Cohen kappa for DART (data, action, response, and teaching) fields comparing large language model outputs generated from fine-tuned automatic speech recognition with outputs from human transcripts.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Fields</td><td align="left" valign="bottom">Agreement<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> (%)</td><td align="left" valign="bottom">Cohen &#x03BA;<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Data</td><td align="left" valign="top">95.41</td><td align="left" valign="top">0.66</td></tr><tr><td align="left" valign="top">Action</td><td align="left" valign="top">88.38</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top">Response</td><td align="left" valign="top">89.91</td><td align="left" valign="top">0.79</td></tr><tr><td align="left" valign="top">Teaching</td><td align="left" valign="top">96.94</td><td align="left" valign="top">0.86</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Percent agreement between large language model outputs generated from fine-tuned automatic speech recognition transcripts and human transcript inputs.</p></fn><fn id="table5fn2"><p><sup>b</sup>Chance-corrected agreement coefficient computed using the standard Cohen kappa formulation.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Hallucination Evaluation</title><p>Hallucination rates were evaluated using the FactualCorrectness metric from the RAGAS framework in precision mode (<xref ref-type="table" rid="table6">Table 6</xref>). 
With human transcripts, the mean hallucination rate was 2.35% (SD 9.93%; 95% CI 1.27&#x2013;3.43), and hallucinations were detected in 28 (8.56%) samples (95% CI 5.99&#x2013;12.10). Fine-tuned ASR yielded a mean hallucination rate of 2.51% (SD 10.86%; 95% CI 1.33&#x2013;3.70), with hallucinations detected in 26 (7.95%) samples (95% CI 5.48&#x2013;11.40). Zero-shot ASR showed higher hallucination rates, with a mean rate of 8.98% (SD 23.99%; 95% CI 6.36&#x2013;11.60) and hallucinations detected in 65 (20%) samples (95% CI 16.01&#x2013;24.69).</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Hallucination rates of large language model outputs across different input sources.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Input sources</td><td align="left" valign="bottom">Hallucination rate<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> (%), mean (SD)</td><td align="left" valign="bottom">Samples with hallucination<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup>, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Human transcript</td><td align="left" valign="top">2.35 (9.93)</td><td align="left" valign="top">28 (8.56)</td></tr><tr><td align="left" valign="top">Fine-tuned ASR<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">2.51 (10.86)</td><td align="left" valign="top">26 (7.95)</td></tr><tr><td align="left" valign="top">Zero-shot ASR</td><td align="left" valign="top">8.98 (23.99)</td><td align="left" valign="top">65 (20.00)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Computed using the FactualCorrectness metric (precision mode), representing the proportion of hallucinated tokens relative to all generated tokens.</p></fn><fn id="table6fn2"><p><sup>b</sup>Percentage of evaluation samples containing at least 1 hallucination (n=327).</p></fn><fn 
id="table6fn3"><p><sup>c</sup>ASR: automatic speech recognition.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>System Use and User Feedback</title><p>Following deployment, the system was increasingly incorporated into routine documentation at CMUH. Monthly use nearly doubled between March 2025 and August 2025, rising from 32,724 to 65,417 valid DART notes generated and uploaded to the HIS through voluntary use of the system (<xref ref-type="fig" rid="figure3">Figure 3</xref>). This steady upward trend indicates expanded use across clinical units during the initial deployment period.</p><p>User feedback was collected from a subset of nursing staff who voluntarily provided evaluations (n=120, representing approximately 12.2% of the 982 nurses actively working in the 44 participating units across wards and intensive care departments). Among these participants, 91 (75.8%) nurses reported a favorable experience, whereas 29 (24.2%) expressed dissatisfaction. Positive feedback emphasized reductions in manual transcription burden and improvements in documentation efficiency, whereas negative feedback primarily concerned occasional recognition inaccuracies and the continued need for verification. Although the feedback sample does not represent all users, it reflects the perspectives of frontline staff actively engaged with the system. Detailed departmental participation is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
Collectively, these findings demonstrate that the system is both technically feasible and operationally acceptable in real-world clinical environments.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Monthly system use at China Medical University Hospital (CMUH) from March 2025 to August 2025.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="nursing_v9i1e88567_fig03.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>This study designed and implemented a speech-based documentation system that integrates a domain-adapted ASR model with an LLM to generate structured nursing records and evaluated its performance in real clinical settings. The fine-tuned Whisper large-v2 model achieved a MER of 6.67%, demonstrating high accuracy for Mandarin-English nursing speech. With schema-guided LLM structuring, the system reached an <italic>F</italic><sub>1</sub>-score of 0.82, which was statistically noninferior to human-transcribed input, and exhibited a low hallucination frequency (2.51%). These findings indicate that domain-adapted ASR combined with LLM-based structuring can produce structurally consistent draft documentation that, alongside routine human verification, reduces cognitive and transcription burden in routine practice.</p><p>Field deployment demonstrated that the system could be incorporated into routine clinical workflows. By enabling point-of-care mobile dictation directly at the bedside, the system effectively replaced the traditional, error-prone workflow of delayed, memory-reliant documentation at the nursing station, directly addressing the critical pain points exacerbated by high nurse-to-patient ratios. System use doubled over the 6-month observation period. Because the hospital operates at a consistently high and stable capacity, this increase in absolute volume was not driven by fluctuations in patient census. 
Rather, as the system was deployed as an optional tool, this steady growth reflects successful voluntary adoption, supported by continuous iterative optimizations based on user feedback. Feedback from nursing staff further supported its usability. Most respondents reported reduced manual effort and improved efficiency, while negative feedback focused on recognition accuracy and the need for verification, which aligns with the human-in-the-loop design. Participation across 6 major ward categories indicates that the system was used in a wide range of clinical settings.</p><p>Several limitations warrant consideration. First, the dataset was modest and sourced from a single institution, which may constrain generalizability. Second, user feedback was voluntary and may overrepresent individuals who were more engaged with the system, and long-term patterns of use were not assessed. Third, our evaluation of structural content accuracy and hallucination relied primarily on an LLM-as-a-judge framework (RAGAS). The primary purpose of this evaluation was to conduct a strict semantic fidelity check, ensuring that the LLM strictly formatted the ASR transcripts into the DART schema without altering, inferring, or omitting clinical facts. Although RAGAS performs robust semantic contradiction checks, traditional non-LLM baselines (eg, exact string matching) were deemed unsuitable because of the heavy code-switching and spoken-to-written paraphrasing inherent in nursing documentation. Although senior nursing staff qualitatively reviewed the generated drafts within our human-in-the-loop workflow, the lack of a formal, quantitative manual fidelity evaluation on a stratified sample of high-risk note types by nursing professionals remains a limitation. 
Future work should include multisite validation; evaluation of interoperability with diverse HIS environments; dedicated quantitative manual safety reviews by senior nurses to further validate that the LLM does not alter critical nursing parameters (eg, medication dosages and invasive device settings); and longitudinal assessment of impacts on documentation quality, workflow efficiency, and staff satisfaction.</p></sec></body><back><notes><sec><title>Funding</title><p>The authors declare that no financial support was received for this work.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ASR</term><def><p>automatic speech recognition</p></def></def-item><def-item><term id="abb2">CMUH</term><def><p>China Medical University Hospital</p></def></def-item><def-item><term id="abb3">DART</term><def><p>data, action, response, and teaching</p></def></def-item><def-item><term id="abb4">HIS</term><def><p>hospital information system</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MER</term><def><p>mixed error rate</p></def></def-item><def-item><term id="abb7">RAGAS</term><def><p>Retrieval-Augmented Generation Assessment</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chiang</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>MH</given-names> </name><etal/></person-group><article-title>Understanding the factors associated with nurse employment in clinics: experiences in Taiwan</article-title><source>J Chin Med 
Assoc</source><year>2024</year><month>07</month><day>1</day><volume>87</volume><issue>7</issue><fpage>670</fpage><lpage>677</lpage><pub-id pub-id-type="doi">10.1097/JCMA.0000000000001118</pub-id><pub-id pub-id-type="medline">38819144</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>SF</given-names> </name><name name-style="western"><surname>Ching</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>RH</given-names> </name></person-group><article-title>Pathways among the nursing practice environment, job burnout, and job satisfaction to intention to leave: a cross-sectional study conducted in Taiwan</article-title><source>Rev Esc Enferm USP</source><year>2024</year><volume>58</volume><fpage>e20240025</fpage><pub-id pub-id-type="doi">10.1590/1980-220X-REEUSP-2024-0025en</pub-id><pub-id pub-id-type="medline">39392469</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>RT</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Hsia</surname><given-names>YF</given-names> </name><name name-style="western"><surname>Kuo</surname><given-names>CC</given-names> </name></person-group><article-title>Long working hours and burnout in health care workers: non-linear dose-response relationship and the effect mediated by sleeping hours-a cross-sectional study</article-title><source>J Occup Health</source><year>2021</year><month>01</month><volume>63</volume><issue>1</issue><fpage>e12228</fpage><pub-id pub-id-type="doi">10.1002/1348-9585.12228</pub-id><pub-id 
pub-id-type="medline">33957007</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>YL</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>WS</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>NY</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Shiao</surname><given-names>JS</given-names> </name></person-group><article-title>Patient-nurse ratio is related to nurses&#x2019; intention to leave their job through mediating factors of burnout and job dissatisfaction</article-title><source>Int J Environ Res Public Health</source><year>2019</year><month>11</month><day>29</day><volume>16</volume><issue>23</issue><fpage>4801</fpage><pub-id pub-id-type="doi">10.3390/ijerph16234801</pub-id><pub-id pub-id-type="medline">31795420</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>FT</given-names> </name></person-group><article-title>Caring for computers: the hidden work of clinical nurses during the introduction of health information systems in a teaching hospital in Taiwan</article-title><source>Nurs Rep</source><year>2021</year><month>02</month><day>13</day><volume>11</volume><issue>1</issue><fpage>105</fpage><lpage>119</lpage><pub-id pub-id-type="doi">10.3390/nursrep11010011</pub-id><pub-id pub-id-type="medline">34968317</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hsu</surname><given-names>CN</given-names> 
</name><name name-style="western"><surname>Huang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>FJ</given-names> </name><etal/></person-group><article-title>Continuity and completeness of electronic health record data for patients treated with oral hypoglycemic agents: findings from healthcare delivery systems in Taiwan</article-title><source>Front Pharmacol</source><year>2022</year><volume>13</volume><fpage>845949</fpage><pub-id pub-id-type="doi">10.3389/fphar.2022.845949</pub-id><pub-id pub-id-type="medline">35444533</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCarthy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Motala</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lawson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shekelle</surname><given-names>PG</given-names> </name></person-group><article-title>Use of structured handoff protocols for within-hospital unit transitions: a systematic review from Making Healthcare Safer IV</article-title><source>BMJ Qual Saf</source><year>2025</year><month>09</month><day>18</day><volume>34</volume><issue>10</issue><fpage>680</fpage><lpage>690</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2024-018385</pub-id><pub-id pub-id-type="medline">40306923</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>TJ</given-names> </name><etal/></person-group><article-title>A comparison of two cross-sectional studies on successful model 
of introducing nursing information system in a regional teaching hospital in Taiwan</article-title><source>Comput Inform Nurs</source><year>2022</year><month>08</month><day>1</day><volume>40</volume><issue>8</issue><fpage>571</fpage><lpage>579</lpage><pub-id pub-id-type="doi">10.1097/CIN.0000000000000818</pub-id><pub-id pub-id-type="medline">34740222</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>GL</given-names> </name><name name-style="western"><surname>Lotus Shyu</surname><given-names>YI</given-names> </name><etal/></person-group><article-title>Deep structure usage of electronic patient records: enhancing the influence of nurses&#x2019; professional commitment to decrease turnover intention: deep structure usage and turnover</article-title><source>J Nurs Manag</source><year>2024</year><volume>2024</volume><fpage>5822368</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://pubmed.ncbi.nlm.nih.gov/40224842/">https://pubmed.ncbi.nlm.nih.gov/40224842/</ext-link></comment><pub-id pub-id-type="doi">10.1155/2024/5822368</pub-id><pub-id pub-id-type="medline">40224842</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lampe</surname><given-names>SS</given-names> </name></person-group><article-title>Focus charting: streamlining documentation</article-title><source>Nurs Manage</source><year>1985</year><month>07</month><volume>16</volume><issue>7</issue><fpage>43</fpage><lpage>46</lpage><pub-id pub-id-type="medline">3847830</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>CL</given-names> </name></person-group><article-title>Focus charting: streamlining documentation</article-title><source>Hu Li Za Zhi</source><year>1993</year><access-date>2026-05-27</access-date><volume>40</volume><issue>3</issue><fpage>87</fpage><lpage>92</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.airitilibrary.com/Article/Detail/0047262x-199309-40-3-87-92-a">https://www.airitilibrary.com/Article/Detail/0047262x-199309-40-3-87-92-a</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>HW</given-names> </name></person-group><article-title>The effectiveness of application the focus charting system in clinical nursing care plan</article-title><source>J Hosp Bimonthly</source><year>2016</year><volume>49</volume><issue>5</issue><fpage>22</fpage><lpage>33</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.airitilibrary.com/Article/Detail/P20130829001-201610-201612090014-201612090014-22-33">https://www.airitilibrary.com/Article/Detail/P20130829001-201610-201612090014-201612090014-22-33</ext-link></comment><pub-id pub-id-type="doi">10.30260/HOSPITALBi.201610_49(5).0004</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacques</surname><given-names>D</given-names> </name><name name-style="western"><surname>Will</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dauterman</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Evaluating nurses&#x2019; perceptions of documentation in the electronic health record: multimethod 
analysis</article-title><source>JMIR Nurs</source><year>2025</year><month>04</month><day>28</day><volume>8</volume><fpage>e69651</fpage><pub-id pub-id-type="doi">10.2196/69651</pub-id><pub-id pub-id-type="medline">40294588</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duggan</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Gervase</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schoenbaum</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Clinician experiences with ambient scribe technology to assist with documentation burden and efficiency</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2460637</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.60637</pub-id><pub-id pub-id-type="medline">39969880</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shour</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Anguzu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Onitilo</surname><given-names>AA</given-names> </name></person-group><article-title>Speech recognition technology and documentation efficiency</article-title><source>JAMA Netw Open</source><year>2025</year><month>03</month><day>3</day><volume>8</volume><issue>3</issue><fpage>e251526</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.1526</pub-id><pub-id pub-id-type="medline">40126483</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lee</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Li</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Chou</surname><given-names>KR</given-names> </name><etal/></person-group><article-title>Machine learning-based speech recognition system for nursing documentation - a pilot study</article-title><source>Int J Med Inform</source><year>2023</year><month>10</month><volume>178</volume><fpage>105213</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105213</pub-id><pub-id pub-id-type="medline">37690224</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hou</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>YL</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>KC</given-names> </name><etal/></person-group><article-title>Code-switching automatic speech recognition for nursing record documentation: system development and evaluation</article-title><source>JMIR Nurs</source><year>2022</year><month>12</month><day>7</day><volume>5</volume><issue>1</issue><fpage>e37562</fpage><pub-id pub-id-type="doi">10.2196/37562</pub-id><pub-id pub-id-type="medline">36476781</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ng</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Evaluating the performance of artificial intelligence-based speech recognition for clinical documentation: a systematic review</article-title><source>BMC Med Inform Decis 
Mak</source><year>2025</year><month>07</month><day>1</day><volume>25</volume><issue>1</issue><fpage>236</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-03061-0</pub-id><pub-id pub-id-type="medline">40598136</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brockman</surname><given-names>G</given-names> </name><name name-style="western"><surname>McLeavey</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Robust speech recognition via large-scale weak supervision</article-title><source>arXiv</source><comment>Preprint posted online on Dec 6, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2212.04356</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>Y</given-names> </name></person-group><article-title>Chinese ASR and NER improvement based on Whisper fine-tuning</article-title><source>25th International Conference on Advanced Communication Technology (ICACT)</source><year>2023</year><publisher-name>IEEE</publisher-name><fpage>213</fpage><lpage>217</lpage><pub-id 
pub-id-type="doi">10.23919/ICACT56868.2023.10079686</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Kuan</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Hsiao</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>HY</given-names> </name></person-group><article-title>Investigating zero-shot generalizability on Mandarin-English code-switched ASR and speech-to-text translation of recent foundation models with self-supervision and weak supervision</article-title><source>2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)</source><year>2024</year><publisher-name>IEEE</publisher-name><fpage>540</fpage><lpage>544</lpage><pub-id pub-id-type="doi">10.1109/ICASSPW62465.2024.10626762</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Shorten</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pierse</surname><given-names>C</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>TB</given-names> </name><etal/></person-group><article-title>StructuredRAG: JSON response formatting with large language models</article-title><source>arXiv</source><comment>Preprint posted online on Aug 7, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.11061</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Comparison of medical history documentation efficiency and quality based on GPT-4o: a study on the comparison between residents and artificial intelligence</article-title><source>Front Med (Lausanne)</source><year>2025</year><volume>12</volume><fpage>1545730</fpage><pub-id pub-id-type="doi">10.3389/fmed.2025.1545730</pub-id><pub-id pub-id-type="medline">40438356</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Decroos</surname><given-names>F</given-names> </name><name name-style="western"><surname>Springenberg</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>A deep learning approach for histopathological diagnosis of onychomycosis: not inferior to analogue diagnosis by histopathologists</article-title><source>Acta Derm Venereol</source><year>2021</year><month>08</month><day>31</day><volume>101</volume><issue>8</issue><fpage>adv00532</fpage><pub-id pub-id-type="doi">10.2340/00015555-3893</pub-id><pub-id pub-id-type="medline">34405243</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Es</surname><given-names>S</given-names> </name><name name-style="western"><surname>James</surname><given-names>J</given-names> </name><name name-style="western"><surname>Espinosa Anke</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schockaert</surname><given-names>S</given-names> </name></person-group><article-title>RAGAs: automated evaluation of retrieval augmented generation</article-title><source>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System 
Demonstrations</source><year>2024</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>150</fpage><lpage>158</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.eacl-demo.16</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Factual correctness</article-title><source>Ragas</source><access-date>2026-05-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness/">https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness/</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompt templates for DART (data, action, response, and teaching) versions 1 and 2 used to guide large language model&#x2013;based structuring of nursing narratives into DART records.</p><media xlink:href="nursing_v9i1e88567_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Distribution of user feedback by ward category.</p><media xlink:href="nursing_v9i1e88567_app2.docx" xlink:title="DOCX File, 13 KB"/></supplementary-material></app-group></back></article>