<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIRx Med</journal-id><journal-id journal-id-type="publisher-id">xmed</journal-id><journal-id journal-id-type="index">34</journal-id><journal-title>JMIRx Med</journal-title><abbrev-journal-title>JMIRx Med</abbrev-journal-title><issn pub-type="epub">2563-6316</issn></journal-meta><article-meta><article-id pub-id-type="publisher-id">45973</article-id><article-id pub-id-type="doi">10.2196/45973</article-id><title-group><article-title>Performance Drift in Machine Learning Models for Cardiac Surgery Risk Prediction: Retrospective Analysis</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Dong</surname><given-names>Tim</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sinha</surname><given-names>Shubhra</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhai</surname><given-names>Ben</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fudulu</surname><given-names>Daniel</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chan</surname><given-names>Jeremy</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Narayan</surname><given-names>Pradeep</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Judge</surname><given-names>Andy</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Caputo</surname><given-names>Massimo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dimagli</surname><given-names>Arnaldo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Benedetto</surname><given-names>Umberto</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Angelini</surname><given-names>Gianni D</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Bristol Heart Institute, Translational Health Sciences, University of Bristol</institution>, <addr-line>Bristol</addr-line>, <country>United Kingdom</country></aff><aff id="aff2"><institution>School of Computing Science, Northumbria University</institution>, <addr-line>Newcastle upon Tyne</addr-line>, <country>United Kingdom</country></aff><aff id="aff3"><institution>Department of Cardiac Surgery, Rabindranath Tagore International Institute of Cardiac Sciences</institution>, <addr-line>West Bengal</addr-line>, <country>India</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Meinert</surname><given-names>Edward</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Anonymous</surname><given-names/></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zeng</surname><given-names>Juntong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tim Dong, MSc<email>qd18830@bristol.ac.uk</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>12</day><month>6</month><year>2024</year></pub-date><volume>5</volume><elocation-id>e45973</elocation-id><history><date date-type="received"><day>08</day><month>06</month><year>2023</year></date><date date-type="rev-recd"><day>27</day><month>02</month><year>2024</year></date><date date-type="accepted"><day>29</day><month>04</month><year>2024</year></date></history><copyright-statement>&#x00A9; Tim Dong, Shubhra Sinha, Ben Zhai, Daniel Fudulu, Jeremy Chan, Pradeep Narayan, Andy Judge, Massimo Caputo, Arnaldo Dimagli, Umberto Benedetto, Gianni D Angelini. Originally published in JMIRx Med (<ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org">https://med.jmirx.org</ext-link>), 12.6.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIRx Med, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://med.jmirx.org/">https://med.jmirx.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://xmed.jmir.org/2024/1/e45973"/><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/preprints.45973" xlink:title="Preprint (JMIR Preprints)" xlink:type="simple">https://preprints.jmir.org/preprint/45973</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.1101/2023.01.21.23284795" xlink:title="Preprint (Medrxiv)" xlink:type="simple">https://www.medrxiv.org/content/10.1101/2023.01.21.23284795v1</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/60428" xlink:title="Peer-Review Report by Anonymous" xlink:type="simple">https://med.jmirx.org/2024/1/e60428</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/60280" xlink:title="Peer-Review Report by Juntong Zeng (Reviewer CL)" xlink:type="simple">https://med.jmirx.org/2024/1/e60280</related-article><related-article related-article-type="companion" ext-link-type="doi" xlink:href="10.2196/60384" xlink:title="Authors' Response to Peer-Review Reports" xlink:type="simple">https://med.jmirx.org/2024/1/e60384</related-article><abstract><sec><title>Background</title><p>The Society of Thoracic Surgeons and European System for Cardiac Operative Risk Evaluation (EuroSCORE) II risk scores are the most commonly used risk prediction models for in-hospital mortality after adult cardiac surgery. However, they are prone to miscalibration over time and poor generalization across data sets; thus, their use remains controversial. 
Despite increased interest, a gap in understanding the effect of data set drift on the performance of machine learning (ML) over time remains a barrier to its wider use in clinical practice. Data set drift occurs when an ML system underperforms because of a mismatch between the data it was developed from and the data on which it is deployed.</p></sec><sec><title>Objective</title><p>In this study, we analyzed the extent of performance drift using models built on a large UK cardiac surgery database. The objectives were to (1) rank and assess the extent of performance drift in cardiac surgery risk ML models over time and (2) investigate any potential influence of data set drift and variable importance drift on performance drift.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a retrospective analysis of prospectively, routinely gathered data on adult patients undergoing cardiac surgery in the United Kingdom between 2012 and 2019. We temporally split the data 70:30 into a training and validation set and a holdout set. Five novel ML mortality prediction models were developed and assessed, along with EuroSCORE II, for relationships between and within variable importance drift, performance drift, and actual data set drift. Performance was assessed using a consensus metric.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 227,087 adults underwent cardiac surgery during the study period, with a mortality rate of 2.76% (n=6258). There was strong evidence of a decrease in overall performance across all models (<italic>P</italic>&#x003C;.0001). Extreme gradient boosting (clinical effectiveness metric [CEM] 0.728, 95% CI 0.728-0.729) and random forest (CEM 0.727, 95% CI 0.727-0.728) were the overall best-performing models, both temporally and nontemporally. EuroSCORE II performed the worst across all comparisons. 
Sharp changes in variable importance and data set drift from October to December 2017, from June to July 2018, and from December 2018 to February 2019 mirrored the effects of performance decrease across models.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>All models show a decrease in at least 3 of the 5 individual metrics. CEM and variable importance drift detection demonstrate the limitation of logistic regression methods used for cardiac surgery risk prediction and the effects of data set drift. Future work will be required to determine the interplay between ML models and whether ensemble models could improve on their respective performance advantages.</p></sec></abstract><kwd-group><kwd>cardiac surgery</kwd><kwd>artificial intelligence</kwd><kwd>risk prediction</kwd><kwd>machine learning</kwd><kwd>operative mortality</kwd><kwd>data set drift</kwd><kwd>performance drift</kwd><kwd>national data set</kwd><kwd>adult</kwd><kwd>data</kwd><kwd>cardiac</kwd><kwd>surgery</kwd><kwd>cardiology</kwd><kwd>heart</kwd><kwd>risk</kwd><kwd>prediction</kwd><kwd>United Kingdom</kwd><kwd>mortality</kwd><kwd>performance</kwd><kwd>model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Recently, the importance of machine learning (ML), a branch of artificial intelligence, has been highlighted as a potential alternative to traditional mortality risk stratification models such as the Society of Thoracic Surgeons (STS) [<xref ref-type="bibr" rid="ref1">1</xref>] and European System for Cardiac Operative Risk Evaluation (EuroSCORE) II risk scores [<xref ref-type="bibr" rid="ref2">2</xref>], which are prone to miscalibration over time and poor generalization across data sets [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. 
These traditional scoring methods are generally based on logistic regression (LR), with risk factors determined through consensus across experts within leading cardiac surgery organizations in the United States (STS) or Europe (EuroSCORE II). In particular, EuroSCORE II, which is based on LR using 18 items of information about the patient, has been shown by numerous studies to display poor discrimination and calibration across data sets with differing characteristics, including but not limited to age [<xref ref-type="bibr" rid="ref4">4</xref>], ethnicity [<xref ref-type="bibr" rid="ref5">5</xref>], and procedure groups [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Risk scoring models&#x2019; performance is challenged by numerous factors, such as differences in variable definitions, the management of incomplete data fields, surgical procedure selection criteria, and temporal changes in the prevalence of patients&#x2019; risk factors [<xref ref-type="bibr" rid="ref11">11</xref>]. ML approaches are increasingly used for prediction in health care research as they have the potential to overcome the limitations of linear models. By including pairwise and higher-order interactions and modeling nonlinear effects, ML may overcome heterogeneity in procedures and missing data [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Although ML has been shown to be beneficial over conventional scoring systems, the magnitude and clinical influence of such improvements remain uncertain [<xref ref-type="bibr" rid="ref2">2</xref>]. The ability to counter &#x201C;performance drift&#x201D; due to temporal changes in the prevalence of risk factors has also yet to be fully elucidated.</p><p>In ML, performance drift refers to the gradual loss in model performance caused by changes that call into question the model&#x2019;s training assumptions. 
Key causes of performance drift include data set drift, which refers to changes in the distribution of data between training and evaluation sets; variable importance drift, which involves changes in the significance of model variables; and calibration drift, which is characterized by decreased reliability in estimated probabilities. These factors can interact, as seen in a study of noncardiac surgery [<xref ref-type="bibr" rid="ref13">13</xref>]. Understanding the complex relationship between variable importance drift, performance drift, and data set drift is important. This relationship explains how changes in the importance of specific variables, combined with changes in the actual data distribution, collectively influence the model&#x2019;s overall accuracy and reliability as it performs over time. The wider implications are also significant, influencing decision-making, insight accuracy, generalization [<xref ref-type="bibr" rid="ref14">14</xref>], ethical considerations, and regulatory compliance across industries.</p><p>The aim of this study was to investigate performance drift in existing ML models that have been used in prior cardiac surgery risk prediction research. The objectives were to (1) rank and assess the extent of performance drift in such cardiac surgery risk ML models over time and (2) investigate any potential influence of data set drift and variable importance drift on performance drift. 
Therefore, we trained and evaluated 5 supervised ML models in addition to EuroSCORE II to (1) determine the best ML model in terms of overall accuracy, discrimination, calibration, and clinical effectiveness; (2) use variable importance drift as a measure for detecting data set drift; and (3) verify suspected data set drift informed through variable importance drift by assessing actual data set drift [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s1-2"><title>Related Work</title><p>In our previous study, we found that combining the metrics covering all 4 aspects of discrimination, calibration, clinical usefulness, and overall accuracy into a single clinical effectiveness metric (CEM) improved the efficiency of cognitive decision-making (according to the Miller law [<xref ref-type="bibr" rid="ref16">16</xref>]) for selecting the optimal ensemble models (ie, using several models to derive a consensus prediction) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. This approach is useful for providing a consensus metric that enables models to be ranked in scenarios where, for example, 1 model could outperform another using 1 metric but underperform under a different metric. Furthermore, we demonstrated that such a consensus metric could be combined with drill-down analysis to further interpret the models using individual metrics [<xref ref-type="bibr" rid="ref14">14</xref>]. Although area under the curve (AUC) evaluates the diagnostic or predictive performance of the model, it does not directly reflect patient benefit. 
This is why we included a suite of other metrics, including the decision curve analysis (DCA) net benefit index, that were found to be clinically pertinent from our prior study [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>In our previous work [<xref ref-type="bibr" rid="ref19">19</xref>], we studied the calibration changes across 2 different time intervals using the calibration belt (overall external calibration) and calibration drift (Hosmer-Lemeshow goodness-of-fit <italic>&#x03C7;</italic><sup>2</sup> statistics) approaches within a single UK hospital. A recent study extended our work to a Chinese national registry, Sino (Chinese) System for Coronary Artery Bypass Grafting (CABG) Operative Risk Evaluation II (SinoSCORE II), using a set of ML models such as LightGBM; CatBoost; and a combination of variable selection approaches including Optuna for stepwise regression, BorutaSHAP, and feature importance ranking [<xref ref-type="bibr" rid="ref20">20</xref>]. Another study in the United States also investigated the calibration performance difference between extreme gradient boosting (XGBoost) and LR models built for a cohort of patients who underwent CABG, using preoperative, intraoperative, and combined variable sets from the STS Adult Cardiac Surgery Database [<xref ref-type="bibr" rid="ref21">21</xref>].</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Set and Patient Population</title><p>The study was performed using the National Adult Cardiac Surgery Audit (NACSA) data set, which comprises data prospectively collected by the National Institute for Cardiovascular Outcomes Research on all cardiac procedures performed in all National Health Service hospitals and some private hospitals across the United Kingdom [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>A total of 227,087 adult patients who underwent cardiac surgery between January 1, 2012, and March 31, 2019, were included. 
Congenital, transplant, and mechanical support device insertion cases were excluded. The CONSORT (Consolidated Standards of Reporting Trials) patient flow diagram is shown in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. Missing and erroneously inputted data in the data set were cleaned according to the NACSA registry data preprocessing recommendations [<xref ref-type="bibr" rid="ref26">26</xref>]. Generally, for any variable data that were missing, it was assumed that the variable was at baseline level, that is, no risk factor was present. Missing patient age at the time of surgery was imputed as the median patient age for the corresponding year. Data standardization was performed by subtracting the variable mean and dividing by the SD values [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>The data set was split into 2 cohorts: training and validation set (n=157,196, 69.2%; 2012-2016; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and holdout set (n=69,891, 30.8%; 2017-2019; Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The primary outcome of this study was in-hospital mortality.</p></sec><sec id="s2-2"><title>Baseline Statistical Analysis</title><p>Continuous variables were compared using nonparametric Wilcoxon rank sum tests, whereas categorical variables were compared using Pearson <italic>&#x03C7;</italic><sup>2</sup> tests or Fisher exact tests as appropriate.</p><p>The <italic>Scikit-learn</italic> (version 0.23.1) and <italic>Keras</italic> (version 2.4.0) Python libraries (Python Software Foundation) were used to develop the models and to evaluate their discrimination, calibration, and clinical effectiveness capabilities. 
Statistical analyses were conducted using Stata/MP (version 17; StataCorp) and R (version 4.0.2; R Foundation for Statistical Computing). ANOVA assumptions were checked using the <italic>rstatix</italic> R package.</p></sec><sec id="s2-3"><title>Model Development</title><p>In our study, we trained 5 supervised ML risk models based on the EuroSCORE II preoperative variable set (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Those 5 models included LR, neural network (NN) [<xref ref-type="bibr" rid="ref22">22</xref>], random forest (RF) [<xref ref-type="bibr" rid="ref27">27</xref>], weighted support vector machine (SVM) [<xref ref-type="bibr" rid="ref28">28</xref>], and extreme gradient boosting (XGBoost) [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. The EuroSCORE II score was calculated for baseline comparison. Internal validation was performed using 5-fold cross-validation on the training and validation set (2012-2016) to select model parameters. Final models were determined by retraining the models on the combined training and validation set using the selected model parameters. Temporal external validation was performed using the final models on the holdout set (2017-2019) [<xref ref-type="bibr" rid="ref15">15</xref>]. Each model calculated the probability of surgical mortality for each patient. Overall, 1000 bootstrap samples were taken for all metrics. 
Further details on model development can be found in the <italic>Model Specification</italic> section in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Assessment of Model Performance</title><p>The models&#x2019; performance was measured across four broad parameters:</p><list list-type="order"><list-item><p>Discrimination: AUC and <italic>F</italic><sub>1</sub>-score</p></list-item><list-item><p>Clinical utility: DCA net benefit index</p></list-item><list-item><p>Calibration: 1 &#x2013; expected calibration error (ECE)</p></list-item><list-item><p>Combination of calibration and discrimination: adjusted Brier score</p></list-item></list><p>The AUC performances of all variant models were evaluated, and the receiver operating characteristic (ROC) curves were plotted [<xref ref-type="bibr" rid="ref30">30</xref>]. As a sensitivity analysis, we calculated the <italic>F</italic><sub>1</sub>-score, which combines precision and recall without explicitly considering the true negative rate in the performance evaluation [<xref ref-type="bibr" rid="ref31">31</xref>]. This metric adjusts for the biased effect due to the high proportion of alive outcome samples. The DCA net benefit index was used to test clinical benefit [<xref ref-type="bibr" rid="ref32">32</xref>]. 1 &#x2013; ECE was used to determine calibration performance, with higher values being better [<xref ref-type="bibr" rid="ref33">33</xref>]. 
A special case of the Brier score (1 &#x2013; Brier score) without the normalization term was used (adjusted Brier score) [<xref ref-type="bibr" rid="ref34">34</xref>], with higher values indicating better discrimination and calibration performance.</p><p>To determine the best model in terms of both discrimination and calibration, we took the geometric average of AUC, <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref31">31</xref>], DCA net benefit index (treated + untreated), 1 &#x2013; ECE, and 1 &#x2013; Brier score. The consensus metric using the combined geometric average of the 5 metrics is named CEM for ease of reference. The consensus approach for combining different metrics has previously been applied in a study on COVID-19 prediction [<xref ref-type="bibr" rid="ref35">35</xref>]. In addition, this approach is similar to the simple additive weighting multicriteria evaluation approach for making a decision through the ranking of a set of competing criteria [<xref ref-type="bibr" rid="ref36">36</xref>]. Geometric average has previously been found to be effective for summarizing metrics for temporal-based model calibration and is robust for bootstrap-sampled Gaussian distributions [<xref ref-type="bibr" rid="ref37">37</xref>]. This metric is robust to outliers [<xref ref-type="bibr" rid="ref38">38</xref>] and is preferable for aggregation compared to the weighted arithmetic mean [<xref ref-type="bibr" rid="ref39">39</xref>]. As an exception, the arithmetic average was used for the DCA net benefit index over all thresholds as a measure of overall net benefit, before geometric averaging, since the values can be negative. An overview of the model and evaluation design is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Design overview of the study. Nontemporal performance and drift (temporal) analyses were performed. 
Drifts in discrimination, calibration, clinical utility, data set, and variable importance were assessed. Time point assessments were performed for the clinical effectiveness metric (CEM). Drifts in component metrics of CEM were evaluated. AUC: area under the curve; ECE: expected calibration error; EuroSCORE: European System for Cardiac Operative Risk Evaluation; F1: <italic>F</italic><sub>1</sub>-score; neuronetwork: neural network; SVM: support vector machine; Xgboost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v5i1e45973_fig01.png"/></fig></sec><sec id="s2-5"><title>Baseline Nontemporal Performance</title><p>Nontemporal comparison of models was conducted as a baseline, using all data across the holdout period. Differences across models were tested using repeated-measures 1-way ANOVA and Bonferroni-corrected, multiple pairwise, paired <italic>t</italic> tests (1-tailed); this was followed by Dunnett correction for multiple comparisons, with the overall best-performing model as the control. ANOVA assumptions for outliers were checked. Normality assumptions were checked using the Shapiro-Wilk test [<xref ref-type="bibr" rid="ref40">40</xref>]. The Delong test was applied to determine whether there was a statistically significant difference across the AUCs of the ROC curves for the top 2 best-performing models. A comparison of individual metrics was conducted.</p></sec><sec id="s2-6"><title>Drift Analysis</title><sec id="s2-6-1"><title>Overview</title><p>The statistical methods used for analyzing drift are shown in <xref ref-type="table" rid="table1">Table 1</xref>. 
More detailed explanations are provided below.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of statistical methods used for assessing drift.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Objective and statistical tests</td><td align="left" valign="bottom">General statistical situations</td><td align="left" valign="bottom">Rationale for choosing test</td><td align="left" valign="bottom">Assumptions checked</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5"><bold>Nontemporal comparison of models</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Repeated-measures 1-way ANOVA</td><td align="left" valign="top">Comparison of multiple groups for differences</td><td align="left" valign="top">Used for comparing means across multiple models</td><td align="left" valign="top">Outliers (ANOVA assumptions) and normality (Shapiro-Wilk test)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Paired <italic>t</italic> tests (Bonferroni corrected)</td><td align="left" valign="top">Comparison of paired observations between models</td><td align="left" valign="top">To compare specific model pairs simultaneously</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Dunnett correction</td><td align="left" valign="top">Control for multiple comparisons</td><td align="left" valign="top">Controls type I error rate in comparing multiple treatments to a control group in 1-way ANOVA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Delong test</td><td align="left" valign="top">Comparison of the AUCs<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> of 2 correlated ROC<sup><xref ref-type="table-fn" 
rid="table1fn3">c</xref></sup> curves</td><td align="left" valign="top">To compare the AUCs of 2 models or tests during sensitivity testing</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><bold>Analysis within specific time frames</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Kruskal-Wallis Test</td><td align="left" valign="top">Comparison of multiple groups for differences (nonparametric)</td><td align="left" valign="top">Nonparametric alternative for ANOVA in specific time frames</td><td align="left" valign="top">Outliers (ANOVA assumptions) and normality (Shapiro-Wilk test)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bonferroni-corrected, paired-samples Wilcoxon test (Wilcoxon signed rank test)</td><td align="left" valign="top">Comparison of paired observations within time frames</td><td align="left" valign="top">Nonparametric comparison of paired samples within time frames, with control for type I error rate in comparing multiple treatments</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Dunn test</td><td align="left" valign="top">Multiple pairwise comparisons within nonparametric groups</td><td align="left" valign="top">Post hoc test for pairwise comparisons after Kruskal-Wallis test; determines the magnitude of difference effects within time frames</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top" colspan="5"><bold>Analysis between the first 3 months of 2017 and 2019</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Kruskal-Wallis test</td><td align="left" valign="top">Comparison of multiple groups for differences (nonparametric)</td><td align="left" valign="top">Nonparametric comparison between time frames</td><td align="left" valign="top">Outliers (ANOVA assumptions) and normality (Kolmogorov-Smirnov 
Test)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Paired-samples Wilcoxon test (Wilcoxon signed rank test)</td><td align="left" valign="top">Comparison of paired observations between time frames</td><td align="left" valign="top">Nonparametric comparison of paired samples between time frames</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Bonferroni-adjusted Dunn test</td><td align="left" valign="top">Multiple pairwise comparisons between time frames</td><td align="left" valign="top">Post hoc test for pairwise comparisons after significant Kruskal-Wallis results; determines the magnitude of difference effects between time frames, with control for type I error rate in comparing multiple treatments</td><td align="left" valign="top">Normality (Kolmogorov-Smirnov Test)</td></tr><tr><td align="left" valign="top" colspan="5"><bold>Analysis of discrimination, calibration, clinical utility, and overall accuracy drift</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Linear regression (with residual analysis)</td><td align="left" valign="top">Assessing relationships and regression parameters</td><td align="left" valign="top">To analyze linear relationships and model residuals</td><td align="left" valign="top">Normality through histograms and QQ plots</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Seasonal Kendall test (nonparametric alternative if assumptions not met)</td><td align="left" valign="top">Assessing association or trends when assumptions are not met</td><td align="left" valign="top">Nonparametric test for assessing associations without assumptions</td><td align="left" valign="top">Homoscedasticity through scale-location plots</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not applicable.</p></fn><fn id="table1fn2"><p><sup>b</sup>AUC: area under the curve.</p></fn><fn 
id="table1fn3"><p><sup>c</sup>ROC: receiver operating characteristic.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-6-2"><title>CEM Regression Trends</title><p>The geometric mean (and 95% CI) of the CEM values from 1000 bootstraps was calculated for each model against time (the month of the year), and the results were plotted to compare trends across models. The models were compared by fitting multiple linear regression lines across time for CEM.</p><p>To check for normality assumptions, we plotted the histogram and a QQ plot of residuals before applying linear regressions [<xref ref-type="bibr" rid="ref41">41</xref>]. We also checked for homogeneity of residual variance (homoscedasticity) by plotting a scale-location plot, that is, the square root of standardized residual points against the values of the fitted outcome variable [<xref ref-type="bibr" rid="ref42">42</xref>]. For model metrics that do not satisfy these assumptions, the seasonal Kendall test (nonparametric) was used instead.</p></sec><sec id="s2-6-3"><title>Analysis Within the First 3 Months of 2017 and 2019</title><p>Differences in CEM values across models at 2 time points were independently tested using the Kruskal-Wallis test and Bonferroni-corrected, paired-samples Wilcoxon test (Wilcoxon signed rank test). The 2 time points were the first 3 months of 2017 and 2019. This was followed by the Dunn test for nonparametric multiple comparisons of the models at each of the 2 time points, with the overall best-performing model as a baseline. ANOVA assumptions for outliers were checked. Normality assumptions were checked using the Shapiro-Wilk test [<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s2-6-4"><title>Analysis Between the First 3 Months of 2017 and 2019</title><p>Differences in CEM values across the first 3 months of 2017 and 2019 were tested using the Kruskal-Wallis test and paired-samples Wilcoxon test (Wilcoxon signed rank test). 
The Bonferroni-adjusted Dunn test was used to determine the magnitude and evidence of change across the 2 time points for each model. ANOVA assumptions for outliers were checked. Normality assumptions were checked using the Kolmogorov-Smirnov test.</p></sec><sec id="s2-6-5"><title>Analysis of Discrimination, Calibration, Clinical Utility, and Overall Accuracy Drift</title><p>As a sensitivity analysis, we analyzed performance drift in terms of component metrics within CEM. Discrimination (AUC), positive outcome discrimination (<italic>F</italic><sub>1</sub>-score), calibration (1 &#x2013; ECE), clinical utility (net benefit), and overall accuracy of prediction probability (adjusted Brier score) were assessed by fitting multiple (model) linear regression lines across time for each metric.</p><p>To check for normality assumptions, the same methods as those used for CEM regression trends were used.</p></sec></sec><sec id="s2-7"><title>Analysis of Variable Importance Drift</title><p>Variable importance drift was assessed for the best-performing model. For each month of the holdout set, 5-fold nested cross-validation was performed to derive the importance of each EuroSCORE II variable in the model&#x2019;s decision-making. The geometric mean of 5-fold importance at each time point was plotted along with the importance of each of the 5 folds. The Shapley additive explanations (SHAP) mean absolute magnitude of importance was used [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Locally estimated scatterplot smoothing was used to simplify the visual representation. Line plots of the top 6 most important variables were used as a sensitivity analysis.</p></sec><sec id="s2-8"><title>Data Set Drift</title><p>Data set drift across time was visualized using a stacked bar plot for the top 3 variables as identified by SHAP variable importance. 
Continuous variables were binned into intervals to enable ease of analysis.</p></sec><sec id="s2-9"><title>Net Benefit Projection</title><p>To further understand the clinical significance of the performance drift over time, the fitted linear regression model intercepts and slopes were used to extrapolate the net benefit up to January 2030 for the XGBoost and NN models.</p></sec><sec id="s2-10"><title>Ethical Considerations</title><p>The study was part of a research project approved by the Health Research Authority and Health and Care Research Wales on July 23, 2019 (Integrated Research Application System project ID: 257758). As the study included retrospective interrogation of the National Institute for Cardiovascular Outcomes Research database, the need for individual patient consent was waived in accordance with the research guidance. The study was performed in accordance with the ethical standards as laid down in the 1964 Declaration of Helsinki and its later amendments.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Baseline Patient Characteristics</title><p>A total of 227,087 procedures of adults from 42 hospitals were included in this analysis. This followed the removal of 3930 congenital cases, 1586 transplant and mechanical support device insertion cases, and 3395 procedures with missing information on mortality (<xref ref-type="table" rid="table2">Table 2</xref>). There were 6258 deaths during the study period (mortality rate of 2.76%).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Patient demographics and summary of cleaned EuroSCORE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> II variables. Variables are from the time period from 2012 to 2019. 
Records with missing mortality status were excluded.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Variable</td><td align="left" valign="bottom" colspan="2">Mortality status</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">No (n=220,829)</td><td align="left" valign="bottom">Yes (n=6258)</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Age (years), mean (SD)</td><td align="left" valign="top">67.53 (11.23)</td><td align="left" valign="top">70.77 (11.42)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="4"><bold>NYHA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> classification, n (%)</bold></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0 (I)</td><td align="left" valign="top">48,625 (22)</td><td align="left" valign="top">1055 (17)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 (II)</td><td align="left" valign="top">96,888 (44)</td><td align="left" valign="top">1609 (26)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (III)</td><td align="left" valign="top">64,049 (29)</td><td align="left" valign="top">2228 (36)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3 (IV)</td><td align="left" valign="top">11,267 (5.1)</td><td align="left" valign="top">1366 (22)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Renal impairment, n (%)</bold></td><td align="left" 
valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0 (normal)</td><td align="left" valign="top">103,196 (47)</td><td align="left" valign="top">1704 (27)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 (moderate)</td><td align="left" valign="top">92,411 (42)</td><td align="left" valign="top">2451 (39)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (on dialysis)</td><td align="left" valign="top">2187 (1)</td><td align="left" valign="top">330 (5.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3 (severe)</td><td align="left" valign="top">23,035 (10)</td><td align="left" valign="top">1773 (28)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="2">Chronic lung disease, n (%)</td><td align="left" valign="top">26,644 (12)</td><td align="left" valign="top">1211 (19)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Poor mobility, n (%)</td><td align="left" valign="top">8305 (3.8)</td><td align="left" valign="top">514 (8.2)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Previous cardiac surgery, n (%)</td><td align="left" valign="top">12,012 (5.4)</td><td align="left" valign="top">1141 (18)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Left ventricle function, n (%)</bold></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0 (good; &#x003E;50%)</td><td align="left" valign="top">184,721 (84)</td><td align="left" valign="top">4706 (75)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" 
valign="top">1 (moderate; 31%-50%)</td><td align="left" valign="top">30,608 (14)</td><td align="left" valign="top">1089 (17)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (poor; 21%-30%)</td><td align="left" valign="top">4241 (1.9)</td><td align="left" valign="top">318 (5.1)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3 (very poor; &#x2264;20%)</td><td align="left" valign="top">1259 (0.6)</td><td align="left" valign="top">145 (2.3)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Pulmonary hypertension, n (%)</bold></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0 (PA<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> systolic &#x003C;31 mm Hg)</td><td align="left" valign="top">201,643 (91)</td><td align="left" valign="top">5000 (80)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 (PA systolic 31-55 mm Hg)</td><td align="left" valign="top">13,126 (5.9)</td><td align="left" valign="top">705 (11)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (PA systolic &#x003E;55 mm Hg)</td><td align="left" valign="top">6060 (2.7)</td><td align="left" valign="top">553 (8.8)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="2">CCS<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> class 4 angina, n (%)</td><td align="left" valign="top">18,370 (8.3)</td><td align="left" valign="top">956 (15)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Urgency, n (%)</bold></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" 
valign="top"/><td align="left" valign="top">0 (elective)</td><td align="left" valign="top">141,617 (64)</td><td align="left" valign="top">2442 (39)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 (urgent)</td><td align="left" valign="top">72,090 (33)</td><td align="left" valign="top">2134 (34)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (emergency)</td><td align="left" valign="top">6533 (3)</td><td align="left" valign="top">1230 (20)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3 (salvage)</td><td align="left" valign="top">589 (0.3)</td><td align="left" valign="top">452 (7.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="4"><bold>Weight of the intervention, n (%)</bold></td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">0 (isolated CABG<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup>)</td><td align="left" valign="top">111,243 (50)</td><td align="left" valign="top">1546 (25)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">1 (single non-CABG)</td><td align="left" valign="top">62,568 (28)</td><td align="left" valign="top">2153 (34)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">2 (two procedures)</td><td align="left" valign="top">42,649 (19)</td><td align="left" valign="top">2108 (34)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">3 (three procedures)</td><td align="left" valign="top">4369 (2)</td><td align="left" valign="top">451 (7.2)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" 
valign="top" colspan="2">Diabetes on insulin, n (%)</td><td align="left" valign="top">12,818 (5.8)</td><td align="left" valign="top">453 (7.2)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Female gender, n (%)</td><td align="left" valign="top">59,467 (27)</td><td align="left" valign="top">2328 (37)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Recent myocardial infarction, n (%)</td><td align="left" valign="top">43,316 (20)</td><td align="left" valign="top">1594 (25)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Critical preoperative state, n (%)</td><td align="left" valign="top">7255 (3.3)</td><td align="left" valign="top">1382 (22)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Extracardiac arteriopathy, n (%)</td><td align="left" valign="top">22,327 (10)</td><td align="left" valign="top">1215 (19)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Active endocarditis, n (%)</td><td align="left" valign="top">5816 (2.6)</td><td align="left" valign="top">493 (7.9)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">Surgery on thoracic aorta, n (%)</td><td align="left" valign="top">9070 (4.1)</td><td align="left" valign="top">896 (14)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="2">EuroSCORE II, mean (SD)</td><td align="left" valign="top">0.03 (0.04)</td><td align="left" valign="top">0.12 (0.14)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>EuroSCORE: European System for Cardiac Operative Risk Evaluation.</p></fn><fn id="table2fn2"><p><sup>b</sup>Wilcoxon rank sum test or Pearson <italic>&#x03C7;</italic><sup>2</sup> 
test.</p></fn><fn id="table2fn3"><p><sup>c</sup>NYHA: New York Heart Association.</p></fn><fn id="table2fn4"><p><sup>d</sup>PA: pulmonary artery.</p></fn><fn id="table2fn5"><p><sup>e</sup>CCS: Canadian Cardiovascular Society.</p></fn><fn id="table2fn6"><p><sup>f</sup>CABG: coronary artery bypass grafting.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Baseline Nontemporal Performance</title><p>No extreme outliers were found when testing for ANOVA assumptions. The CEM values from 1000 bootstraps were normally distributed for LR, NN, and RF but not XGBoost, as assessed by the Shapiro-Wilk test (<italic>P</italic>&#x003E;.05). A histogram plot of the XGBoost CEM values did not show substantial deviation from the normal distribution. There was strong evidence of a difference across all models (<italic>P</italic>&#x003C;.0001; Table S4 and Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). <xref ref-type="table" rid="table3">Table 3</xref> shows that XGBoost (CEM 0.728, 95% CI 0.728-0.729) and RF (CEM 0.727, 95% CI 0.727-0.728) were the overall best-performing models, with moderate to strong evidence (nonoverlapping CIs) of the former outperforming the latter. This was followed by LR, NN, SVM, and EuroSCORE II. The Dunnett test showed that there was moderate to strong evidence that XGBoost was superior to all other models (<italic>P</italic>&#x003C;.001; <xref ref-type="table" rid="table4">Table 4</xref>). The performance of XGBoost was the least different from RF but the most different from EuroSCORE II (CEM difference to XGBoost: 0.0009 vs 0.1876).</p><p>The sensitivity analysis of CEM component metrics showed that the adjusted Brier score was unable to distinguish between XGBoost, RF, NN, and LR (<xref ref-type="table" rid="table3">Table 3</xref>; all 0.976). 
AUC performance was the best for XGBoost (0.834) and RF (0.835), with the DeLong test showing no statistically significant difference (<italic>P</italic>&#x003E;.05). <italic>F</italic><sub>1</sub>-score showed that XGBoost performed the best, followed by RF (0.279 vs 0.277). LR and NN (adjusted ECE: both 0.997) showed better calibration performance than RF and XGBoost (adjusted ECE: both 0.996). Net benefit was the best for XGBoost and RF (both 0.904).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Geometric mean of individual metrics for each model in the holdout set. In all, 1000 bootstrap samples were used to derive the geometric mean of each metric. Adjusted ECE<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> and Brier score values are shown. Net benefit is the average absolute overall benefit across all thresholds.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model category</td><td align="left" valign="bottom">1 &#x2013; ECE</td><td align="left" valign="bottom">AUC<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">1 &#x2013; Brier score</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Net benefit</td><td align="left" valign="bottom" colspan="5">CEM<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Mean (SD)</td><td align="left" valign="bottom" colspan="2">95% CI</td><td align="left" valign="bottom">Value, n</td></tr></thead><tbody><tr><td align="left" valign="top">EuroSCORE<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> II</td><td align="left" valign="top">0.641</td><td 
align="left" valign="top">0.800</td><td align="left" valign="top">0.814</td><td align="left" valign="top">0.240</td><td align="left" valign="top">0.461</td><td align="left" valign="top" colspan="2">0.541 (0.004)</td><td align="left" valign="top" colspan="2">0.540-0.541</td><td align="left" valign="top">1000</td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">0.997</td><td align="left" valign="top">0.819</td><td align="left" valign="top">0.976</td><td align="left" valign="top">0.264</td><td align="left" valign="top">0.902</td><td align="left" valign="top" colspan="2">0.717 (0.005)</td><td align="left" valign="top" colspan="2">0.717-0.717</td><td align="left" valign="top">1000</td></tr><tr><td align="left" valign="top">NN<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">0.997</td><td align="left" valign="top">0.813</td><td align="left" valign="top">0.976</td><td align="left" valign="top">0.259</td><td align="left" valign="top">0.901</td><td align="left" valign="top" colspan="2">0.713 (0.006)</td><td align="left" valign="top" colspan="2">0.713-0.714</td><td align="left" valign="top">1000</td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">0.996</td><td align="left" valign="top">0.835</td><td align="left" valign="top">0.976</td><td align="left" valign="top">0.277</td><td align="left" valign="top">0.904</td><td align="left" valign="top" colspan="2">0.727 (0.005)</td><td align="left" valign="top" colspan="2">0.727-0.728</td><td align="left" valign="top">1000</td></tr><tr><td align="left" valign="top">Weighted SVM<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="top">0.775</td><td align="left" valign="top">0.819</td><td align="left" valign="top">0.916</td><td align="left" valign="top">0.257</td><td align="left" 
valign="top">0.685</td><td align="left" valign="top" colspan="2">0.634 (0.005)</td><td align="left" valign="top" colspan="2">0.634-0.634</td><td align="left" valign="top">1000</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup></td><td align="left" valign="top">0.996</td><td align="left" valign="top">0.834</td><td align="left" valign="top">0.976</td><td align="left" valign="top">0.279</td><td align="left" valign="top">0.904</td><td align="left" valign="top" colspan="2">0.728 (0.005)</td><td align="left" valign="top" colspan="2">0.728-0.729</td><td align="left" valign="top">1000</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ECE: expected calibration error.</p></fn><fn id="table3fn2"><p><sup>b</sup>AUC: area under the curve.</p></fn><fn id="table3fn3"><p><sup>c</sup>CEM: clinical effectiveness metric.</p></fn><fn id="table3fn4"><p><sup>d</sup>EuroSCORE: European System for Cardiac Operative Risk Evaluation.</p></fn><fn id="table3fn5"><p><sup>e</sup>LR: logistic regression.</p></fn><fn id="table3fn6"><p><sup>f</sup>NN: neural network.</p></fn><fn id="table3fn7"><p><sup>g</sup>RF: random forest.</p></fn><fn id="table3fn8"><p><sup>h</sup>SVM: support vector machine.</p></fn><fn id="table3fn9"><p><sup>i</sup>XGBoost: extreme gradient boosting.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The Dunnett test with XGBoost<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> as a control and the rest of the models as comparisons.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Group 1</td><td align="left" valign="top">Group 2 (control)</td><td align="left" valign="top">CEM<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> difference (group 1 &#x2013; group 2; 95% family-wise CI)</td><td align="left" valign="top"><italic>P</italic> 
value</td></tr></thead><tbody><tr><td align="left" valign="top">EuroSCORE<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> II</td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">&#x2212;0.1876 (&#x2212;0.1881 to &#x2212;0.1870)</td><td align="left" valign="top">&#x003C;2&#x00D7;10<sup>&#x2013;16</sup><sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">LR<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">&#x2212;0.0110 (&#x2212;0.0116 to &#x2212;0.0105)</td><td align="left" valign="top">&#x003C;2&#x00D7;10<sup>&#x2013;16</sup><sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">NN<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">&#x2212;0.0148 (&#x2212;0.0154 to &#x2212;0.0142)</td><td align="left" valign="top">&#x003C;2&#x00D7;10<sup>&#x2013;16</sup><sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">RF<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">&#x2212;0.0009 (&#x2212;0.0015 to &#x2212;0.0003)</td><td align="left" valign="top">.00039<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">Weighted SVM<sup><xref ref-type="table-fn" rid="table4fn8">h</xref></sup></td><td align="left" valign="top">XGBoost</td><td align="left" valign="top">&#x2212;0.0941 (&#x2212;0.0947 to &#x2212;0.0935)</td><td align="left" valign="top">&#x003C;2&#x00D7;10<sup>&#x2013;16</sup><sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>XGBoost: extreme gradient boosting.</p></fn><fn 
id="table4fn2"><p><sup>b</sup>CEM: clinical effectiveness metric.</p></fn><fn id="table4fn3"><p><sup>c</sup>EuroSCORE: European System for Cardiac Operative Risk Evaluation.</p></fn><fn id="table4fn4"><p><sup>d</sup><italic>P</italic>&#x003C;.001.</p></fn><fn id="table4fn5"><p><sup>e</sup>LR: logistic regression.</p></fn><fn id="table4fn6"><p><sup>f</sup>NN: neural network.</p></fn><fn id="table4fn7"><p><sup>g</sup>RF: random forest.</p></fn><fn id="table4fn8"><p><sup>h</sup>SVM: support vector machine.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Drift Analysis</title><sec id="s3-3-1"><title>Overall CEM</title><p><xref ref-type="fig" rid="figure2">Figure 2A</xref> shows that XGBoost and RF were candidates for the best overall CEM performance across time. There was minor evidence of LR outperforming NN across time. Seasonal fluctuations were observed. EuroSCORE II performed the worst across time, followed by SVM.</p><p>There was strong evidence of a decrease in overall performance across all models (<italic>P</italic>&#x003C;.0001). Linear regression plots showed that XGBoost had the best starting CEM (intercept: 0.755 vs 0.753 [RF], 0.742 [LR], and 0.741 [NN]), but the rate of performance decrease (slope &#x2212;0.000720) was less than NN (&#x2212;0.00083) and greater than RF (&#x2212;0.000685) and LR (&#x2212;0.000696; <xref ref-type="fig" rid="figure3">Figure 3A-C</xref> and Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). By March 2019, the overall CEM performance ranking was not changed, with XGBoost performing the best, followed by RF, LR, and NN. EuroSCORE II (intercept 0.484; slope &#x2212;0.000847) performed the worst in terms of starting CEM and rate of performance decrease, followed by SVM (intercept 0.658; slope &#x2212;0.000625; <xref ref-type="fig" rid="figure3">Figure 3D</xref> and Figure S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Normality and homogeneity assumptions were satisfied for all models&#x2019; CEM values, as checked by a QQ plot of residuals and scale-location plot (Figure S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>(A) Plot of CEM values by model and time. Geometric mean (95% CI) of 1000 bootstraps at each time point is shown. The horizontal line represents the CEM geometric mean of all models. (B) Box plot of difference in models&#x2019; CEM values across the first 3 months of 2017 and 2019. Kruskal-Wallis results for CEM across the time points are shown. (C) Paired-samples Wilcoxon test (Wilcoxon signed rank test) for the first 3 months of 2019 bootstrap CEM values. <italic>P</italic> values are adjusted using the Bonferroni method. ****<italic>P</italic>&#x003C;.0001. CEM: clinical effectiveness metric; EuroSCORE: European System for Cardiac Operative Risk Evaluation; ns: not significant; neuronetwork: neural network; SVM: support vector machine; Xgboost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v5i1e45973_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Plots of CEM values by model and time: (A) XGBoost, (B) random forest, (C) logistic regression, and (D) EuroSCORE II. The geometric mean of 1000 bootstraps at each time point is shown. The red dotted line shows linear regression, and the blue line shows generalized additive model fit. Parameters and <italic>P</italic> values for the linear regressions are shown. (E) Discrimination (AUC) performance drift by time. Linear regression lines are plotted for each model, with slope, intercept, and <italic>P</italic> values displayed in the legend. (F) Calibration (adjusted ECE) performance drift by time. 
Linear regression lines are plotted for each model, with slope, intercept and <italic>P</italic> values displayed in the legend. SVM and EuroSCORE II are removed to enable a clearer separation of models with similar performance. AUC: area under the curve; CEM: clinical effectiveness metric; ECE: expected calibration error; EuroSCORE: European System for Cardiac Operative Risk Evaluation; neuronetwork: neural network; SVM: support vector machine; Xgboost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v5i1e45973_fig03.png"/></fig></sec><sec id="s3-3-2"><title>Analysis Within the First 3 Months of 2017</title><p>No extreme outliers were found for the models&#x2019; CEM values in the first 3 months of 2017. The CEM values were nonnormally distributed for all models (<italic>P</italic>&#x003C;.05; Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). There was strong evidence of a difference across all models (<italic>P</italic>&#x003C;.0001; <xref ref-type="table" rid="table3">Table 3</xref> and Figure S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The Dunn test showed strong evidence of XGBoost having the best overall performance (Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>; <italic>P</italic>&#x003C;.0001), followed by RF, NN, and LR (CEM difference to XGBoost: &#x2212;0.0076, &#x2212;0.0124, and &#x2212;0.0138, respectively; <italic>P</italic>&#x003C;.0001). EuroSCORE II performed the worst, followed by weighted SVM (CEM difference to XGBoost: &#x2212;0.2739 and &#x2212;0.0961, respectively; <italic>P</italic>&#x003C;.0001).</p></sec><sec id="s3-3-3"><title>Analysis Within the First 3 Months of 2019</title><p>No extreme outliers were found for the models&#x2019; CEM values in the first 3 months of 2019. 
The CEM values were nonnormally distributed for 50% (3/6) of models (<italic>P</italic>&#x003C;.05). There was strong evidence of a difference across all models (<italic>P</italic>&#x003C;.0001; Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="fig" rid="figure2">Figure 2B</xref>). The Dunn test showed strong evidence of XGBoost having the best overall performance (Table S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>; <italic>P</italic>&#x003C;.05), followed by RF, LR, and NN (CEM difference to XGBoost: &#x2212;0.0032, &#x2212;0.0055, and &#x2212;0.0108, respectively; <italic>P</italic>&#x003C;.05). EuroSCORE II performed the worst, followed by weighted SVM (CEM difference to XGBoost: &#x2212;0.2594 and &#x2212;0.0856, respectively; <italic>P</italic>&#x003C;.0001).</p></sec><sec id="s3-3-4"><title>Analysis Between the First 3 Months of 2017 and 2019</title><p>No extreme outliers were found for the models&#x2019; CEM values in the first 3 months of 2017 and 2019. The CEM values were nonnormally distributed for the first 3 months of 2017 and 2019, as assessed by the Kolmogorov-Smirnov test (<italic>P</italic>&#x003C;.05). There was strong evidence of an overall difference across the 2 time points (<italic>P</italic>&#x003C;.0001; Table S9 and Figure S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). There was strong evidence of a difference across the 2 time points for each individual model (<italic>P</italic>&#x003C;.05; <xref ref-type="fig" rid="figure2">Figure 2C</xref> and Table S10 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). XGBoost retained the best overall performance across the time points examined. 
This model showed the largest decrease in CEM performance (median difference 0.0288; <italic>P</italic>&#x003C;.0001), followed by NN, RF, and LR (median difference: 0.0272, 0.0244, and 0.0205, respectively; <italic>P</italic>&#x003C;.0001). Following a performance decrease from 2017 to 2019, XGBoost still had the best overall performance, with RF being the second best (median CEM: 0.716 and 0.713, respectively). Although NN had a better starting performance than LR, the larger performance drift resulted in NN having a lower overall performance than LR in 2019 (median CEM: 0.705 vs 0.710). Although the performance drift was smaller, LR&#x2019;s CEM performance never exceeded RF&#x2019;s (median CEM: 0.710 vs 0.713). EuroSCORE II showed the least performance drift, followed by weighted SVM (median difference: 0.0142 and 0.0183, respectively; <italic>P</italic>&#x003C;.05), but both performed the worst in terms of absolute CEM value.</p></sec><sec id="s3-3-5"><title>Analysis of Discrimination, Calibration, and Clinical Effectiveness Drift</title><sec id="s3-3-5-1"><title>Discrimination</title><sec id="s3-3-5-1-1"><title>AUC</title><p>Linear regression plots showed that XGBoost had the best starting AUC (intercept: 0.843 vs 0.839 [RF] and 0.831 [LR, NN, and SVM]), but the rate of performance decrease was greater than RF and EuroSCORE II (slope: &#x2212;0.000678 vs &#x2212;0.000381 [RF] and &#x2212;0.000604 [EuroSCORE II]; <xref ref-type="fig" rid="figure3">Figure 3E</xref>). By March 2019, XGBoost&#x2019;s AUC had decreased below RF&#x2019;s, resulting in RF being the best-performing model, followed by XGBoost, SVM, LR, and NN. NN showed the largest rate of AUC decrease, followed by LR and SVM (slope: &#x2212;0.0014, &#x2212;0.00093, and &#x2212;0.000873, respectively). EuroSCORE II performed the worst in terms of AUC across all time points (intercept 0.766). 
There was strong evidence of a decrease in AUC performance across all models (<italic>P</italic>&#x003C;.0001). Normality and homogeneity assumptions were satisfied for all models&#x2019; AUC values, as checked by a QQ plot of residuals and scale-location plot (Figure S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-3-5-1-2"><title><italic>F</italic><sub>1</sub>-score</title><p>The best-performing model across all holdout time periods was XGBoost, followed by RF, LR, NN, SVM, and EuroSCORE II. There was strong evidence of a decrease in <italic>F</italic><sub>1</sub>-score performance across all models (<italic>P</italic>&#x003C;.0001). More details can be found in the <italic>Positive Outcome Discrimination</italic> section and Figures S9-S10 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec></sec></sec><sec id="s3-4"><title>Calibration</title><p>Linear regression plots showed that NN had the best starting adjusted ECE (intercept: 0.9907 vs 0.9903 [RF], 0.9902 [XGBoost], and 0.9898 [LR]), but the rate of performance decrease was greater than LR and RF (slope: &#x2212;5.29&#x00D7;10<sup>&#x2013;5</sup> vs &#x2212;2.93&#x00D7;10<sup>&#x2013;6</sup> [LR] and &#x2212;4.58&#x00D7;10<sup>&#x2013;5</sup> [RF]; <xref ref-type="fig" rid="figure3">Figure 3F</xref>). By March 2019, NN&#x2019;s adjusted ECE had decreased below LR&#x2019;s, resulting in LR being the best-performing model, followed by NN, RF, and XGBoost. Although SVM and EuroSCORE II had lower rates of adjusted ECE decrease (slope: &#x2212;0.000251 and &#x2212;0.000479, respectively), the calibration performance was much lower at all time points compared to the other models (Figure S11 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
There was strong evidence of a decrease in adjusted ECE performance across all models (<italic>P</italic>&#x003C;.0001), except LR (<italic>P</italic>&#x003E;.05). Normality and homogeneity assumptions were satisfied for all models&#x2019; adjusted ECE values, as checked by a QQ plot of residuals and scale-location plot (Figure S12 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-5"><title>Clinical Effectiveness</title><p>Linear regression plots showed that XGBoost had the best starting net benefit (intercept: 0.9051 vs 0.9043 [RF] and 0.9035 [NN and LR]), but the rate of performance decrease was greater than RF (slope: &#x2212;5.68&#x00D7;10<sup>&#x2013;5</sup> vs &#x2212;2.5&#x00D7;10<sup>&#x2013;6</sup>; <xref ref-type="fig" rid="figure4">Figure 4A</xref>), slower than LR (&#x2212;9.38&#x00D7;10<sup>&#x2013;5</sup>), and even slower than NN (&#x2212;0.000145). By March 2019, XGBoost&#x2019;s net benefit had decreased below RF&#x2019;s, resulting in RF being the best-performing model, followed by XGBoost, LR, and NN. EuroSCORE II showed the largest rate of net benefit decrease and performed the worst across all time points, followed by SVM (intercept: 0.314 and 0.690; slope: &#x2212;0.000846 and &#x2212;0.000364, respectively; Figure S13 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). There was strong evidence of a decrease in net benefit performance across all models (<italic>P</italic>&#x003C;.0001), except RF (<italic>P</italic>&#x003E;.05). Normality and homogeneity assumptions were satisfied for all models&#x2019; net benefit values, as checked by a QQ plot of residuals and scale-location plot (Figure S14 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>(A) Clinical effectiveness (net benefit) performance drift by time. 
Linear regression lines are plotted for each model, with slope, intercept, and <italic>P</italic> values displayed in the legend. SVM and EuroSCORE II are removed to enable a clearer separation of models with similar performance. (B) SHAP variable importance drift for the holdout set over 27 months (EuroSCORE II and XGBoost). Solid dots show geometric mean values of 5-fold cross-validation. Smoothed locally estimated scatterplot lines are plotted, with green bands showing 95% CIs. (C) SHAP variable importance drift for the holdout set over 27 months for the top 6 most important variables (EuroSCORE II and XGBoost). The trends are unsmoothed. (D) Operative urgency data set drift across time for the holdout set. The percentages of each category are shown for each time point. CCS: Canadian Cardiovascular Society; CPS: critical preoperative state; EuroSCORE: European System for Cardiac Operative Risk Evaluation; ES: EuroSCORE; LV: left ventricle; MI: myocardial infarction; neuronetwork: neural network; NYHA: New York Heart Association; PA: pulmonary artery; PVD: peripheral vascular disease; SHAP: Shapley additive explanations; SVM: support vector machine; Xgboost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v5i1e45973_fig04.png"/></fig></sec><sec id="s3-6"><title>Accuracy of Prediction Probability</title><p>By March 2019, XGBoost was the best model, followed by RF, LR, and NN. EuroSCORE II performed the worst in terms of adjusted Brier score and rate of decrease, followed by SVM. There was strong evidence of a decrease in adjusted Brier score performance across all models (<italic>P</italic>&#x003C;.0001), except XGBoost and RF. 
More details can be found in the <italic>Accuracy of Prediction Probability</italic> section and Figures S15-S17 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-7"><title>Analysis of Variable Importance Drift</title><p>SHAP mean absolute magnitude of importance was used to measure variable importance drift for the best temporal and nontemporal model (XGBoost). Smoothed trend lines showed substantial drift in numerous variables, including the most important variables: age, operative urgency, the weight of intervention, New York Heart Association classification, renal impairment, and previous cardiac surgery (<xref ref-type="fig" rid="figure4">Figure 4B</xref>). The sensitivity analysis showed a substantial drift in variable importance across the holdout set for all 6 variables (<xref ref-type="fig" rid="figure4">Figure 4C</xref>). When compared with the CEM performance drop from October to December 2017 and from June to July 2018 (<xref ref-type="fig" rid="figure3">Figure 3</xref> generalized additive model line), it could be seen that the CEM decrease was mirrored by decreases in the importance of the top variables, age and operative urgency, at these time periods (<xref ref-type="fig" rid="figure4">Figure 4C</xref>). A decrease in CEM performance in the first 3 months of 2019 was likely to be at least partly attributable to the sudden rise in the importance of the weight of intervention (<xref ref-type="fig" rid="figure3">Figure 3</xref> and <xref ref-type="fig" rid="figure4">Figure 4B and C</xref>).</p></sec><sec id="s3-8"><title>Data Set Drift Across Time</title><p>Data set drift was observed throughout the holdout time periods for operative urgency, with sharp drifts observed across all categories from November to December 2017 and from June to July 2018 (<xref ref-type="fig" rid="figure4">Figure 4D</xref>). 
Data set drift was observed across the holdout time periods for the &#x003C;60 and &#x003E;60 years patient age groups (Figure S18 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), with marked data drifts observed from October to November 2017 and from July to August 2018. Data set drift was observed across the holdout time periods for the weight of intervention (Figure S19 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Sharp data set drifts were observed for the single non-CABG and 3 procedures categories from December 2018 to February 2019.</p></sec><sec id="s3-9"><title>Net Benefit Projection</title><p>To further understand the clinical significance of performance drift over time, <xref ref-type="fig" rid="figure5">Figure 5</xref> illustrates the expected net benefit decrease for the NN and XGBoost models. The blue line depicts the actual net benefit drop for the NN model (as represented by the slope), transitioning to the projected red line after March 2019. The green line represents the actual net benefit drop for the XGBoost model up to March 2019, changing to the projected purple line after March 2019. A clinically significant decrease (from 0.9035 to 0.8808) is shown for NN but not for XGBoost (from 0.9051 to 0.8962).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>The actual and projected net benefit drift for the NN and XGBoost models over time. 
NN: neural network; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="xmed_v5i1e45973_fig05.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The main finding of the study was that the XGBoost model performed the best, followed by RF, LR, and NN, when all metrics were simultaneously considered, both temporally and nontemporally. 
Furthermore, EuroSCORE II substantially underperformed against all ML models across all comparisons; this presents an urgent need to understand the drift effects of this score and is not limited to calibration drift. By first combining all metrics and then analyzing the temporal drift of each metric individually, we were able to determine the contribution of individual metrics to the overall performance drift of each model. We found strong evidence that all models showed a decrease in at least 3 of the 5 individual metrics within the CEM. This demonstrated the importance for clinicians and ML governance teams to actively monitor the effects of data set drift (as explained later) on &#x201C;big data&#x201D; models that are prepared for or being clinically used to minimize the risk of harm to patients.</p><p>&#x201C;Big data&#x201D; refers to large and detailed data sets that are suited to ML analyses rather than traditional statistical analyses [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>], and they are increasingly used in health care. These analyses can inform, personalize, and potentially improve care [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. 
Despite growing interest [<xref ref-type="bibr" rid="ref49">49</xref>] in ML and health care data linkage initiatives such as the Cardiac Quality Assurance Programme in the United Kingdom [<xref ref-type="bibr" rid="ref50">50</xref>], there have been limited reports of use within cardiac surgery [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>], with one of the main reasons being a lack of understanding by clinicians of the underlying processes [<xref ref-type="bibr" rid="ref54">54</xref>].</p><p>As more countries follow in the steps of the United States to deploy ML in medical settings [<xref ref-type="bibr" rid="ref55">55</xref>], it becomes increasingly critical that clinicians and ML governance teams are adequately prepared for situations in which ML systems fail to perform their intended functions [<xref ref-type="bibr" rid="ref56">56</xref>]. A major factor in ML malfunction is &#x201C;data set drift,&#x201D; where ML performance declines due to a mismatch between the data on which the model was trained and the new unseen data to which the model is applied [<xref ref-type="bibr" rid="ref57">57</xref>]. Several factors have been reported to influence data set drift, including changes in technology, demographics, and patient or clinician behavior [<xref ref-type="bibr" rid="ref56">56</xref>].</p><p>In our previous systematic review, we found that despite ML models achieving better discriminatory ability than traditional LR approaches, few cardiac surgery studies assessed calibration, clinical utility, discrimination, and data set drift collectively; these aspects should be assessed to determine the clinical implications of ML [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Our previous study [<xref ref-type="bibr" rid="ref19">19</xref>], although not involving the assessment of XGBoost, had also shown that the calibration drift of LR was less than that of RF, whereas EuroSCORE I, na&#x00EF;ve Bayes, and NN performed poorly in terms of calibration. A recent study extending on our work had shown temporal and spatial calibration drift (comparison across regions and hospitals) to be severe across a range of ML models using a national Chinese registry [<xref ref-type="bibr" rid="ref20">20</xref>]. In accordance with our view, the study highlighted that &#x201C;future efforts may need to shift more towards enhancing model calibration robustness or recalibration for greater practical value&#x201D; and that the inclusion of intraoperative variables may be important to enhancing model performance. The STS Adult Cardiac Surgery Database study [<xref ref-type="bibr" rid="ref21">21</xref>] had shown that the inclusion of intraoperative variables improved both the discrimination and calibration performance of XGBoost and LR models in patients who underwent CABG from the United States. 
Although calibration drift over time is well documented among EuroSCORE and LR models for hospital mortality, the susceptibility of competing ML modeling methods to data set drift has not been well studied in cardiac surgery [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>This study heeds the call for additional metrics to address the lack of sensitivity of the most commonly used C-statistic and calibration slope in capturing the advantage of ML models [<xref ref-type="bibr" rid="ref58">58</xref>]; we demonstrated the use of a consensus score [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref59">59</xref>-<xref ref-type="bibr" rid="ref61">61</xref>] named CEM to take into account numerous metrics that have been found to be beneficial, covering overall accuracy [<xref ref-type="bibr" rid="ref58">58</xref>], discrimination, calibration, and clinical utility. We wanted to analyze model performance across multiple metrics across time in this study.</p><p>This study showed invariance in model ranking for the CEM in both temporal and nontemporal analyses, indicating that there is value for this consensus scoring approach in performance drift evaluation.</p><p>This study also addresses the gap in understanding the effect of data set drift on the performance of ML and traditional models over time, which presents a barrier to their clinical application. 
The shift between XGBoost and RF having the best performance for AUC and net benefit and between NN and LR having the best performance for &#x201C;adjusted ECE&#x201D; demonstrates that the comparison of models at a single time point was insufficient to understand the clinical limitations of ML models and that at least 2 time points should be considered.</p><p>Our study has also found that although RF showed comparable discrimination (AUC) and clinical utility (net benefit) performance across time, the reason for XGBoost&#x2019;s superior overall temporal performance was in its better overall accuracy (adjusted Brier score) and positive outcome discrimination (<italic>F</italic><sub>1</sub>-score). <italic>F</italic><sub>1</sub>-score is often overlooked but is especially important in cardiac surgery data sets, whereby the incidence for the outcome of interest is typically very low and introduces bias in the performance evaluation when AUC is used. We found that RF performed the second best overall. Unlike XGBoost, RF performed better in terms of resistance to drift for AUC and net benefit, suggesting that further work is required to determine whether the synergistic (ensemble) effects across models are beneficial for improving cardiac surgery risk prediction.</p><p>Although XGBoost is currently the best temporal and nontemporal model for the NACSA data set, periodic monitoring of performance drift for each yearly revision of this data set should be mandated to determine whether or not performance has been overtaken by RF, and if so, at what point in time this happens [<xref ref-type="bibr" rid="ref56">56</xref>]. As all models showed strong evidence of a decrease in overall performance from January 2017 to March 2019, further work will be required to develop either better-performing models or models that are less susceptible to performance drift. 
However, through projecting the net benefit into the year 2030 based on the fitted linear regression, the decreases in the net benefit for XGBoost over time were shown to be clinically insignificant. On the contrary, the NN model showed a clinically significant drop in net benefit.</p><p>Although the reported decreases in measures such as CEM and AUC may appear small, such changes are likely to impact the potential use of ML models within clinical scenarios. If such models are to be used clinically for making decisions about the patient, even small changes in these metrics (which have been previously discussed [<xref ref-type="bibr" rid="ref18">18</xref>] to be important in cardiac surgery ML performance) can have an influence on risk assessment and patient outcomes, necessitating constant model drift monitoring. Prior research has shown that improving model calibration robustness or recalibration is necessary for practical value and that &#x201C;the significant decline in performance of previously established models in this study calls for continuing model updates&#x201D; [<xref ref-type="bibr" rid="ref20">20</xref>]. It is envisaged that collaboration between physicians and ML scientists is critical. Before mandating model updates, it is critical to establish metric-specific thresholds for acceptable reductions. A consensus approach, extensive experience in this area, or a meta-analysis of current literature may be required for this collaborative decision-making process.</p><p>We have demonstrated that by associating relationships between smoothed [<xref ref-type="bibr" rid="ref62">62</xref>] and unsmoothed trend lines for CEM performance and EuroSCORE II variable importance, it was possible to detect subtle data set drifts that could result in model performance drifts. 
Our findings of variable importance and data set drift from October to December 2017, from June to July 2018, and from December 2018 to February 2019 are likely to reflect seasonality changes and mirrored effects of sharp drifts in CEM performance across models. The detection of data set drift was verified by checking for actual drifts in the data set variables. A noncardiac surgery study used actual data set drift to check for variable importance&#x2013;detected data set drift [<xref ref-type="bibr" rid="ref13">13</xref>]. However, drift in the actual data set was only analyzed across 2 data points [<xref ref-type="bibr" rid="ref13">13</xref>], without consideration for smoothed and unsmoothed relationships across performance, variable importance, and actual variable incidence. This study provides the foundations on which further work analyzing ML performance drift is recommended: analyzing relationships between drifts in a consensus score such as CEM and in variable importance, followed by the confirmation of any detected drifts using actual data set trends (data set drift).</p></sec><sec id="s4-2"><title>Limitations and Future Studies</title><p>Although statistical rigor has been applied to determine whether performance drift is a barrier to clinical risk modeling and decision-making, further work could be done to apply more statistically sensitive approaches for comparing the interactions of trends in data set drift, performance drift, and variable importance drift. As NACSA patient identifiers and the Hospital Episode Statistics data set were not available for linkage, it was not possible to determine whether any individual patients who had multiple surgeries appeared in both the training and validation set and the holdout set. Clinical judgment suggests that the proportion of multiple surgeries would be very low. Nonetheless, future work should consider the collection of such information to minimize any potential bias. 
Our previous work using CEM and constituent metrics to study random effects ML had also shown that hospital-related systematic variations may be better adjusted for by including hospital location variables as part of the input covariates rather than specifically using mixed effects ML models [<xref ref-type="bibr" rid="ref17">17</xref>]. Future work may consider the incorporation of such systematic variation adjustments when studying drift effects to further investigate the optimal approach for modeling drift across individual hospitals. Although CEM is a consensus score that enhances the clinical evaluation of complex relationships across different aspects of model performance, compressing the net benefit measure into a single value would mean that further DCA may be required if individual-specific, threshold-based decisions were to be fully considered. Future studies should also delve deeper into the relationships of the studied drift types with concept drift in cardiac surgery risk prediction.</p></sec><sec id="s4-3"><title>Conclusion</title><p>This study found that performance drift of ML and EuroSCORE II over time could be explained through data set drift patterns in cardiac surgery risk prediction. It was also found that variable importance drift could help to explain performance drift and support the detection of data set drift in the assessed models. The strong evidence of all models showing a decrease in at least 3 of the 5 individual metrics within CEM demonstrates the potential need to update the models over time, but future work is required to determine suitable thresholds for mandating an update. 
Future work will be required to determine the interplay between XGBoost and RF, which have demonstrated less drift over time, and whether combining these models through additional ensemble modeling could take advantage of their respective performance advantages.</p></sec></sec></body><back><ack><p>This work was supported by a grant from the British Heart Foundation&#x2013;Turing Institute, the National Institute for Health and Care Research (NIHR) Biomedical Research Centre at University Hospitals Bristol, Weston NHS Foundation Trust, and the University of Bristol.</p></ack><notes><sec><title>Data Availability</title><p>All data used in this study are from the National Adult Cardiac Surgery Audit (NACSA) data set. These data may be requested from the Healthcare Quality Improvement Partnership (HQIP) [<xref ref-type="bibr" rid="ref63">63</xref>]. Code for deriving training, update, and holdout data sets is available on GitHub upon reasonable request to the corresponding author, and the authors can provide confirmatory deidentified record IDs for each set upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>TD, SS, AD, DF, JC, BZ, PN, UB, AJ, and GA contributed to the experimental design. TD and SS acquired the data. TD and SS performed the data preprocessing. TD wrote the source code to perform the experiments and is accountable for all aspects of the work. TD, SS, AD, DF, JC, BZ, PN, AJ, and GA analyzed the results. TD wrote the first version of the paper. 
All authors revised the paper and approved the submission.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">CABG</term><def><p>coronary artery bypass grafting</p></def></def-item><def-item><term id="abb3">CEM</term><def><p>clinical effectiveness metric</p></def></def-item><def-item><term id="abb4">CONSORT</term><def><p>Consolidated Standards of Reporting Trials</p></def></def-item><def-item><term id="abb5">DCA</term><def><p>decision curve analysis</p></def></def-item><def-item><term id="abb6">ECE</term><def><p>expected calibration error</p></def></def-item><def-item><term id="abb7">EuroSCORE</term><def><p>European System for Cardiac Operative Risk Evaluation</p></def></def-item><def-item><term id="abb8">LR</term><def><p>logistic regression</p></def></def-item><def-item><term id="abb9">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb10">NACSA</term><def><p>National Adult Cardiac Surgery Audit</p></def></def-item><def-item><term id="abb11">NN</term><def><p>neural network</p></def></def-item><def-item><term id="abb12">RF</term><def><p>random forest</p></def></def-item><def-item><term id="abb13">ROC</term><def><p>receiver operating characteristic</p></def></def-item><def-item><term id="abb14">SHAP</term><def><p>Shapley additive explanations</p></def></def-item><def-item><term id="abb15">SinoSCORE II</term><def><p>Sino (Chinese) System for Coronary Artery Bypass Grafting Operative Risk Evaluation II</p></def></def-item><def-item><term id="abb16">STS</term><def><p>Society of Thoracic Surgeons</p></def></def-item><def-item><term id="abb17">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb18">XGBoost</term><def><p>extreme gradient 
boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref 
id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Reinertsen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Prediction of operative mortality for patients undergoing cardiac surgical procedures without established risk scores</article-title><source>J Thorac Cardiovasc Surg</source><year>2023</year><month>04</month><volume>165</volume><issue>4</issue><fpage>1449</fpage><lpage>1459</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2021.09.010</pub-id><pub-id pub-id-type="medline">34607725</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Benedetto</surname><given-names>U</given-names> </name><name name-style="western"><surname>Dimagli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Machine learning improves mortality risk prediction after cardiac surgery: systematic review and meta-analysis</article-title><source>J Thorac Cardiovasc Surg</source><year>2022</year><month>06</month><volume>163</volume><issue>6</issue><fpage>2075</fpage><lpage>2087</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2020.07.105</pub-id><pub-id pub-id-type="medline">32900480</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kieser</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>MS</given-names> </name><name 
name-style="western"><surname>Head</surname><given-names>SJ</given-names> </name></person-group><article-title>Comparison of logistic EuroSCORE and EuroSCORE II in predicting operative mortality of 1125 total arterial operations</article-title><source>Eur J Cardiothorac Surg</source><year>2016</year><month>09</month><volume>50</volume><issue>3</issue><fpage>509</fpage><lpage>518</lpage><pub-id pub-id-type="doi">10.1093/ejcts/ezw072</pub-id><pub-id pub-id-type="medline">27005979</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poullis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pullan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chalmers</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mediratta</surname><given-names>N</given-names> </name></person-group><article-title>The validity of the original EuroSCORE and EuroSCORE II in patients over the age of seventy</article-title><source>Interact Cardiovasc Thorac Surg</source><year>2015</year><month>02</month><volume>20</volume><issue>2</issue><fpage>172</fpage><lpage>177</lpage><pub-id pub-id-type="doi">10.1093/icvts/ivu345</pub-id><pub-id pub-id-type="medline">25348730</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>GX</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Validation of EuroSCORE II in Chinese patients undergoing heart valve surgery</article-title><source>Heart Lung 
Circ</source><year>2013</year><month>08</month><volume>22</volume><issue>8</issue><fpage>606</fpage><lpage>611</lpage><pub-id pub-id-type="doi">10.1016/j.hlc.2012.12.012</pub-id><pub-id pub-id-type="medline">23375874</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Silaschi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Conradi</surname><given-names>L</given-names> </name><name name-style="western"><surname>Seiffert</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Predicting risk in transcatheter aortic valve implantation: comparative analysis of EuroSCORE II and established risk stratification tools</article-title><source>Thorac Cardiovasc Surg</source><year>2015</year><month>09</month><volume>63</volume><issue>6</issue><fpage>472</fpage><lpage>478</lpage><pub-id pub-id-type="doi">10.1055/s-0034-1389107</pub-id><pub-id pub-id-type="medline">25191764</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carnero-Alc&#x00E1;zar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Silva Guisasola</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Reguillo Lacruz</surname><given-names>FJ</given-names> </name><etal/></person-group><article-title>Validation of EuroSCORE II on a single-centre 3800 patient cohort</article-title><source>Interact Cardiovasc Thorac Surg</source><year>2013</year><month>03</month><volume>16</volume><issue>3</issue><fpage>293</fpage><lpage>300</lpage><pub-id pub-id-type="doi">10.1093/icvts/ivs480</pub-id><pub-id pub-id-type="medline">23178391</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Arangalage</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cimadevilla</surname><given-names>C</given-names> </name><name name-style="western"><surname>Alkhoder</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Agreement between the new EuroSCORE II, the logistic EuroSCORE and the Society of Thoracic Surgeons score: implications for transcatheter aortic valve implantation</article-title><source>Arch Cardiovasc Dis</source><year>2014</year><volume>107</volume><issue>6-7</issue><fpage>353</fpage><lpage>360</lpage><pub-id pub-id-type="doi">10.1016/j.acvd.2014.05.002</pub-id><pub-id pub-id-type="medline">24996564</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atashi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Amini</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tashnizi</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>External validation of European System for Cardiac Operative Risk Evaluation II (EuroSCORE II) for risk prioritization in an Iranian population</article-title><source>Braz J Cardiovasc Surg</source><year>2018</year><volume>33</volume><issue>1</issue><fpage>40</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.21470/1678-9741-2017-0030</pub-id><pub-id pub-id-type="medline">29617500</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Provench&#x00E8;re</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chevalier</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ghodbane</surname><given-names>W</given-names> 
</name><etal/></person-group><article-title>Is the EuroSCORE II reliable to estimate operative mortality among octogenarians?</article-title><source>PLoS One</source><year>2017</year><month>11</month><day>16</day><volume>12</volume><issue>11</issue><fpage>e0187056</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0187056</pub-id><pub-id pub-id-type="medline">29145434</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nilsson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ohlsson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Thulin</surname><given-names>L</given-names> </name><name name-style="western"><surname>H&#x00F6;glund</surname><given-names>P</given-names> </name><name name-style="western"><surname>Nashef</surname><given-names>SAM</given-names> </name><name name-style="western"><surname>Brandt</surname><given-names>J</given-names> </name></person-group><article-title>Risk factor identification and mortality prediction in cardiac surgery using artificial neural networks</article-title><source>J Thorac Cardiovasc Surg</source><year>2006</year><month>07</month><volume>132</volume><issue>1</issue><fpage>12</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2005.12.055</pub-id><pub-id pub-id-type="medline">16798296</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kurlansky</surname><given-names>P</given-names> </name></person-group><article-title>Commentary: the risk of risk models</article-title><source>J Thorac Cardiovasc Surg</source><year>2020</year><month>07</month><volume>160</volume><issue>1</issue><fpage>181</fpage><lpage>182</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2019.12.063</pub-id><pub-id 
pub-id-type="medline">32007255</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duckworth</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chmiel</surname><given-names>FP</given-names> </name><name name-style="western"><surname>Burns</surname><given-names>DK</given-names> </name><etal/></person-group><article-title>Using explainable machine learning to characterise data drift and detect emergent health risks for emergency department admissions during COVID-19</article-title><source>Sci Rep</source><year>2021</year><month>11</month><day>26</day><volume>11</volume><issue>1</issue><fpage>23017</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-02481-y</pub-id><pub-id pub-id-type="medline">34837021</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhai</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Cardiac surgery risk prediction using ensemble machine learning to incorporate legacy risk scores: a benchmarking study</article-title><source>Digit Health</source><year>2023</year><month>07</month><day>20</day><volume>9</volume><fpage>20552076231187605</fpage><pub-id pub-id-type="doi">10.1177/20552076231187605</pub-id><pub-id pub-id-type="medline">37492033</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hickey</surname><given-names>GL</given-names> </name><name name-style="western"><surname>Blackstone</surname><given-names>EH</given-names> 
</name></person-group><article-title>External model validation of binary clinical risk prediction models in cardiovascular and thoracic surgery</article-title><source>J Thorac Cardiovasc Surg</source><year>2016</year><month>08</month><volume>152</volume><issue>2</issue><fpage>351</fpage><lpage>355</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2016.04.023</pub-id><pub-id pub-id-type="medline">27215928</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>X</given-names> </name></person-group><article-title>The effect of color on short-term memory in information visualization</article-title><source>VINCI &#x2019;16: Proceedings of the 9th International Symposium on Visual Information Communication and Interaction</source><year>2016</year><publisher-name>Association for Computing Machinery</publisher-name><fpage>144</fpage><lpage>145</lpage><pub-id pub-id-type="doi">10.1145/2968220.2968237</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fudulu</surname><given-names>DP</given-names> </name><etal/></person-group><article-title>Random effects adjustment in machine learning models for cardiac surgery risk prediction: a benchmarking study</article-title><source>medRxiv</source><comment>Preprint posted online on Jun 12, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.06.08.23291129</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Dong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dimagli</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Comparison of machine learning techniques in prediction of mortality following cardiac surgery: analysis of over 220 000 patients from a large national database</article-title><source>Eur J Cardiothorac Surg</source><year>2023</year><month>06</month><day>1</day><volume>63</volume><issue>6</issue><fpage>ezad183</fpage><pub-id pub-id-type="doi">10.1093/ejcts/ezad183</pub-id><pub-id pub-id-type="medline">37154705</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Benedetto</surname><given-names>U</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lyon</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Can machine learning improve mortality prediction following cardiac surgery?</article-title><source>Eur J Cardiothorac Surg</source><year>2020</year><month>12</month><day>1</day><volume>58</volume><issue>6</issue><fpage>1130</fpage><lpage>1136</lpage><pub-id pub-id-type="doi">10.1093/ejcts/ezaa229</pub-id><pub-id pub-id-type="medline">32810233</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Comparative analysis of machine learning vs. 
traditional modeling approaches for predicting in-hospital mortality after cardiac surgery: temporal and spatial external validation based on a nationwide cardiac surgery registry</article-title><source>Eur Heart J Qual Care Clin Outcomes</source><year>2024</year><month>03</month><day>1</day><volume>10</volume><issue>2</issue><fpage>121</fpage><lpage>131</lpage><pub-id pub-id-type="doi">10.1093/ehjqcco/qcad028</pub-id><pub-id pub-id-type="medline">37218710</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Durant</surname><given-names>TJS</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Toward dynamic risk prediction of outcomes after coronary artery bypass graft: improving risk prediction with intraoperative events using gradient boosting</article-title><source>Circ Cardiovasc Qual Outcomes</source><year>2021</year><month>06</month><volume>14</volume><issue>6</issue><fpage>e007363</fpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.120.007363</pub-id><pub-id pub-id-type="medline">34078100</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Benedetto</surname><given-names>U</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Deep recurrent reinforced learning model to compare the efficacy of targeted local versus national measures on the spread of COVID-19 in the UK</article-title><source>BMJ 
Open</source><year>2022</year><month>02</month><day>21</day><volume>12</volume><issue>2</issue><fpage>e048279</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-048279</pub-id><pub-id pub-id-type="medline">35190408</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kamaleswaran</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mahajan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Akbilgic</surname><given-names>O</given-names> </name></person-group><article-title>A robust deep convolutional neural network for the classification of abnormal cardiac rhythm using single lead electrocardiograms of variable length</article-title><source>Physiol Meas</source><year>2018</year><month>03</month><day>27</day><volume>39</volume><issue>3</issue><fpage>035006</fpage><pub-id pub-id-type="doi">10.1088/1361-6579/aaaa9d</pub-id><pub-id pub-id-type="medline">29369044</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jie</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name></person-group><article-title>A sufficient condition for convergences of Adam and RMSProp</article-title><conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 15 to 20, 2019:</conf-date><conf-loc>Long Beach, CA</conf-loc><fpage>11119</fpage><lpage>11127</lpage><pub-id 
pub-id-type="doi">10.1109/CVPR.2019.01138</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chakraborty</surname><given-names>D</given-names> </name><name name-style="western"><surname>Awolusi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name></person-group><article-title>An explainable machine learning model to predict and elucidate the compressive behavior of high-performance concrete</article-title><source>Results Eng</source><year>2021</year><month>09</month><volume>11</volume><fpage>100245</fpage><pub-id pub-id-type="doi">10.1016/j.rineng.2021.100245</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hickey</surname><given-names>GL</given-names> </name><name name-style="western"><surname>Grant</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Cosgriff</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Clinical registries: governance, management, analysis and applications</article-title><source>Eur J Cardiothorac Surg</source><year>2013</year><month>10</month><volume>44</volume><issue>4</issue><fpage>605</fpage><lpage>614</lpage><pub-id pub-id-type="doi">10.1093/ejcts/ezt018</pub-id><pub-id pub-id-type="medline">23371972</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarica</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cerasa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Quattrone</surname><given-names>A</given-names> </name></person-group><article-title>Random forest 
algorithm for the classification of neuroimaging data in Alzheimer's disease: a systematic review</article-title><source>Front Aging Neurosci</source><year>2017</year><month>10</month><day>6</day><volume>9</volume><fpage>329</fpage><pub-id pub-id-type="doi">10.3389/fnagi.2017.00329</pub-id><pub-id pub-id-type="medline">29056906</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Prabhakararao</surname><given-names>E</given-names> </name><name name-style="western"><surname>Dandapat</surname><given-names>S</given-names> </name></person-group><article-title>A weighted SVM based approach for automatic detection of posterior myocardial infarction using VCG signals</article-title><conf-name>2019 National Conference on Communications (NCC)</conf-name><conf-date>Feb 20 to 23, 2019:</conf-date><conf-loc>Bangalore, India</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1109/NCC.2019.8732238</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rajliwall</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Davey</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chetty</surname><given-names>G</given-names> </name></person-group><article-title>Cardiovascular risk prediction based on XGBoost</article-title><conf-name>2018 5th Asia-Pacific World Congress on Computer Science and Engineering (APWC on CSE)</conf-name><conf-date>Dec 10 to 12, 2018:</conf-date><conf-loc>Nadi, Fiji</conf-loc><fpage>246</fpage><lpage>252</lpage><pub-id pub-id-type="doi">10.1109/APWConCSE.2018.00047</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Kumar</surname><given-names>NK</given-names> </name><name name-style="western"><surname>Sindhu</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Prashanthi</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Sulthana</surname><given-names>AS</given-names> </name></person-group><article-title>Analysis and prediction of cardio vascular disease using machine learning classifiers</article-title><conf-name>2020 6th International Conference on Advanced Computing and Communication Systems (ICACCS)</conf-name><conf-date>Mar 6 to 7, 2020:</conf-date><conf-loc>Coimbatore, India</conf-loc><fpage>15</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1109/ICACCS48705.2020.9074183</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tiwari</surname><given-names>P</given-names> </name><name name-style="western"><surname>Colborn</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Xing</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rosenberg</surname><given-names>MA</given-names> </name></person-group><article-title>Assessment of a machine learning model applied to harmonized electronic health record data for the prediction of incident atrial fibrillation</article-title><source>JAMA Netw Open</source><year>2020</year><month>01</month><day>3</day><volume>3</volume><issue>1</issue><fpage>e1919396</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.19396</pub-id><pub-id pub-id-type="medline">31951272</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Allyn</surname><given-names>J</given-names> </name><name name-style="western"><surname>Allou</surname><given-names>N</given-names> </name><name name-style="western"><surname>Augustin</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A comparison of a machine learning model with EuroSCORE II in predicting mortality after elective cardiac surgery: a decision curve analysis</article-title><source>PLoS One</source><year>2017</year><month>01</month><day>6</day><volume>12</volume><issue>1</issue><fpage>e0169772</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0169772</pub-id><pub-id pub-id-type="medline">28060903</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrtash</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wells</surname><given-names>WM</given-names> </name><name name-style="western"><surname>Tempany</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Abolmaesumi</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kapur</surname><given-names>T</given-names> </name></person-group><article-title>Confidence calibration and predictive uncertainty estimation for deep medical image segmentation</article-title><source>IEEE Trans Med Imaging</source><year>2020</year><month>12</month><volume>39</volume><issue>12</issue><fpage>3868</fpage><lpage>3878</lpage><pub-id pub-id-type="doi">10.1109/TMI.2020.3006437</pub-id><pub-id pub-id-type="medline">32746129</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name 
name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>NR</given-names> </name><etal/></person-group><article-title>Assessing the performance of prediction models: a framework for traditional and novel measures</article-title><source>Epidemiology</source><year>2010</year><month>01</month><volume>21</volume><issue>1</issue><fpage>128</fpage><lpage>138</lpage><pub-id pub-id-type="doi">10.1097/EDE.0b013e3181c30fb2</pub-id><pub-id pub-id-type="medline">20010215</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Devaraj</surname><given-names>J</given-names> </name><name name-style="western"><surname>Madurai Elavarasan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pugazhendhi</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Forecasting of COVID-19 cases using deep learning models: is it reliable and practically significant?</article-title><source>Results Phys</source><year>2021</year><month>02</month><volume>21</volume><fpage>103817</fpage><pub-id pub-id-type="doi">10.1016/j.rinp.2021.103817</pub-id><pub-id pub-id-type="medline">33462560</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rohrbach</surname><given-names>L</given-names> </name><name name-style="western"><surname>Huebner</surname><given-names>P</given-names> </name></person-group><article-title>Application of multi-criteria decision making in bioink selection</article-title><conf-name>2021 Systems and Information Engineering Design Symposium (SIEDS)</conf-name><conf-date>Apr 29 to 30, 2021:</conf-date><conf-loc>Charlottesville, VA</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id 
pub-id-type="doi">10.1109/SIEDS52267.2021.9483762</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Armstrong</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Collopy</surname><given-names>F</given-names> </name></person-group><article-title>Error measures for generalizing about forecasting methods: empirical comparisons</article-title><source>Int J Forecast</source><year>1992</year><month>06</month><volume>8</volume><issue>1</issue><fpage>69</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1016/0169-2070(92)90008-W</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kacalak</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lipi&#x0144;ski</surname><given-names>D</given-names> </name><name name-style="western"><surname>R&#x00F3;&#x017C;a&#x0144;ski</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kr&#x00F3;lczyk</surname><given-names>GM</given-names> </name></person-group><article-title>Assessment of the classification ability of parameters characterizing surface topography formed in manufacturing and operation processes</article-title><source>Measurement</source><year>2021</year><month>01</month><volume>170</volume><fpage>108715</fpage><pub-id pub-id-type="doi">10.1016/j.measurement.2020.108715</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krej&#x010D;&#x00ED;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stoklasa</surname><given-names>J</given-names> </name></person-group><article-title>Aggregation in the analytic hierarchy process: why weighted geometric 
mean should be used instead of weighted arithmetic mean</article-title><source>Expert Syst Appl</source><year>2018</year><month>12</month><day>30</day><volume>114</volume><fpage>97</fpage><lpage>106</lpage><pub-id pub-id-type="doi">10.1016/j.eswa.2018.06.060</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gonz&#x00E1;lez-Estrada</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cosmes</surname><given-names>W</given-names> </name></person-group><article-title>Shapiro&#x2013;Wilk test for skew normal distributions based on data transformations</article-title><source>J Stat Comput Simul</source><year>2019</year><month>08</month><day>27</day><volume>89</volume><issue>17</issue><fpage>3258</fpage><lpage>3272</lpage><pub-id pub-id-type="doi">10.1080/00949655.2019.1658763</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="web"><article-title>Guidance for data quality assessment</article-title><source>United States Environmental Protection Agency</source><access-date>2022-02-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.epa.gov/quality/guidance-data-quality-assessment">https://www.epa.gov/quality/guidance-data-quality-assessment</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McLeod</surname><given-names>AI</given-names> </name></person-group><article-title>Improved spread-location visualization</article-title><source>J Comput Graph Stat</source><year>1999</year><volume>8</volume><issue>1</issue><fpage>135</fpage><lpage>141</lpage><pub-id pub-id-type="doi">10.1080/10618600.1999.10474806</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Barda</surname><given-names>N</given-names> </name><name name-style="western"><surname>Riesel</surname><given-names>D</given-names> </name><name name-style="western"><surname>Akriv</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Developing a COVID-19 mortality risk prediction model when individual-level data are not available</article-title><source>Nat Commun</source><year>2020</year><month>09</month><day>7</day><volume>11</volume><issue>1</issue><fpage>4439</fpage><pub-id pub-id-type="doi">10.1038/s41467-020-18297-9</pub-id><pub-id pub-id-type="medline">32895375</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>von Luxburg</surname><given-names>U</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A unified approach to interpreting model predictions</article-title><source>Advances in Neural Information Processing Systems 30 (NIPS 2017)</source><year>2017</year><access-date>2024-05-23</access-date><publisher-name>Curran Associates, Inc</publisher-name><fpage>1</fpage><lpage>10</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html">https://proceedings.neurips.cc/paper_files/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raghupathi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Raghupathi</surname><given-names>V</given-names> </name></person-group><article-title>Big data analytics in healthcare: promise and potential</article-title><source>Health Inf Sci Syst</source><year>2014</year><month>02</month><day>7</day><volume>2</volume><fpage>3</fpage><pub-id pub-id-type="doi">10.1186/2047-2501-2-3</pub-id><pub-id pub-id-type="medline">25825667</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Silverio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cavallo</surname><given-names>P</given-names> </name><name name-style="western"><surname>de Rosa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Galasso</surname><given-names>G</given-names> </name></person-group><article-title>Big health data and cardiovascular diseases: a challenge for research, an opportunity for clinical care</article-title><source>Front Med (Lausanne)</source><year>2019</year><month>02</month><day>25</day><volume>6</volume><fpage>36</fpage><pub-id pub-id-type="doi">10.3389/fmed.2019.00036</pub-id><pub-id pub-id-type="medline">30873409</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Prabakaran</surname><given-names>S</given-names> </name></person-group><article-title>Big data in digital healthcare: lessons learnt and recommendations for general practice</article-title><source>Heredity 
(Edinb)</source><year>2020</year><month>04</month><volume>124</volume><issue>4</issue><fpage>525</fpage><lpage>534</lpage><pub-id pub-id-type="doi">10.1038/s41437-020-0303-2</pub-id><pub-id pub-id-type="medline">32139886</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pencina</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>BA</given-names> </name><name name-style="western"><surname>D&#x2019;Agostino</surname><given-names>RB</given-names> </name></person-group><article-title>Prediction models &#x2014; development, evaluation, and clinical application</article-title><source>N Engl J Med</source><year>2020</year><month>04</month><day>23</day><volume>382</volume><issue>17</issue><fpage>1583</fpage><lpage>1586</lpage><pub-id pub-id-type="doi">10.1056/NEJMp2000589</pub-id><pub-id pub-id-type="medline">32320568</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ruiz</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Goldsmith</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Early prediction of clinical deterioration using data-driven machine-learning modeling of electronic health records</article-title><source>J Thorac Cardiovasc Surg</source><year>2022</year><month>07</month><volume>164</volume><issue>1</issue><fpage>211</fpage><lpage>222.e3</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2021.10.060</pub-id><pub-id pub-id-type="medline">34949457</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Kendall</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>R</given-names> </name><name name-style="western"><surname>Moorjani</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Adult cardiac surgery quality improvement and quality assurance: proposal to improve the audit. 2021 to 2024 and beyond</article-title><source>Society for Cardiothoracic Surgery in Great Britain and Ireland</source><year>2020</year><month>10</month><day>6</day><access-date>2024-05-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://scts.org/_userfiles/pages/files/adult%20cardiac/scts_adult_cardiac_surgery_accreditation_proposal_202124.pdf">https://scts.org/_userfiles/pages/files/adult%20cardiac/scts_adult_cardiac_surgery_accreditation_proposal_202124.pdf</ext-link></comment></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hernandez-Suarez</surname><given-names>DF</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Villablanca</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Machine learning prediction models for in-hospital mortality after transcatheter aortic valve replacement</article-title><source>JACC Cardiovasc Interv</source><year>2019</year><month>07</month><day>22</day><volume>12</volume><issue>14</issue><fpage>1328</fpage><lpage>1338</lpage><pub-id pub-id-type="doi">10.1016/j.jcin.2019.06.013</pub-id><pub-id pub-id-type="medline">31320027</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wojnarski</surname><given-names>CM</given-names> </name><name 
name-style="western"><surname>Roselli</surname><given-names>EE</given-names> </name><name name-style="western"><surname>Idrees</surname><given-names>JJ</given-names> </name><etal/></person-group><article-title>Machine-learning phenotypic classification of bicuspid aortopathy</article-title><source>J Thorac Cardiovasc Surg</source><year>2018</year><month>02</month><volume>155</volume><issue>2</issue><fpage>461</fpage><lpage>469.e4</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2017.08.123</pub-id><pub-id pub-id-type="medline">29042101</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A novel predictive model for poor in-hospital outcomes in patients with acute kidney injury after cardiac surgery</article-title><source>J Thorac Cardiovasc Surg</source><year>2023</year><month>03</month><volume>165</volume><issue>3</issue><fpage>1180</fpage><lpage>1191</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2021.04.085</pub-id><pub-id pub-id-type="medline">34112503</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Domaratzki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kidane</surname><given-names>B</given-names> </name></person-group><article-title>Deus ex machina? 
Demystifying rather than deifying machine learning</article-title><source>J Thorac Cardiovasc Surg</source><year>2022</year><month>03</month><volume>163</volume><issue>3</issue><fpage>1131</fpage><lpage>1137.e4</lpage><pub-id pub-id-type="doi">10.1016/j.jtcvs.2021.02.095</pub-id><pub-id pub-id-type="medline">33840471</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajkomar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>I</given-names> </name></person-group><article-title>Machine learning in medicine</article-title><source>N Engl J Med</source><year>2019</year><month>04</month><day>4</day><volume>380</volume><issue>14</issue><fpage>1347</fpage><lpage>1358</lpage><pub-id pub-id-type="doi">10.1056/NEJMra1814259</pub-id><pub-id pub-id-type="medline">30943338</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Finlayson</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Subbaswamy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>K</given-names> </name><etal/></person-group><article-title>The clinician and dataset shift in artificial intelligence</article-title><source>N Engl J Med</source><year>2021</year><month>07</month><day>15</day><volume>385</volume><issue>3</issue><fpage>283</fpage><lpage>286</lpage><pub-id pub-id-type="doi">10.1056/NEJMc2104626</pub-id><pub-id pub-id-type="medline">34260843</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Subbaswamy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saria</surname><given-names>S</given-names> </name></person-group><article-title>From development to deployment: dataset shift, causality, and shift-stable models in health AI</article-title><source>Biostatistics</source><year>2020</year><month>04</month><day>1</day><volume>21</volume><issue>2</issue><fpage>345</fpage><lpage>352</lpage><pub-id pub-id-type="doi">10.1093/biostatistics/kxz041</pub-id><pub-id pub-id-type="medline">31742354</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>SX</given-names> </name><name name-style="western"><surname>Caraballo</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Performance metrics for the comparative analysis of clinical risk prediction models employing machine learning</article-title><source>Circ Cardiovasc Qual Outcomes</source><year>2021</year><month>10</month><volume>14</volume><issue>10</issue><fpage>e007526</fpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.120.007526</pub-id><pub-id pub-id-type="medline">34601947</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ericksen</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Machine learning consensus scoring improves performance across targets in structure-based virtual screening</article-title><source>J Chem Inf 
Model</source><year>2017</year><month>07</month><day>24</day><volume>57</volume><issue>7</issue><fpage>1579</fpage><lpage>1590</lpage><pub-id pub-id-type="doi">10.1021/acs.jcim.7b00153</pub-id><pub-id pub-id-type="medline">28654262</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hornik</surname><given-names>K</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>D</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Decker</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lenz</surname><given-names>HJ</given-names> </name></person-group><article-title>Deriving consensus rankings from benchmarking experiments</article-title><source>Advances in Data Analysis. Studies in Classification, Data Analysis, and Knowledge Organization</source><year>2007</year><publisher-name>Springer</publisher-name><fpage>163</fpage><lpage>170</lpage><pub-id pub-id-type="doi">10.1007/978-3-540-70981-7_19</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Q</given-names> </name></person-group><article-title>An ensemble weighted average conservative multi-fidelity surrogate modeling method for engineering optimization</article-title><source>Eng 
Comput</source><year>2022</year><month>06</month><volume>38</volume><issue>3</issue><fpage>2221</fpage><lpage>2244</lpage><pub-id pub-id-type="doi">10.1007/s00366-020-01203-8</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fudulu</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Dimagli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Weekday and outcomes of elective cardiac surgery in the UK: a large retrospective database analysis</article-title><source>Eur J Cardiothorac Surg</source><year>2022</year><month>05</month><day>27</day><volume>61</volume><issue>6</issue><fpage>1381</fpage><lpage>1388</lpage><pub-id pub-id-type="doi">10.1093/ejcts/ezac038</pub-id><pub-id pub-id-type="medline">35092280</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="web"><article-title>Data access at HQIP</article-title><source>Healthcare Quality Improvement Partnership</source><access-date>2024-05-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hqip.org.uk/national-programmes/accessing-ncapop-data/#.Ys6gN-zMLdp">https://www.hqip.org.uk/national-programmes/accessing-ncapop-data/#.Ys6gN-zMLdp</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Data set split, model specification, drift analysis, and other analyses.</p><media xlink:href="xmed_v5i1e45973_app1.docx" xlink:title="DOCX File, 144155 KB"/></supplementary-material></app-group></back></article>