diff --git a/.gitignore b/.gitignore index 65c9800f4c..8c964acb73 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ revdep cache _site _web +*.log +*.aux diff --git a/_articles/RJ-2023-056/RJ-2023-056.Rmd b/_articles/RJ-2023-056/RJ-2023-056.Rmd index e51b3cdaf9..f44b1176c6 100644 --- a/_articles/RJ-2023-056/RJ-2023-056.Rmd +++ b/_articles/RJ-2023-056/RJ-2023-056.Rmd @@ -26,7 +26,7 @@ author: - Ciudad Universitaria, Bogotá - Colombia - name: L.M. Rondón - affiliation: Departamento de Estadística, Universidad Nacional + affiliation: Departamento de Estadística, Universidad Nacional de Colombia email: | lmrondonp@unal.edu.co address: diff --git a/_articles/RJ-2023-056/RJ-2023-056.tex b/_articles/RJ-2023-056/RJ-2023-056.tex index 44e071569a..a51356624c 100644 --- a/_articles/RJ-2023-056/RJ-2023-056.tex +++ b/_articles/RJ-2023-056/RJ-2023-056.tex @@ -28,7 +28,7 @@ \address{% L.M. Rondón\\ -Departamento de Estadística, Universidad Nacional\\% +Departamento de Estadística, Universidad Nacional de Colombia\\% Ciudad Universitaria, Bogotá\\ Colombia\\ % % diff --git a/_articles/RJ-2024-001/fritz.R b/_articles/RJ-2024-001/RJ-2024-001.R similarity index 81% rename from _articles/RJ-2024-001/fritz.R rename to _articles/RJ-2024-001/RJ-2024-001.R index 75de10eea5..4609de36cf 100644 --- a/_articles/RJ-2024-001/fritz.R +++ b/_articles/RJ-2024-001/RJ-2024-001.R @@ -1,5 +1,5 @@ # Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand -# Please edit fritz.Rmd to modify this file +# Please edit RJ-2024-001.Rmd to modify this file ## ----leisch------------------------------------------------------------------- #| echo: false @@ -8,8 +8,7 @@ #| fig-pos: 'h!' #| fig-show: 'hold' #| fig-cap: 'Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: BOKU.' 
-library("cowplot") -ggdraw() + draw_image("figures/img-leisch.jpg", width = 1) +knitr::include_graphics("figures/img-leisch.jpg") ## ----lmu---------------------------------------------------------------------- @@ -19,7 +18,7 @@ ggdraw() + draw_image("figures/img-leisch.jpg", width = 1) #| fig-pos: 't!' #| fig-show: 'hold' #| fig-cap: 'Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl.' -ggdraw() + draw_image("figures/img-lmu.jpg", width = 1) +knitr::include_graphics("figures/img-lmu.jpg") ## ----boku--------------------------------------------------------------------- @@ -29,7 +28,7 @@ ggdraw() + draw_image("figures/img-lmu.jpg", width = 1) #| fig-pos: 't!' #| fig-show: 'hold' #| fig-cap: 'Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU.' -ggdraw() + draw_image("figures/img-boku.jpg", width = 1) +knitr::include_graphics("figures/img-boku.jpg") ## ----cran--------------------------------------------------------------------- @@ -39,7 +38,7 @@ ggdraw() + draw_image("figures/img-boku.jpg", width = 1) #| fig-pos: 't!' #| fig-show: 'hold' #| fig-cap: 'Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive.' -ggdraw() + draw_image("figures/img-cran.png", width = 1) +knitr::include_graphics("figures/img-cran.png") ## ----dsc1999------------------------------------------------------------------ @@ -49,11 +48,11 @@ ggdraw() + draw_image("figures/img-cran.png", width = 1) #| fig-pos: 'p!' #| fig-show: 'hold' #| fig-cap: 'Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. 
Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. Source: Douglas Bates (DSC 1999 homepage).' -ggdraw() + draw_image("figures/img-dsc1999a.jpg", width = 1) +knitr::include_graphics("figures/img-dsc1999a.jpg") -ggdraw() + draw_image("figures/img-dsc1999b.jpg", width = 1) +knitr::include_graphics("figures/img-dsc1999b.jpg") -ggdraw() + draw_image("figures/img-dsc1999c.jpg", width = 1) +knitr::include_graphics("figures/img-dsc1999c.jpg") ## ----user2006----------------------------------------------------------------- @@ -63,8 +62,7 @@ ggdraw() + draw_image("figures/img-dsc1999c.jpg", width = 1) #| fig-pos: 't!' #| fig-show: 'hold' #| fig-cap: 'Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 2006 homepage).' -library("cowplot") -ggdraw() + draw_image("figures/img-user2006.jpg", width = 1) +knitr::include_graphics("figures/img-user2006.jpg") ## ----sweave------------------------------------------------------------------- @@ -74,5 +72,5 @@ ggdraw() + draw_image("figures/img-user2006.jpg", width = 1) #| fig-pos: 't!' #| fig-show: 'hold' #| fig-cap: 'Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2).' -ggdraw() + draw_image("figures/img-sweave.png", width = 1) +knitr::include_graphics("figures/img-sweave.png") diff --git a/_articles/RJ-2024-001/RJ-2024-001.Rmd b/_articles/RJ-2024-001/RJ-2024-001.Rmd index 9a1ec9a12f..c0c5ee0c38 100644 --- a/_articles/RJ-2024-001/RJ-2024-001.Rmd +++ b/_articles/RJ-2024-001/RJ-2024-001.Rmd @@ -1,141 +1,126 @@ --- title: Remembering Friedrich "Fritz" Leisch -abstract: This article remembers our friend and colleague Fritz Leisch (1968--2024) - who sadly died earlier this year. 
Many of the readers of The R Journal will know - Fritz as a member of the R Core Team and for many of his contributions to the R - community. For us, the co-authors of this article, he was an important companion - on our journey with the R project and other scientific endeavours over the years. - In the following, we provide a brief synopsis of his career, present his key contributions - to the R project and to the scientific community more generally, acknowledge his - academic service, and highlight his teaching and mentoring achievements. +date: '2025-01-11' +draft: no author: - name: Bettina Grün affiliation: WU Wirtschaftsuniversität Wien - address: - - Austria - - '*ORCiD: [0000-0001-7265-4773](https://orcid.org/0000-0001-7265-4773)*' - - | - [`Bettina.Gruen@wu.ac.at`](mailto:Bettina.Gruen@wu.ac.at) + address: Austria + orcid: 0000-0001-7265-4773 + email: Bettina.Gruen@wu.ac.at - name: Kurt Hornik affiliation: WU Wirtschaftsuniversität Wien - address: - - Austria - - '*ORCiD: [0000-0003-4198-9911](https://orcid.org/0000-0003-4198-9911)*' - - | - [`Kurt.Hornik@R-project.org`](mailto:Kurt.Hornik@R-project.org) + address: Austria + orcid: 0000-0003-4198-9911 + email: Kurt.Hornik@R-project.org - name: Torsten Hothorn affiliation: Universität Zürich - address: - - Switzerland - - '*ORCiD: [0000-0001-8301-0471](https://orcid.org/0000-0001-8301-0471)*' - - | - [`Torsten.Hothorn@R-project.org`](mailto:Torsten.Hothorn@R-project.org) + address: Switzerland + orcid: 0000-0001-8301-0471 + email: Torsten.Hothorn@R-project.org - name: Theresa Scharl affiliation: BOKU University - address: - - Austria - - '*ORCiD: [0000-0001-8850-3312](https://orcid.org/0000-0001-8850-3312)*' - - | - [`Theresa.Scharl@boku.ac.at`](mailto:Theresa.Scharl@boku.ac.at) + address: Austria + orcid: 0000-0001-8850-3312 + email: Theresa.Scharl@boku.ac.at - name: Achim Zeileis affiliation: Universität Innsbruck - address: - - Austria - - - - '*ORCiD: 
[0000-0003-0918-3766](https://orcid.org/0000-0003-0918-3766)*' - - | - [`Achim.Zeileis@R-project.org`](mailto:Achim.Zeileis@R-project.org) -date: '2024-12-11' + address: Austria + url: https://www.zeileis.org/ + orcid: 0000-0003-0918-3766 + email: Achim.Zeileis@R-project.org +abstract: | + This article remembers our friend and colleague Fritz Leisch (1968--2024) who sadly died earlier this year. Many of the readers of The R Journal will know Fritz as a member of the R Core Team and for many of his contributions to the R community. For us, the co-authors of this article, he was an important companion on our journey with the R project and other scientific endeavours over the years. In the following, we provide a brief synopsis of his career, present his key contributions to the R project and to the scientific community more generally, acknowledge his academic service, and highlight his teaching and mentoring achievements. +preamble: | + %\newcommand{\doi}[1]{\href{https://doi.org/#1}{\normalfont\texttt{doi:\discretionary{}{}{}{#1}}}} +bibliography: fritz.bib +output: + rjtools::rjournal_article: + self_contained: yes date_received: '2024-10-01' -journal: - firstpage: ~ - lastpage: ~ volume: 16 issue: 1 slug: RJ-2024-001 -packages: - cran: ~ - bioc: ~ -draft: no -preview: preview.png -bibliography: ~ -CTV: ~ -output: - rjtools::rjournal_web_article: - self_contained: no - toc: no - legacy_pdf: yes +journal: + lastpage: 14 + firstpage: 5 --- + # Career -Friedrich Leisch (see Figure [1](#fig:leisch){reference-type="ref" -reference="fig:leisch"}) was born 1968 in Vienna (Austria) and died -after serious illness in 2024 in Vienna. Everyone called him Fritz. - -![Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: -BOKU.](fritz_files/figure-latex/leisch-1.pdf){#fig:leisch -width="0.55\\linewidth"} - -Starting in 1987, Fritz studied Applied Mathematics at Technische -Universität Wien (TU Wien), earning his master's degree (Dipl.-Ing.) in -1993. 
Subsequently, he joined the Department of Statistics and -Probability Theory at TU Wien as an assistant professor which he -continued to be, with short intermissions, until 2006. During this time -he also defended his doctoral thesis in Applied Mathematics (Dr.techn.) -in 1999 and earned his habilitation (venia docendi) in Statistics in -2005. - -In 1995, he visited the Knowledge-Based Engineering Systems Group at the -University of South-Australia in Adelaide on a Kurt Gödel scholarship -for postgraduate studies. From 1997 to 2004 he was a member of the SFB -project "Adaptive Information Systems and Modeling in Economics and -Management Science", coordinated at Wirtschaftsuniversität Wien (WU -Wien). From 2002 to 2003 he was assistant professor at the Department of -Statistics and Decision Support Systems, Universität Wien. - -In 2006 Fritz moved to Munich, Germany, to become a professor for -computational statistics at the Department of Statistics, -Ludwig-Maximilians-Universität München (LMU), see -Figure [2](#fig:lmu){reference-type="ref" reference="fig:lmu"}. He -returned to Vienna in 2011 to join the BOKU University as head of the -Institute of Statistics, see Figure [3](#fig:boku){reference-type="ref" -reference="fig:boku"}. - -![Computational statistics group at LMU in 2007 (left to right): -Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: -Carolin Strobl.](fritz_files/figure-latex/lmu-1.pdf){#fig:lmu -width="0.83\\linewidth"} - -![Institute of Statistics at BOKU in 2022 (left to right, back to -front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz -Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, -Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: -BOKU.](fritz_files/figure-latex/boku-1.pdf){#fig:boku -width="0.83\\linewidth"} +Friedrich Leisch (see Figure \@ref(fig:leisch)) was born 1968 in Vienna (Austria) and +died after serious illness in 2024 in Vienna. Everyone called him Fritz. 
+ +```{r leisch} +#| echo: false +#| out-width: '55%' +#| fig-align: 'center' +#| fig-pos: 'h!' +#| fig-show: 'hold' +#| fig-cap: 'Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: BOKU.' +knitr::include_graphics("figures/img-leisch.jpg") +``` + +Starting in 1987, Fritz studied Applied Mathematics at Technische Universität Wien (TU Wien), +earning his master's degree (Dipl.-Ing.) in 1993. Subsequently, he joined the +Department of Statistics and Probability Theory at TU Wien as an +assistant professor which he continued to be, with short intermissions, until 2006. +During this time he also defended his doctoral thesis in Applied Mathematics (Dr.techn.) +in 1999 and earned his habilitation (venia docendi) in Statistics in 2005. + +In 1995, he visited the Knowledge-Based Engineering Systems Group at the University of +South-Australia in Adelaide on a Kurt Gödel scholarship for postgraduate +studies. From 1997 to 2004 he was a member of the SFB project +"Adaptive Information Systems and Modeling in Economics and Management Science", coordinated +at Wirtschaftsuniversität Wien (WU Wien). From 2002 to 2003 he was assistant professor +at the Department of Statistics and Decision Support Systems, Universität Wien. + +In 2006 Fritz moved to Munich, Germany, to become a professor for computational +statistics at the Department of Statistics, Ludwig-Maximilians-Universität München (LMU), see Figure \@ref(fig:lmu). +He returned to Vienna in 2011 to join the BOKU University as head of the Institute of Statistics, see Figure \@ref(fig:boku). + +```{r lmu} +#| echo: false +#| out-width: '83%' +#| fig-align: 'center' +#| fig-pos: 't!' +#| fig-show: 'hold' +#| fig-cap: 'Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl.' +knitr::include_graphics("figures/img-lmu.jpg") +``` + +```{r boku} +#| echo: false +#| out-width: '83%' +#| fig-align: 'center' +#| fig-pos: 't!' 
+#| fig-show: 'hold' +#| fig-cap: 'Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU.' +knitr::include_graphics("figures/img-boku.jpg") +``` + # Key contributions Fritz' scientific contributions span an impressive range including -theoretical and methodological work (especially in the field of -clustering and finite mixture models) over software (mostly related to -the R programming language) to applied work and cooperations (notably in +theoretical and methodological work (especially in the field of clustering +and finite mixture models) over software (mostly related to the R +programming language) to applied work and cooperations (notably in marketing, biotechnology, and genomics, among many others). In the following sections we try to highlight his key contributions and scientific legacy. -::: {#r-core-cran} ## R Core & CRAN -::: During his stay in Australia, Fritz had learned about the existence of -R. Back in Austria, he and Kurt started to explore this potentially good -news more systematically. They soon stopped further work on a statistics -toolbox they had developed for Octave (Eaton et al. 2024), and switched -to R for their applied work, finding lots of room for further -improvement, and thus sending polite emails with patches and more -suggestions to Ross Ihaka and Robert Gentleman. Clearly these were +R. Back in Austria, he and Kurt started to explore this potentially +good news more systematically. They soon stopped further work on a +statistics toolbox they had developed for Octave [@Eaton+Bateman+Hauberg:2024], +and switched to R for their applied work, finding lots of room for +further improvement, and thus sending polite emails with patches and +more suggestions to Ross Ihaka and Robert Gentleman. 
Clearly these were acceptable in quality but too high in quantity, and it did not take very long that Ross and Robert gave Fritz and Kurt write access to the R sources (initially in CVS, then moved to SVN), and in 1997, they both @@ -143,183 +128,192 @@ officially became very early members of the R Core Team. One of the main challenges then was that the functionality provided by R was rather limited. Contributed extensions for S were available from the -Carnegie Mellon University Statlib S Archive[^1], and could typically be +Carnegie Mellon University Statlib S Archive^[Unfortunately, the Statlib +S Archive is currently not available anymore. A snapshot, including many +of the actual source code files, is available on the Internet Archive at +.], and could typically be ported to R rather easily, but there was no mechanism for conveniently distributing or actually using these extensions. This fundamentally changed, when in 1997 Fritz and Kurt implemented the R package management system, using ideas from Debian's APT (advanced package tool, ) they had successfully employed for managing their computer systems. They also set up the Comprehensive R -Archive Network (CRAN, , see also Hornik -2012) as a means for redistributing R and its contributed extensions, -and infrastructure for quality assurance of these extensions. These two -contributions paved the way for the amazing growth and success of R -through its wealth of high-quality contributed extensions. See - for the -first announcement of CRAN, starting with 12 extension packages. -Currently, there are more than 21,000. See -Figure [4](#fig:cran){reference-type="ref" reference="fig:cran"} for a -screenshot[^2] of the landing page of the CRAN master site at TU Wien, -as last modified by Fritz on 1997-12-09. - -![Screenshot of the landing page of the CRAN master site at TU Wien on -1998-01-10, as last modified by Fritz on 1997-12-09. 
Source: Internet -Archive.](fritz_files/figure-latex/cran-1.pdf){#fig:cran -width="1\\linewidth"} +Archive Network [CRAN, , see also +@Hornik:2012] as a means for redistributing R and its contributed +extensions, and infrastructure for quality assurance of these +extensions. These two contributions paved the way for the amazing +growth and success of R through its wealth of high-quality contributed +extensions. +See for +the first announcement of CRAN, starting with 12 extension packages. +Currently, there are more than 21,000. See Figure \@ref(fig:cran) +for a screenshot^[This is from the earliest capture, from 1998-01-10, +available on the Internet Archive at +.] +of the landing page of the CRAN master site at TU Wien, as last modified +by Fritz on 1997-12-09. + +```{r cran} +#| echo: false +#| out-width: '100%' +#| fig-align: 'center' +#| fig-pos: 't!' +#| fig-show: 'hold' +#| fig-cap: 'Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive.' +knitr::include_graphics("figures/img-cran.png") +``` The first SVN commit by Fritz is from 1997-10-02, the last from 2013-10-04. Overall, there are 651 commits by Fritz, mostly from the early years of R Core, and related to the R package management and CRAN -mirror system, and the addition of the `Sweave` system (see -Section [2.3](#sec:sweave-reproducibility) for more details). +mirror system, and the addition of the `Sweave` system +(see Section [2.3](#sec:sweave-reproducibility) for more details). -::: {#dsc-user-conferences} ## DSC & useR! conferences -::: - -With establishing CRAN in Vienna at TU Wien, Fritz and Kurt laid the -foundation for a special relationship between Vienna and R that they -characterized as a story of "love and marriage" (Hornik and Leisch -2002). 
In the decade after the creation of CRAN a number of seminal -R-related meetings took place in Vienna, co-organized by Fritz as well -as several of the co-authors of this paper. - -The first workshop on "Distributed Statistical Computing" (DSC) took -place from March 19-23, 1999, at TU Wien. The main motivations were -bringing together the R Core Team for its first face-to-face meeting, -discussing the roadmap for the release of R 1.0.0, as well as exploring -potential synergies with other environments for statistical computing. -There were around 30 participants and about 20 presentations, many of -which were relatively short, leaving ample time for discussions (see -Figure [5](#fig:dsc1999){reference-type="ref" reference="fig:dsc1999"}). - -
-

-
Discussions at DSC 1999 (top to bottom, left to right): -Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, -Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt -Hornik. Source: Douglas Bates (DSC 1999 homepage).
-
- -Two more DSC workshops were organized at TU Wien in 2001 and 2003. While -meetings focusing on R development issues (with the R Core Team and -everyone else interested) were still an important part of these -conferences, they also saw an increasing number of regular conference + +With establishing CRAN in Vienna at TU Wien, Fritz and Kurt +laid the foundation for a special relationship between Vienna and R that they +characterized as a story of "love and marriage" [@Hornik+Leisch:2002]. In the decade +after the creation of CRAN a number of seminal R-related meetings took place in Vienna, +co-organized by Fritz as well as several of the co-authors of this paper. + +The first workshop on "Distributed Statistical Computing" (DSC) took place from +March 19-23, 1999, at TU Wien. The main motivations were bringing together the R Core Team +for its first face-to-face meeting, discussing the roadmap for the release of R 1.0.0, +as well as exploring potential synergies with other environments for statistical computing. +There were around 30 participants and about 20 presentations, many of which were +relatively short, leaving ample time for discussions (see Figure \@ref(fig:dsc1999)). + +```{r dsc1999} +#| echo: false +#| out-width: '83%' +#| fig-align: 'center' +#| fig-pos: 'p!' +#| fig-show: 'hold' +#| fig-cap: 'Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. Source: Douglas Bates (DSC 1999 homepage).' +knitr::include_graphics("figures/img-dsc1999a.jpg") + +knitr::include_graphics("figures/img-dsc1999b.jpg") + +knitr::include_graphics("figures/img-dsc1999c.jpg") +``` + +Two more DSC workshops were organized at TU Wien +in 2001 and 2003. 
While meetings focusing on R development issues (with the +R Core Team and everyone else interested) were still an important part of +these conferences, they also saw an increasing number of regular conference presentations on R packages and their different fields of application (e.g., establishing infrastructure for spatial data). In 2001 there were -around 60 participants and about 30 presentations, most with -corresponding papers in the online proceedings (Hornik and Leisch 2001). -In 2003 this increased to more than 150 participants and about 60 -presentations, again with the majority in the online proceedings -(Hornik, Leisch, and Zeileis 2003). - -The high demand for a platform, where R users from different fields -could exchange ideas, prompted the creation of a new conference series -called useR!. The first two installments again took place in Vienna in -2004 at TU Wien and in 2006 at WU Wien. Torsten Hothorn, David Meyer, -and Achim Zeileis took the lead in the organization with support and -advice from Fritz and Kurt in the background. An important contribution -from the R Core Team at the useR! conferences were keynote lectures -highlighting important developments, e.g., a keynote given by Fritz at -useR! 2004 on S4 classes and methods. Both conferences continued the -success of the earlier DSC workshops with the number of participants -rising to more than 200 in 2004 and close to 350 in 2006. Similarly, the -number of presentations grew to about 100 in 2004 and more than 150 in -2006. - -In addition to the efforts initiated by Fritz and Kurt, another key -factor to the success of these meetings was the city of Vienna with its -culture, cafes, wine and beer pubs, etc. (see Hornik and Leisch 2002 and -also Figure [6](#fig:user2006){reference-type="ref" -reference="fig:user2006"}). - -![Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten -Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 
2006 -homepage).](fritz_files/figure-latex/user2006-1.pdf){#fig:user2006 -width="0.83\\linewidth"} - -::: {#sweave-reproducibility} +around 60 participants and about 30 presentations, most with corresponding +papers in the online proceedings [@Hornik+Leisch:2001]. In 2003 this +increased to more than 150 participants and about 60 presentations, again +with the majority in the online proceedings [@Hornik+Leisch+Zeileis:2003]. + +The high demand for a platform, where R users from different fields could +exchange ideas, prompted the creation of a new conference series called +useR!. The first two installments again took place in Vienna in 2004 +at TU Wien and in 2006 at WU Wien. +Torsten Hothorn, David Meyer, and Achim Zeileis took the lead in the +organization with support and advice from Fritz and Kurt in the background. +An important contribution from the R Core Team at the useR! conferences +were keynote lectures highlighting important developments, e.g., a keynote +given by Fritz at useR! 2004 on S4 classes and methods. Both conferences +continued the success of the earlier DSC workshops with the number of +participants rising to more than 200 in 2004 and close to 350 in 2006. +Similarly, the number of presentations grew to about 100 in 2004 and more +than 150 in 2006. + +In addition to the efforts initiated by Fritz and Kurt, another key factor +to the success of these meetings was the city of Vienna with its culture, +cafes, wine and beer pubs, etc. [see @Hornik+Leisch:2002 and also +Figure \@ref(fig:user2006)]. + +```{r user2006} +#| echo: false +#| out-width: '83%' +#| fig-align: 'center' +#| fig-pos: 't!' +#| fig-show: 'hold' +#| fig-cap: 'Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 2006 homepage).' 
+knitr::include_graphics("figures/img-user2006.jpg") +``` + + ## Sweave & reproducibility -::: - -With `Sweave` (Leisch 2002), Fritz pioneered what we now can understand -as the technical foundation of reproducible research. `Sweave` was the -main inspiration for [knitr](https://CRAN.R-project.org/package=knitr) -(Xie 2015) which in turn led to -[rmarkdown](https://CRAN.R-project.org/package=rmarkdown) (Xie, Allaire, -and Grolemund 2018) and -[quarto](https://CRAN.R-project.org/package=quarto) (Scheidegger et al. -2024). All these systems are used today to generate countless scientific -articles, package vignettes, webpages, books, blogs, and much more in a -dynamic and reproducible way. - -Of course, Fritz was not the first one going in this direction. The -concept of "literate programming" had been introduced by Knuth (1984), -allowing to combine the source code for software and the corresponding -documentation in the same file. The concepts of "tangling", that is, -extracting the code for compilation, and "weaving", the process of -generating a nicely looking document containing code next to prosa and -formulae, have their roots in the `WEB` and `CWEB` systems (Knuth and -Levy 1993). As these packages were specific to code in Pascal (`WEB`) -and C (`CWEB`), respectively, and documentation in LaTeX, Ramsey (1994) -introduced his `noweb` system as a literate programming tool that is -agnostic to the programming language used and also supports HTML in -addition to LaTeX and a few other backends for documentation. The -`noweb` syntax for code chunks is: - - <>= - 1 + 2 - @ - -This will look familiar to users of `Sweave`. From this history, the -naming decisions for the software and its file format can be understood: -`Sweave` is the function that weaves code in S (or R - both languages -still existed side by side at the time) with its output and -documentation. And `Rnw` stands for files mixing R code with `noweb` -syntax. 
- -Starting in the mid-1990s to the early 2000s, interests shifted from -just "literate programming" to "literate data analysis" (Leisch 2002; -Leisch and Rossini 2003) as a core ingredient for reproducible research -(Buckheit and Donoho 1995). The seminal new idea was to have dynamic -documents so *outputs* of code such as figures and tables could be -updated automatically when the underlying data changed, which was -pioneered by the late Günter Sawitzki in his `Voyager` system (Sawitzki -1996). - -Fritz amalgamated all of this into `Sweave` which was the first time -that the power of dynamic reporting became easily available in a -widely-used programming language for statistics in combination with the -standard textprocessing system LaTeX. This turned out to be a "killer -feature" of R at the time and the basis for further work towards -reproducible research (Hothorn and Leisch 2011; Stodden, Leisch, and -Peng 2014). - -`Sweave` was also the basis for R package vignettes (Leisch 2003) as an -addition to the previously available technical manual pages. The first R -package vignette published on CRAN in May 2002 was in the -[strucchange](https://CRAN.R-project.org/package=strucchange) package, -providing methods for testing, monitoring, and dating structural -changes. The vignette was the `Sweave` adaptation of an introduction to -the package that had been co-authored by Fritz and published a couple of -months earlier in the *Journal of Statistical Software* (Zeileis et al. -2002). See Figure [7](#fig:sweave){reference-type="ref" -reference="fig:sweave"} for how Fritz used it to illustrate the idea of -package vignettes in Leisch (2003) and that the R code from vignettes -can be easily extracted (also interactively), explored, and re-run. - -![Screenshot of the strucchange package vignette, shown in a PDF viewer -(right), along with the vExplorer from Bioconductor for interactive code -execution (top left) with output in the active R graphics window (bottom -left). 
Source: Leisch (2003, Figure -2).](fritz_files/figure-latex/sweave-1.pdf){#fig:sweave -width="1\\linewidth"} - -::: {#clustering-mixture-models} + +With `Sweave` [@Leisch:2002], Fritz pioneered what we now can understand as +the technical foundation of reproducible research. `Sweave` was the main +inspiration for \CRANpkg{knitr} [@Xie:2015] which in turn led to +\CRANpkg{rmarkdown} [@Xie+Allaire+Grolemund:2018] and \CRANpkg{quarto} +[@Scheidegger+Teague+Dervieux:2024]. All these systems are used today to +generate countless scientific articles, package vignettes, webpages, books, blogs, +and much more in a dynamic and reproducible way. + +Of course, Fritz was not the first one going in this direction. The concept +of "literate programming" had been introduced by @Knuth:1984, allowing to +combine the source code for software and the corresponding documentation +in the same file. The concepts of "tangling", that is, extracting the code +for compilation, and "weaving", the process of generating a nicely looking +document containing code next to prosa and formulae, have their roots in the +`WEB` and `CWEB` systems [@Knuth+Levy:1993]. As these packages were specific +to code in Pascal (`WEB`) and C (`CWEB`), respectively, and documentation in +LaTeX, @Ramsey:1994 introduced his `noweb` system as a literate programming +tool that is agnostic to the programming language used and also supports HTML +in addition to LaTeX and a few other backends for documentation. The `noweb` +syntax for code chunks is: + +`r ifelse(knitr::is_latex_output(), "\\pagebreak", "")` + +``` +<>= +1 + 2 +@ +``` + +This will look familiar to users of `Sweave`. From this history, the naming +decisions for the software and its file format can be understood: `Sweave` +is the function that weaves code in S (or R - both languages still existed +side by side at the time) with its output and documentation. And `Rnw` stands for files +mixing R code with `noweb` syntax. 
+ +Starting in the mid-1990s to the early 2000s, interests shifted from just +"literate programming" to "literate data analysis" [@Leisch:2002; @Leisch+Rossini:2003] +as a core ingredient for reproducible research [@Buckheit+Donoho:1995]. +The seminal new idea was to have dynamic documents so _outputs_ of code +such as figures and tables could be updated automatically when the underlying +data changed, which was pioneered by the late Günter Sawitzki in his +`Voyager` system [@Sawitzki:1996]. + +Fritz amalgamated all of this into `Sweave` which was the first time that the +power of dynamic reporting became easily available in a widely-used programming +language for statistics in combination with the standard textprocessing system +LaTeX. This turned out to be a "killer feature" of R at the time and the basis +for further work towards reproducible research [@Hothorn+Leisch_2011; @Stodden:2014]. + +`Sweave` was also the basis for R package vignettes [@Leisch:2003] as an +addition to the previously available technical manual pages. +The first R package vignette published on CRAN in May 2002 was in the +\CRANpkg{strucchange} package, providing methods for testing, monitoring, +and dating structural changes. The vignette was the `Sweave` adaptation +of an introduction to the package that had been co-authored by Fritz and +published a couple of months earlier in the _Journal of Statistical Software_ +[@Zeileis+Leisch+Hornik:2002]. See Figure \@ref(fig:sweave) for how +Fritz used it to illustrate the idea of package vignettes in @Leisch:2003 +and that the R code from vignettes can be easily extracted (also interactively), +explored, and re-run. + +```{r sweave} +#| echo: false +#| out-width: '100%' +#| fig-align: 'center' +#| fig-pos: 't!' 
+#| fig-show: 'hold' +#| fig-cap: 'Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2).' +knitr::include_graphics("figures/img-sweave.png") +``` + + ## Clustering & mixture models -::: Fritz' theoretical and methodological work focused in particular on clustering and finite mixture models. Centroid-based partitioning @@ -328,272 +322,106 @@ algorithm is embedded in a common estimation framework. In this framework, each of the steps is adapted in a modular way depending on the specific setup, e.g., the distance and centroid determining method or the component distribution used. Fritz exploited this for the -implementation of the packages -[flexclust](https://CRAN.R-project.org/package=flexclust) (Leisch 2006) -and [flexmix](https://CRAN.R-project.org/package=flexmix) (Leisch 2004; -Grün and Leisch 2008), contributing to the clustering tools available -for R (see the CRAN Task View -[*Cluster*](https://CRAN.R-project.org/view=Cluster)). Both packages -provide general infrastructure for (model-based) clustering and enable -rapid prototyping and the simple extension to new variants taking into -account complicated data structures or challenging model specifications -(see, for example, psychomix, Frick et al. 2012). - -::: {#applied-work} +implementation of the packages \CRANpkg{flexclust} [@Leisch:2006] and +\CRANpkg{flexmix} [@Leisch:2004; @Gruen+Leisch:2008], contributing to +the clustering tools available for R (see the CRAN Task View +\ctv{Cluster}). Both packages provide general infrastructure for +(model-based) clustering and enable rapid prototyping and the simple +extension to new variants taking into account complicated data +structures or challenging model specifications [see, for example, +\pkg{psychomix}, @Frick+Strobl+Leisch:2012]. 
+ + ## Applied work -::: For many years, Fritz and Kurt actively participated in the Biological Psychiatry working group at Medizinische Universität Wien. The first -paper co-authored by Fritz dates from 2000 (Bailer et al. 2000), the -last from 2023 (Solmi et al. 2023). The joint research was mostly -focused on linking genetic traits to psychiatric disorders and treatment -success. This prompted many enhancements in the classical test +paper co-authored by Fritz dates from 2000 +[@Bailer+Leisch+Meszaros:2000], the last from 2023 +[@Solmi+Thompson:2023]. The joint research was mostly +focused on linking genetic traits to psychiatric disorders and +treatment success. This prompted many enhancements in the classical test infrastructure in base R - in surprising ways to some reviewers, who could not believe that Fisher's test really worked for tables with more than two rows or columns. It also established a strong need for conveniently reporting the results of the statistical analyses to the medical doctors in the group that went beyond providing annotated transcripts, which Fritz eventually managed to satisfy by inventing the -`Sweave` system (see Section [2.3](#sec:sweave-reproducibility)). +`Sweave` system (see Section [2.3](#sec:sweave-reproducibility)). Fritz also intensively collaborated with Sara Dolnicar to advance data analytic methods for data-driven market segmentation analysis. They received the Charles R. Goeldner Article of Excellence Award for their -work on extracting stable Winter tourist segments in Austria with bagged -clustering (Dolnicar and Leisch 2003). They focused on the evaluation of -data structure and the selection of suitable segments based on segment -stability as a key criterion (Dolnicar and Leisch 2010, 2017). 
Finally, -this joint work resulted in Dolnicar, Grün, and Leisch (2018) which -provides practical guidance for users of market segmentation solutions -and for data analysts with respect to the technical and statistical -aspects of market segmentation analysis. - -As head of the Institute of Statistics, Fritz was involved in various -interdisciplinary research projects covering almost the whole range of -core areas of research at BOKU. He was key researcher at the Austrian -Centre of Industrial Biotechnology (acib) (Scharl, Voglhuber, and Leisch -2009; Melcher et al. 2017) and faculty member of the doctoral schools on -agricultural genomics and bioprocess engineering. Among others he -contributed to the fields of zoology (Cech et al. 2022), forestry, -transportation and tourism (Taczanowska et al. 2023) as well as -chemistry, genomics and wildlife biology (Steiner, Leisch, and -Hackländer 2014). +work on extracting stable Winter tourist segments in Austria with +bagged clustering [@Dolnicar+Leisch:2003]. They focused on the +evaluation of data structure and the selection of suitable segments +based on segment stability as a key criterion [@Dolnicar+Leisch:2010; +@Dolnicar+Leisch:2017]. Finally, this joint work resulted in +@Dolnicar+Gruen+Leisch:2018 which provides practical guidance for +users of market segmentation solutions and for data analysts with +respect to the technical and statistical aspects of market +segmentation analysis. + +As head of the Institute of Statistics, Fritz was involved +in various interdisciplinary research projects covering almost the whole +range of core areas of research at BOKU. He was key researcher at the +Austrian Centre of Industrial Biotechnology (acib) +[@Scharl+Voglhuber+Leisch:2009; @Melcher+Scharl+Leisch:2017] and +faculty member of the doctoral schools on agricultural genomics and +bioprocess engineering. 
Among others he contributed to the fields of +zoology [@Cech:2022], forestry, transportation and tourism +[@Taczanowska:2023] as well as chemistry, genomics and wildlife +biology [@Steiner:2014]. + # Academic service In addition to the services for the various conferences and proceedings -already described above, he served the scientific community in various -ways. In January 2001, he co-created *R News* which evolved into *The R -Journal* eight years later. For the journal *Computational Statistics* -he was an associate editor from 2005 to 2006 before he became -editor-in-chief from 2007 to 2011 (see Symanzik, Mori, and Vieu 2024 for -more details). Other notable contributions include being editor for the -*Journal of Statistical Software*, core member of the *Bioconductor* -project for statistical software in bioinformatics, and first secretary -general of the *R Foundation for Statistical Computing* when it was -formed in 2002. +already described above, he served the scientific community in various ways. +In January 2001, he co-created _R News_ which evolved into +_The R Journal_ eight years later. For the journal _Computational Statistics_ +he was an associate editor from 2005 to 2006 before he became editor-in-chief +from 2007 to 2011 [see @Symanzik+Mori+Vieu:2024 for more details]. +Other notable contributions include being +editor for the _Journal of Statistical Software_, core member of the +_Bioconductor_ project for statistical software in bioinformatics, and +first secretary general of the _R Foundation for Statistical Computing_ when +it was formed in 2002. + + # Teaching & mentoring -Fritz taught generations of students at bachelor, master, and PhD level -and introduced hundreds of useRs to proper R development in his -"Introduction to R Programming" short course. At TU Wien, LMU, and BOKU, -he taught courses in applied statistics, statistical computing and -computational statistics. 
He had the ability to explain even difficult -content in a simple way and to inspire students with statistics and -programming with R. He co-founded the "Munich R Courses" lecture series -and was part of a group aiming to initiate a formal PhD program in -statistics at LMU. - -Fritz supervised Bettina Grün, Theresa Scharl, Sebastian Kaiser, Manuel -Eugster, Christina Yassouridis, Rainer Dangl, Weksi Budiaji, Muhammad -Atif and Simona Jokubauskaite as his PhD students. Based on his -research, Fritz often discussed the state of and the need for -reproducible research and taught his many students how to avoid the many -small and innocent errors that have a tendency to pile up and invalidate -reported statistical results, with potentially devastating consequences, -as we all know. +Fritz taught generations of students at bachelor, master, and PhD level and +introduced hundreds of useRs to proper R development in his "Introduction to +R Programming" short course. At TU Wien, LMU, and BOKU, he taught courses in applied +statistics, statistical computing and computational statistics. He had the +ability to explain even difficult content in a simple way and to inspire students +with statistics and programming with R. He +co-founded the "Munich R Courses" lecture series and was part of a group +aiming to initiate a formal PhD program in statistics at LMU. + +Fritz supervised +Bettina Grün, Theresa Scharl, +Sebastian Kaiser, Manuel Eugster, +Christina Yassouridis, Rainer Dangl, +Weksi Budiaji, Muhammad Atif and +Simona Jokubauskaite as his PhD students. +Based on his research, Fritz often discussed the state of and the need for reproducible +research and taught his many students how to avoid the many small and +innocent errors that have a tendency to pile up and invalidate reported +statistical results, with potentially devastating consequences, as we all know. # Odds & ends Fritz loved cooking, music, motorbike riding, playing cards with his -friends, skiing and hiking. 
A late afternoon call to his office asking -him to go along for a beer in Munich's English Garden almost never went -unanswered, positively. Back in Vienna at BOKU, colleagues got to know -Fritz as a very structured, thoughtful, calm person who involved -everyone, listened to everyone and always endeavored to balance -interests and ensure fairness. He strengthened cooperation and cohesion -with his leadership style. Fritz was a friendly, always modest person -who was free of airs and graces or vanity, despite or perhaps because of -his great scientific successes. The R Core Team and the R community at -large miss a contributor, collaborator, teacher, colleague, and friend. - -# References {#references .unnumbered} - -::: {#refs} -::: - -Bailer, Ursula, Friedrich Leisch, Kurt Meszaros, Elisabeth Lenzinger, -Ulrike Willinger, Rainer Strobl, Christian Gebhardt, et al. 2000. -"Genome Scan for Susceptibility Loci for Schizophrenia." -*Neuropsychobiology* 42 (4): 175--82. -. - -Buckheit, Jonathan B., and David L. Donoho. 1995. "WaveLab and -Reproducible Research." In *Wavelets in Statistics*, edited by A. -Antoniadis and G. Oppenheim, 55--82. Lecture Notes in Statistics. New -York: Springer-Verlag. . - -Cech, Ramona M, Suzanne Jovanovic, Susan Kegley, Koen Hertoge, Friedrich -Leisch, and Johann G Zaller. 2022. "Reducing Overall Herbicide Use May -Reduce Risks to Humans but Increase Toxic Loads to Honeybees, Earthworms -and Birds." *Environmental Sciences Europe* 34 (1): 44. -. - -Dolnicar, Sara, Bettina Grün, and Friedrich Leisch. 2018. *Market -Segmentation Analysis: Understanding It, Doing It, and Making It -Useful*. Management for Professionals. Springer-Verlag. -. - -Dolnicar, Sara, and Friedrich Leisch. 2003. "Winter Tourist Segments in -Austria: Identifying Stable Vacation Styles Using Bagged Clustering -Techniques." *Journal of Travel Research* 41 (3): 281--92. -. - ----------. 2010. 
"Evaluation of Structure and Reproducibility of Cluster -Solutions Using the Bootstrap." *Marketing Letters* 21 (1): 83--101. -. - ----------. 2017. "Using Segment Level Stability to Select Target -Segments in Data-Driven Market Segmentation Studies." *Marketing -Letters* 28 (3): 423--36. . - -Eaton, John W., David Bateman, Søren Hauberg, and Rik Wehbring. 2024. -*GNU Octave Version 9.2.0 Manual: A High-Level Interactive Language for -Numerical Computations*. -. - -Frick, Hannah, Carolin Strobl, Friedrich Leisch, and Achim Zeileis. -2012. "Flexible Rasch Mixture Models with Package psychomix." *Journal -of Statistical Software* 48 (7): 1--25. -. - -Grün, Bettina, and Friedrich Leisch. 2008. "FlexMix Version 2: Finite -Mixtures with Concomitant Variables and Varying and Constant -Parameters." *Journal of Statistical Software* 28 (4): 1--35. -. - -Hornik, Kurt. 2012. "The Comprehensive R Archive Network." *Wiley -Interdisciplinary Reviews: Computational Statistics* 4 (4): 394--98. -. - -Hornik, Kurt, and Friedrich Leisch, eds. 2001. *Proceedings of the 2nd -International Workshop on Distributed Statistical Computing, Vienna, -Austria*. . - ----------. 2002. "Vienna and R: Love, Marriage and the Future." In -*Festschrift 50 Jahre Österreichische Statistische Gesellschaft*, edited -by Rudolf Dutter, 61--70. Österreichische Statistische Gesellschaft. - -Hornik, Kurt, Friedrich Leisch, and Achim Zeileis, eds. 2003. -*Proceedings of the 3rd International Workshop on Distributed -Statistical Computing, Vienna, Austria*. -. - -Hothorn, Torsten, and Friedrich Leisch. 2011. "Case Studies in -Reproducibility." *Briefings in Bioinformatics* 12 (3): 288--300. -. - -Knuth, Donald E. 1984. "Literate Programming." *The Computer Journal* 27 -(2): 97--111. . - -Knuth, Donald E., and Silvio Levy. 1993. *The CWEB System of Structured -Documentation*. Reading: Addison-Wesley. - -Leisch, Friedrich. 2002. "Sweave: Dynamic Generation of Statistical -Reports Using Literate Data Analysis." 
In *COMPSTAT 2002 -- Proceedings -in Computational Statistics*, edited by Wolfgang Härdle and Bernd Rönz, -575--80. Heidelberg: Physica Verlag. -. - ----------. 2003. "Sweave, Part II: Package Vignettes." *R News* 3 (2): -21--24. . - ----------. 2004. "FlexMix: A General Framework for Finite Mixture Models -and Latent Class Regression in R." *Journal of Statistical Software* 11 -(8): 1--18. . - ----------. 2006. "A Toolbox for k-Centroids Cluster Analysis." -*Computational Statistics and Data Analysis* 51 (2): 526--44. -. - -Leisch, Friedrich, and Anthony J. Rossini. 2003. "Reproducible -Statistical Research." *Chance* 16 (2): 46--50. -. - -Melcher, Michael, Theresa Scharl, Markus Luchner, Gerald Striedner, and -Friedrich Leisch. 2017. "Boosted Structured Additive Regression for -Escherichia Coli Fed-Batch Fermentation Modeling." *Biotechnology and -Bioengineering* 114 (2): 321--34. . - -Ramsey, Norman. 1994. "Literate Programming Simplified." *IEEE Software* -11 (5): 97--105. . - -Sawitzki, Günther. 1996. "Extensible Statistical Software: On a Voyage -to Oberon." *Journal of Computational and Graphical Statistics* 5 (3): -263--83. . - -Scharl, Theresa, Ingo Voglhuber, and Friedrich Leisch. 2009. -"Exploratory and Inferential Analysis of Gene Cluster Neighborhood -Graphs." *BMC Bioinformatics* 10 (1): 288. -. - -Scheidegger, Carlos, Charles Teague, Christophe Dervieux, J. J. Allaire, -and Yihui Xie. 2024. "Quarto: An Open-Source Scientific and Technical -Publishing System." . - -Solmi, Marco, Trevor Thompson, Andrés Estradé, Agorastos Agorastos, -Joaquim Radua, Samuele Cortese, Elena Dragioti, et al. 2023. "Validation -of the Collaborative Outcomes Study on Health and Functioning During -Infection Times (COH-FIT) Questionnaire for Adults." *Journal of -Affective Disorders* 326: 249--61. -. - -Steiner, Wolfgang, Friedrich Leisch, and Klaus Hackländer. 2014. 
"A -Review on the Temporal Pattern of Deer-Vehicle Accidents: Impact of -Seasonal, Diurnal and Lunar Effects in Cervids." *Accident Analysis & -Prevention* 66: 168--81. . - -Stodden, Victoria, Friedrich Leisch, and Roger D. Peng. 2014. -*Implementing Reproducible Research*. Boca Raton: Chapman & Hall/CRC. - -Symanzik, Jürgen, Yuichi Mori, and Philippe Vieu. 2024. "A Memorial for -the Late Professor Friedrich Leisch." *Computational Statistics* 39. - -Taczanowska, Karolina, Barbara Latosinska, Christiane Brandenburg, -Friedrich Leisch, Christina Czachs, and Andreas Muhar. 2023. "Lobbying -in Social Media as a New Source of Survey Bias." *Journal of Outdoor -Recreation and Tourism* 44 (A): 100689. -. - -Xie, Yihui. 2015. *Dynamic Documents with R and knitr*. 2nd ed. Boca -Raton: Chapman & Hall/CRC. . - -Xie, Yihui, J. J. Allaire, and Garrett Grolemund. 2018. *R Markdown: The -Definitive Guide*. Boca Raton: Chapman & Hall/CRC. -. - -Zeileis, Achim, Friedrich Leisch, Kurt Hornik, and Christian Kleiber. -2002. "strucchange: An R Package for Testing for Structural Change in -Linear Regression Models." *Journal of Statistical Software* 7 (2): -1--38. . - -[^1]: Unfortunately, the Statlib S Archive is currently not available - anymore. A snapshot, including many of the actual source code files, - is available on the Internet Archive at - . - -[^2]: This is from the earliest capture, from 1998-01-10, available on - the Internet Archive at - . +friends, skiing and hiking. A late afternoon call to his office +asking him to go along for a beer in Munich's English Garden almost never went +unanswered, positively. Back in Vienna at BOKU, colleagues got to know Fritz as a very +structured, thoughtful, calm person who involved everyone, listened to +everyone and always endeavored to balance interests and ensure fairness. +He strengthened cooperation and cohesion with his leadership style. 
+Fritz was a friendly, always modest person who was free of airs and graces or +vanity, despite or perhaps because of his great scientific successes. +The R Core Team and the R community at large miss a contributor, +collaborator, teacher, colleague, and friend. diff --git a/_articles/RJ-2024-001/RJ-2024-001.html b/_articles/RJ-2024-001/RJ-2024-001.html index 9b8282cd7c..e81bdb7e90 100644 --- a/_articles/RJ-2024-001/RJ-2024-001.html +++ b/_articles/RJ-2024-001/RJ-2024-001.html @@ -1,19 +1,19 @@ - + - - - + + + + +body { +visibility: hidden; +} + @@ -21,7 +21,7 @@ - Remembering Friedrich "Fritz" Leisch + Remembering Friedrich "Fritz" Leisch - + - - - - - - - - + + + + + + + - - - - + + + + - - - + + + - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - .shaded .html-widget { - margin-bottom: 0; - border: 1px solid rgba(0, 0, 0, 0.1); - } + - .downlevel .d-byline { - padding-top: 0.8em; - padding-bottom: 0.8em; - font-size: 0.8rem; - line-height: 1.8em; - } + + + + + + + + + + + + - - - - - - - - - - - +`);class qr extends Rr(HTMLElement){}const Fr=function(){if(1>window.distillRunlevel)throw new Error('Insufficient Runlevel for Distill Template!');if('distillTemplateIsLoading'in window&&window.distillTemplateIsLoading)throw new Error('Runlevel 1: Distill Template is getting loaded more than once, aborting!');else window.distillTemplateIsLoading=!0,console.info('Runlevel 1: Distill Template has started loading.');p(document),console.info('Runlevel 1: Static Distill styles have been added.'),console.info('Runlevel 1->2.'),window.distillRunlevel+=1;for(const[e,t]of Object.entries(hi.listeners))'function'==typeof t?document.addEventListener(e,t):console.error('Runlevel 2: Controller listeners need to be functions!');console.info('Runlevel 2: We can now listen to controller events.'),console.info('Runlevel 
2->3.'),window.distillRunlevel+=1;if(2>window.distillRunlevel)throw new Error('Insufficient Runlevel for adding custom elements!');const e=[ki,wi,Ci,Li,Ai,Di,Oi,Ni,Ri,Fi,pi,Hi,zi,T,Bi,Wi,Vi,Mr,$i].concat([Ir,jr,qr]);for(const t of e)console.info('Runlevel 2: Registering custom element: '+t.is),customElements.define(t.is,t);console.info('Runlevel 3: Distill Template finished registering custom elements.'),console.info('Runlevel 3->4.'),window.distillRunlevel+=1,hi.listeners.DOMContentLoaded(),console.info('Runlevel 4: Distill Template initialisation complete.')};window.distillRunlevel=0,yi.browserSupportsAllFeatures()?(console.info('Runlevel 0: No need for polyfills.'),console.info('Runlevel 0->1.'),window.distillRunlevel+=1,Fr()):(console.info('Runlevel 0: Distill Template is loading polyfills.'),yi.load(Fr))}); +//# sourceMappingURL=template.v2.js.map +} + +d-byline .byline { +grid-template-columns: 2fr 2fr 2fr 2fr; +} +d-byline .rjournal { +grid-column-end: span 2; +grid-template-columns: 1fr 1fr; +margin-bottom: 0; +} +d-title h1, d-title p, d-title figure, +d-abstract p, d-abstract b { +grid-column: page; +} +d-title .dt-tags { +grid-column: page; +} +.dt-tags .dt-tag { +text-transform: lowercase; +} +d-article h1 { +line-height: 1.1em; +} +d-abstract p, d-article p { +text-align: justify; +} +@media(min-width: 1000px) { +.d-contents.d-contents-float { +justify-self: end; +} +nav.toc { +border-right: 1px solid rgba(0, 0, 0, 0.1); +border-right-width: 1px; +border-right-style: solid; +border-right-color: rgba(0, 0, 0, 0.1); +} +} +.posts-list .dt-tags .dt-tag { +text-transform: lowercase; +} +@keyframes highlight-target { +0% { +background-color: #ffa; +} +66% { +background-color: #ffa; +} +100% { +background-color: none; +} +} +d-article :target, d-appendix :target { +animation: highlight-target 3s; +} +.header-section-number { +margin-right: 0.5em; +} +d-appendix .citation-appendix, +.d-appendix .citation-appendix { +color: rgb(60, 60, 60); +} +d-article h2 { 
+border-bottom: 0px solid rgba(0, 0, 0, 0.1); +padding-bottom: 0rem; +} +d-article h3 { +font-size: 20px; +} +d-article h4 { +font-size: 18px; +text-transform: none; +} +@media (min-width: 1024px) { +d-article h2 { +font-size: 32px; +} +d-article h3 { +font-size: 24px; +} +d-article h4 { +font-size: 20px; +} +} + @@ -1742,7 +2597,7 @@ @@ -1756,7 +2611,7 @@

Remembering Friedrich “Fritz” Leisch

-

This article remembers our friend and colleague Fritz Leisch (1968–2024) who sadly died earlier this year. Many of the readers of The R Journal will know Fritz as a member of the R Core Team and for many of his contributions to the R community. For us, the co-authors of this article, he was an important companion on our journey with the R project and other scientific endeavours over the years. In the following, we provide a brief synopsis of his career, present his key contributions to the R project and to the scientific community more generally, acknowledge his academic service, and highlight his teaching and mentoring achievements.

+

This article remembers our friend and colleague Fritz Leisch (1968–2024) who sadly died earlier this year. Many of the readers of The R Journal will know Fritz as a member of the R Core Team and for many of his contributions to the R community. For us, the co-authors of this article, he was an important companion on our journey with the R project and other scientific endeavours over the years. In the following, we provide a brief synopsis of his career, present his key contributions to the R project and to the scientific community more generally, acknowledge his academic service, and highlight his teaching and mentoring achievements.

1 Career

-

Friedrich Leisch (see Figure 1) was born 1968 in Vienna (Austria) and died -after serious illness in 2024 in Vienna. Everyone called him Fritz.

-
- - -
-

Starting in 1987, Fritz studied Applied Mathematics at Technische -Universität Wien (TU Wien), earning his master’s degree (Dipl.-Ing.) in -1993. Subsequently, he joined the Department of Statistics and -Probability Theory at TU Wien as an assistant professor which he -continued to be, with short intermissions, until 2006. During this time -he also defended his doctoral thesis in Applied Mathematics (Dr.techn.) -in 1999 and earned his habilitation (venia docendi) in Statistics in -2005.

-

In 1995, he visited the Knowledge-Based Engineering Systems Group at the -University of South-Australia in Adelaide on a Kurt Gödel scholarship -for postgraduate studies. From 1997 to 2004 he was a member of the SFB -project “Adaptive Information Systems and Modeling in Economics and -Management Science”, coordinated at Wirtschaftsuniversität Wien (WU -Wien). From 2002 to 2003 he was assistant professor at the Department of -Statistics and Decision Support Systems, Universität Wien.

-

In 2006 Fritz moved to Munich, Germany, to become a professor for -computational statistics at the Department of Statistics, -Ludwig-Maximilians-Universität München (LMU), see -Figure 2. He -returned to Vienna in 2011 to join the BOKU University as head of the -Institute of Statistics, see Figure 3.

-
- - -
-
- - -
+

Friedrich Leisch (see Figure 1) was born 1968 in Vienna (Austria) and +died after serious illness in 2024 in Vienna. Everyone called him Fritz.

+
+
+Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: BOKU. +

+Figure 1: Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: BOKU. +

+
+
+

Starting in 1987, Fritz studied Applied Mathematics at Technische Universität Wien (TU Wien), +earning his master’s degree (Dipl.-Ing.) in 1993. Subsequently, he joined the +Department of Statistics and Probability Theory at TU Wien as an +assistant professor which he continued to be, with short intermissions, until 2006. +During this time he also defended his doctoral thesis in Applied Mathematics (Dr.techn.) +in 1999 and earned his habilitation (venia docendi) in Statistics in 2005.

+

In 1995, he visited the Knowledge-Based Engineering Systems Group at the University of +South-Australia in Adelaide on a Kurt Gödel scholarship for postgraduate +studies. From 1997 to 2004 he was a member of the SFB project +“Adaptive Information Systems and Modeling in Economics and Management Science”, coordinated +at Wirtschaftsuniversität Wien (WU Wien). From 2002 to 2003 he was assistant professor +at the Department of Statistics and Decision Support Systems, Universität Wien.

+

In 2006 Fritz moved to Munich, Germany, to become a professor for computational +statistics at the Department of Statistics, Ludwig-Maximilians-Universität München (LMU), see Figure 2. +He returned to Vienna in 2011 to join the BOKU University as head of the Institute of Statistics, see Figure 3.

+
+
+Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl. +

+Figure 2: Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl. +

+
+
+
+
+Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU. +

+Figure 3: Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU. +

+
+

2 Key contributions

Fritz’ scientific contributions span an impressive range including -theoretical and methodological work (especially in the field of -clustering and finite mixture models) over software (mostly related to -the R programming language) to applied work and cooperations (notably in +theoretical and methodological work (especially in the field of clustering +and finite mixture models) over software (mostly related to the R +programming language) to applied work and cooperations (notably in marketing, biotechnology, and genomics, among many others). In the following sections we try to highlight his key contributions and scientific legacy.

2.1 R Core & CRAN

During his stay in Australia, Fritz had learned about the existence of -R. Back in Austria, he and Kurt started to explore this potentially good -news more systematically. They soon stopped further work on a statistics -toolbox they had developed for Octave (Eaton et al. 2024), and switched -to R for their applied work, finding lots of room for further -improvement, and thus sending polite emails with patches and more -suggestions to Ross Ihaka and Robert Gentleman. Clearly these were +R. Back in Austria, he and Kurt started to explore this potentially +good news more systematically. They soon stopped further work on a +statistics toolbox they had developed for Octave (Eaton et al. 2024), +and switched to R for their applied work, finding lots of room for +further improvement, and thus sending polite emails with patches and +more suggestions to Ross Ihaka and Robert Gentleman. Clearly these were acceptable in quality but too high in quantity, and it did not take very long that Ross and Robert gave Fritz and Kurt write access to the R sources (initially in CVS, then moved to SVN), and in 1997, they both @@ -1846,159 +2700,144 @@

2.1https://wiki.debian.org/AptCLI) they had successfully employed for managing their computer systems. They also set up the Comprehensive R -Archive Network (CRAN, https://CRAN.R-project.org/, see also Hornik -2012) as a means for redistributing R and its contributed extensions, -and infrastructure for quality assurance of these extensions. These two -contributions paved the way for the amazing growth and success of R -through its wealth of high-quality contributed extensions. See -https://stat.ethz.ch/pipermail/r-announce/1997/000001.html for the -first announcement of CRAN, starting with 12 extension packages. -Currently, there are more than 21,000. See -Figure 4 for a -screenshot2 of the landing page of the CRAN master site at TU Wien, -as last modified by Fritz on 1997-12-09.

-
- - -
+Archive Network (CRAN, https://CRAN.R-project.org/, see also Hornik 2012) as a means for redistributing R and its contributed +extensions, and infrastructure for quality assurance of these +extensions. These two contributions paved the way for the amazing +growth and success of R through its wealth of high-quality contributed +extensions. +See https://stat.ethz.ch/pipermail/r-announce/1997/000001.html for +the first announcement of CRAN, starting with 12 extension packages. +Currently, there are more than 21,000. See Figure 4 +for a screenshot2 +of the landing page of the CRAN master site at TU Wien, as last modified +by Fritz on 1997-12-09.

+
+
+Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive. +

+Figure 4: Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive. +

+
+

The first SVN commit by Fritz is from 1997-10-02, the last from 2013-10-04. Overall, there are 651 commits by Fritz, mostly from the early years of R Core, and related to the R package management and CRAN -mirror system, and the addition of the Sweave system (see -Section 2.3 for more details).

+mirror system, and the addition of the Sweave system +(see Section 2.3 for more details).

2.2 DSC & useR! conferences

-

With establishing CRAN in Vienna at TU Wien, Fritz and Kurt laid the -foundation for a special relationship between Vienna and R that they -characterized as a story of “love and marriage” (Hornik and Leisch -2002). In the decade after the creation of CRAN a number of seminal -R-related meetings took place in Vienna, co-organized by Fritz as well -as several of the co-authors of this paper.

-

The first workshop on “Distributed Statistical Computing” (DSC) took -place from March 19-23, 1999, at TU Wien. The main motivations were -bringing together the R Core Team for its first face-to-face meeting, -discussing the roadmap for the release of R 1.0.0, as well as exploring -potential synergies with other environments for statistical computing. -There were around 30 participants and about 20 presentations, many of -which were relatively short, leaving ample time for discussions (see -Figure 5).

-
-

- - - +

With establishing CRAN in Vienna at TU Wien, Fritz and Kurt +laid the foundation for a special relationship between Vienna and R that they +characterized as a story of “love and marriage” (Hornik and Leisch 2002). In the decade +after the creation of CRAN a number of seminal R-related meetings took place in Vienna, +co-organized by Fritz as well as several of the co-authors of this paper.

+

The first workshop on “Distributed Statistical Computing” (DSC) took place from +March 19-23, 1999, at TU Wien. The main motivations were bringing together the R Core Team +for its first face-to-face meeting, discussing the roadmap for the release of R 1.0.0, +as well as exploring potential synergies with other environments for statistical computing. +There were around 30 participants and about 20 presentations, many of which were +relatively short, leaving ample time for discussions (see Figure 5).

+
+
+Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. Source: Douglas Bates (DSC 1999 homepage). +

+Figure 5: Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. Source: Douglas Bates (DSC 1999 homepage).

-
-Discussions at DSC 1999 (top to bottom, left to right): -Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, -Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt -Hornik. Source: Douglas Bates (DSC 1999 homepage). -
-
-

Two more DSC workshops were organized at TU Wien in 2001 and 2003. While -meetings focusing on R development issues (with the R Core Team and -everyone else interested) were still an important part of these -conferences, they also saw an increasing number of regular conference +

+ +

Two more DSC workshops were organized at TU Wien +in 2001 and 2003. While meetings focusing on R development issues (with the +R Core Team and everyone else interested) were still an important part of +these conferences, they also saw an increasing number of regular conference presentations on R packages and their different fields of application (e.g., establishing infrastructure for spatial data). In 2001 there were -around 60 participants and about 30 presentations, most with -corresponding papers in the online proceedings (Hornik and Leisch 2001). -In 2003 this increased to more than 150 participants and about 60 -presentations, again with the majority in the online proceedings -(Hornik, Leisch, and Zeileis 2003).

-

The high demand for a platform, where R users from different fields -could exchange ideas, prompted the creation of a new conference series -called useR!. The first two installments again took place in Vienna in -2004 at TU Wien and in 2006 at WU Wien. Torsten Hothorn, David Meyer, -and Achim Zeileis took the lead in the organization with support and -advice from Fritz and Kurt in the background. An important contribution -from the R Core Team at the useR! conferences were keynote lectures -highlighting important developments, e.g., a keynote given by Fritz at -useR! 2004 on S4 classes and methods. Both conferences continued the -success of the earlier DSC workshops with the number of participants -rising to more than 200 in 2004 and close to 350 in 2006. Similarly, the -number of presentations grew to about 100 in 2004 and more than 150 in -2006.

-

In addition to the efforts initiated by Fritz and Kurt, another key -factor to the success of these meetings was the city of Vienna with its -culture, cafes, wine and beer pubs, etc. (see Hornik and Leisch 2002 and -also Figure 6).

-
- - -
+around 60 participants and about 30 presentations, most with corresponding +papers in the online proceedings (Hornik and Leisch 2001). In 2003 this +increased to more than 150 participants and about 60 presentations, again +with the majority in the online proceedings (Hornik et al. 2003).

+

The high demand for a platform, where R users from different fields could +exchange ideas, prompted the creation of a new conference series called +useR!. The first two installments again took place in Vienna in 2004 +at TU Wien and in 2006 at WU Wien. +Torsten Hothorn, David Meyer, and Achim Zeileis took the lead in the +organization with support and advice from Fritz and Kurt in the background. +An important contribution from the R Core Team at the useR! conferences +were keynote lectures highlighting important developments, e.g., a keynote +given by Fritz at useR! 2004 on S4 classes and methods. Both conferences +continued the success of the earlier DSC workshops with the number of +participants rising to more than 200 in 2004 and close to 350 in 2006. +Similarly, the number of presentations grew to about 100 in 2004 and more +than 150 in 2006.

+

In addition to the efforts initiated by Fritz and Kurt, another key factor +to the success of these meetings was the city of Vienna with its culture, +cafes, wine and beer pubs, etc. (see Hornik and Leisch 2002 and also +Figure 6).

+
+
+Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 2006 homepage). +

+Figure 6: Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 2006 homepage). +

+
+

2.3 Sweave & reproducibility

-

With Sweave (Leisch 2002), Fritz pioneered what we now can understand -as the technical foundation of reproducible research. Sweave was the -main inspiration for knitr -(Xie 2015) which in turn led to -rmarkdown (Xie, Allaire, -and Grolemund 2018) and -quarto (Scheidegger et al. -2024). All these systems are used today to generate countless scientific -articles, package vignettes, webpages, books, blogs, and much more in a -dynamic and reproducible way.

-

Of course, Fritz was not the first one going in this direction. The -concept of “literate programming” had been introduced by Knuth (1984), -allowing to combine the source code for software and the corresponding -documentation in the same file. The concepts of “tangling”, that is, -extracting the code for compilation, and “weaving”, the process of -generating a nicely looking document containing code next to prosa and -formulae, have their roots in the WEB and CWEB systems (Knuth and -Levy 1993). As these packages were specific to code in Pascal (WEB) -and C (CWEB), respectively, and documentation in LaTeX, Ramsey (1994) -introduced his noweb system as a literate programming tool that is -agnostic to the programming language used and also supports HTML in -addition to LaTeX and a few other backends for documentation. The -noweb syntax for code chunks is:

+

With Sweave (Leisch 2002), Fritz pioneered what we now can understand as +the technical foundation of reproducible research. Sweave was the main +inspiration for knitr (Xie 2015) which in turn led to +rmarkdown (Xie et al. 2018) and quarto +(Scheidegger et al. 2024). All these systems are used today to +generate countless scientific articles, package vignettes, webpages, books, blogs, +and much more in a dynamic and reproducible way.

+

Of course, Fritz was not the first one going in this direction. The concept +of “literate programming” had been introduced by Knuth (1984), allowing to +combine the source code for software and the corresponding documentation +in the same file. The concepts of “tangling”, that is, extracting the code +for compilation, and “weaving”, the process of generating a nicely looking +document containing code next to prose and formulae, have their roots in the +WEB and CWEB systems (Knuth and Levy 1993). As these packages were specific +to code in Pascal (WEB) and C (CWEB), respectively, and documentation in +LaTeX, Ramsey (1994) introduced his noweb system as a literate programming +tool that is agnostic to the programming language used and also supports HTML +in addition to LaTeX and a few other backends for documentation. The noweb +syntax for code chunks is:

<<code>>=
 1 + 2
 @
-

This will look familiar to users of Sweave. From this history, the -naming decisions for the software and its file format can be understood: -Sweave is the function that weaves code in S (or R - both languages -still existed side by side at the time) with its output and -documentation. And Rnw stands for files mixing R code with noweb -syntax.

-

Starting in the mid-1990s to the early 2000s, interests shifted from -just “literate programming” to “literate data analysis” (Leisch 2002; -Leisch and Rossini 2003) as a core ingredient for reproducible research -(Buckheit and Donoho 1995). The seminal new idea was to have dynamic -documents so outputs of code such as figures and tables could be -updated automatically when the underlying data changed, which was -pioneered by the late Günter Sawitzki in his Voyager system (Sawitzki -1996).

-

Fritz amalgamated all of this into Sweave which was the first time -that the power of dynamic reporting became easily available in a -widely-used programming language for statistics in combination with the -standard textprocessing system LaTeX. This turned out to be a “killer -feature” of R at the time and the basis for further work towards -reproducible research (Hothorn and Leisch 2011; Stodden, Leisch, and -Peng 2014).

-

Sweave was also the basis for R package vignettes (Leisch 2003) as an -addition to the previously available technical manual pages. The first R -package vignette published on CRAN in May 2002 was in the -strucchange package, -providing methods for testing, monitoring, and dating structural -changes. The vignette was the Sweave adaptation of an introduction to -the package that had been co-authored by Fritz and published a couple of -months earlier in the Journal of Statistical Software (Zeileis et al. -2002). See Figure 7 for how Fritz used it to illustrate the idea of -package vignettes in Leisch (2003) and that the R code from vignettes -can be easily extracted (also interactively), explored, and re-run.

-
- - -
+

This will look familiar to users of Sweave. From this history, the naming +decisions for the software and its file format can be understood: Sweave +is the function that weaves code in S (or R - both languages still existed +side by side at the time) with its output and documentation. And Rnw stands for files +mixing R code with noweb syntax.

+

Starting in the mid-1990s to the early 2000s, interests shifted from just +“literate programming” to “literate data analysis” (Leisch 2002; Leisch and Rossini 2003) +as a core ingredient for reproducible research (Buckheit and Donoho 1995). +The seminal new idea was to have dynamic documents so outputs of code +such as figures and tables could be updated automatically when the underlying +data changed, which was pioneered by the late Günter Sawitzki in his +Voyager system (Sawitzki 1996).

+

Fritz amalgamated all of this into Sweave which was the first time that the +power of dynamic reporting became easily available in a widely-used programming +language for statistics in combination with the standard text-processing system +LaTeX. This turned out to be a “killer feature” of R at the time and the basis +for further work towards reproducible research (Hothorn and Leisch 2011; Stodden et al. 2014).

+

Sweave was also the basis for R package vignettes (Leisch 2003) as an +addition to the previously available technical manual pages. +The first R package vignette published on CRAN in May 2002 was in the +strucchange package, providing methods for testing, monitoring, +and dating structural changes. The vignette was the Sweave adaptation +of an introduction to the package that had been co-authored by Fritz and +published a couple of months earlier in the Journal of Statistical Software +(Zeileis et al. 2002). See Figure 7 for how +Fritz used it to illustrate the idea of package vignettes in Leisch (2003) +and that the R code from vignettes can be easily extracted (also interactively), +explored, and re-run.

+
+
+Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2). +

+Figure 7: Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2). +

+
+

2.4 Clustering & mixture models

Fritz’ theoretical and methodological work focused in particular on clustering and finite mixture models. Centroid-based partitioning @@ -2007,23 +2846,22 @@

flexclust (Leisch 2006) -and flexmix (Leisch 2004; -Grün and Leisch 2008), contributing to the clustering tools available -for R (see the CRAN Task View -Cluster). Both packages -provide general infrastructure for (model-based) clustering and enable -rapid prototyping and the simple extension to new variants taking into -account complicated data structures or challenging model specifications -(see, for example, psychomix, Frick et al. 2012).

+implementation of the packages flexclust (Leisch 2006) and +flexmix (Leisch 2004; Grün and Leisch 2008), contributing to +the clustering tools available for R (see the CRAN Task View +Cluster). Both packages provide general infrastructure for +(model-based) clustering and enable rapid prototyping and the simple +extension to new variants taking into account complicated data +structures or challenging model specifications (see, for example, +psychomix, Frick et al. 2012).

2.5 Applied work

For many years, Fritz and Kurt actively participated in the Biological Psychiatry working group at Medizinische Universität Wien. The first -paper co-authored by Fritz dates from 2000 (Bailer et al. 2000), the -last from 2023 (Solmi et al. 2023). The joint research was mostly -focused on linking genetic traits to psychiatric disorders and treatment -success. This prompted many enhancements in the classical test +paper co-authored by Fritz dates from 2000 +(Bailer et al. 2000), the last from 2023 +(Solmi et al. 2023). The joint research was mostly +focused on linking genetic traits to psychiatric disorders and +treatment success. This prompted many enhancements in the classical test infrastructure in base R - in surprising ways to some reviewers, who could not believe that Fisher’s test really worked for tables with more than two rows or columns. It also established a strong need for @@ -2034,204 +2872,191 @@

2.5<

Fritz also intensively collaborated with Sara Dolnicar to advance data analytic methods for data-driven market segmentation analysis. They received the Charles R. Goeldner Article of Excellence Award for their -work on extracting stable Winter tourist segments in Austria with bagged -clustering (Dolnicar and Leisch 2003). They focused on the evaluation of -data structure and the selection of suitable segments based on segment -stability as a key criterion (Dolnicar and Leisch 2010, 2017). Finally, -this joint work resulted in Dolnicar, Grün, and Leisch (2018) which -provides practical guidance for users of market segmentation solutions -and for data analysts with respect to the technical and statistical -aspects of market segmentation analysis.

-

As head of the Institute of Statistics, Fritz was involved in various -interdisciplinary research projects covering almost the whole range of -core areas of research at BOKU. He was key researcher at the Austrian -Centre of Industrial Biotechnology (acib) (Scharl, Voglhuber, and Leisch -2009; Melcher et al. 2017) and faculty member of the doctoral schools on -agricultural genomics and bioprocess engineering. Among others he -contributed to the fields of zoology (Cech et al. 2022), forestry, -transportation and tourism (Taczanowska et al. 2023) as well as -chemistry, genomics and wildlife biology (Steiner, Leisch, and -Hackländer 2014).

+work on extracting stable Winter tourist segments in Austria with +bagged clustering (Dolnicar and Leisch 2003). They focused on the +evaluation of data structure and the selection of suitable segments +based on segment stability as a key criterion (Dolnicar and Leisch 2010, 2017). Finally, this joint work resulted in +Dolnicar et al. (2018) which provides practical guidance for +users of market segmentation solutions and for data analysts with +respect to the technical and statistical aspects of market +segmentation analysis.

+

As head of the Institute of Statistics, Fritz was involved +in various interdisciplinary research projects covering almost the whole +range of core areas of research at BOKU. He was key researcher at the +Austrian Centre of Industrial Biotechnology (acib) +(Scharl et al. 2009; Melcher et al. 2017) and +faculty member of the doctoral schools on agricultural genomics and +bioprocess engineering. Among others he contributed to the fields of +zoology (Cech et al. 2022), forestry, transportation and tourism +(Taczanowska et al. 2023) as well as chemistry, genomics and wildlife +biology (Steiner et al. 2014).

3 Academic service

In addition to the services for the various conferences and proceedings -already described above, he served the scientific community in various -ways. In January 2001, he co-created R News which evolved into The R -Journal eight years later. For the journal Computational Statistics -he was an associate editor from 2005 to 2006 before he became -editor-in-chief from 2007 to 2011 (see Symanzik, Mori, and Vieu 2024 for -more details). Other notable contributions include being editor for the -Journal of Statistical Software, core member of the Bioconductor -project for statistical software in bioinformatics, and first secretary -general of the R Foundation for Statistical Computing when it was -formed in 2002.

+already described above, he served the scientific community in various ways. +In January 2001, he co-created R News which evolved into +The R Journal eight years later. For the journal Computational Statistics +he was an associate editor from 2005 to 2006 before he became editor-in-chief +from 2007 to 2011 (see Symanzik et al. 2024 for more details). +Other notable contributions include being +editor for the Journal of Statistical Software, core member of the +Bioconductor project for statistical software in bioinformatics, and +first secretary general of the R Foundation for Statistical Computing when +it was formed in 2002.

4 Teaching & mentoring

-

Fritz taught generations of students at bachelor, master, and PhD level -and introduced hundreds of useRs to proper R development in his -“Introduction to R Programming” short course. At TU Wien, LMU, and BOKU, -he taught courses in applied statistics, statistical computing and -computational statistics. He had the ability to explain even difficult -content in a simple way and to inspire students with statistics and -programming with R. He co-founded the “Munich R Courses” lecture series -and was part of a group aiming to initiate a formal PhD program in -statistics at LMU.

-

Fritz supervised Bettina Grün, Theresa Scharl, Sebastian Kaiser, Manuel -Eugster, Christina Yassouridis, Rainer Dangl, Weksi Budiaji, Muhammad -Atif and Simona Jokubauskaite as his PhD students. Based on his -research, Fritz often discussed the state of and the need for -reproducible research and taught his many students how to avoid the many -small and innocent errors that have a tendency to pile up and invalidate -reported statistical results, with potentially devastating consequences, -as we all know.

+

Fritz taught generations of students at bachelor, master, and PhD level and +introduced hundreds of useRs to proper R development in his “Introduction to +R Programming” short course. At TU Wien, LMU, and BOKU, he taught courses in applied +statistics, statistical computing and computational statistics. He had the +ability to explain even difficult content in a simple way and to inspire students +with statistics and programming with R. He +co-founded the “Munich R Courses” lecture series and was part of a group +aiming to initiate a formal PhD program in statistics at LMU.

+

Fritz supervised +Bettina Grün, Theresa Scharl, +Sebastian Kaiser, Manuel Eugster, +Christina Yassouridis, Rainer Dangl, +Weksi Budiaji, Muhammad Atif and +Simona Jokubauskaite as his PhD students. +Based on his research, Fritz often discussed the state of and the need for reproducible +research and taught his many students how to avoid the many small and +innocent errors that have a tendency to pile up and invalidate reported +statistical results, with potentially devastating consequences, as we all know.

5 Odds & ends

Fritz loved cooking, music, motorbike riding, playing cards with his -friends, skiing and hiking. A late afternoon call to his office asking -him to go along for a beer in Munich’s English Garden almost never went -unanswered, positively. Back in Vienna at BOKU, colleagues got to know -Fritz as a very structured, thoughtful, calm person who involved -everyone, listened to everyone and always endeavored to balance -interests and ensure fairness. He strengthened cooperation and cohesion -with his leadership style. Fritz was a friendly, always modest person -who was free of airs and graces or vanity, despite or perhaps because of -his great scientific successes. The R Core Team and the R community at -large miss a contributor, collaborator, teacher, colleague, and friend.

-

References

-
- +friends, skiing and hiking. A late afternoon call to his office +asking him to go along for a beer in Munich’s English Garden almost never went +unanswered, positively. Back in Vienna at BOKU, colleagues got to know Fritz as a very +structured, thoughtful, calm person who involved everyone, listened to +everyone and always endeavored to balance interests and ensure fairness. +He strengthened cooperation and cohesion with his leadership style. +Fritz was a friendly, always modest person who was free of airs and graces or +vanity, despite or perhaps because of his great scientific successes. +The R Core Team and the R community at large miss a contributor, +collaborator, teacher, colleague, and friend.

+
+

5.1 CRAN packages used

+

knitr, rmarkdown, quarto, strucchange, flexclust, flexmix

+

5.2 CRAN Task Views implied by cited packages

+

Cluster, Econometrics, Environmetrics, Finance, Psychometrics, ReproducibleResearch, TimeSeries

+
+
+U. Bailer, F. Leisch, K. Meszaros, E. Lenzinger, U. Willinger, R. Strobl, C. Gebhardt, E. Gerhard, K. Fuchs, W. Sieghart, et al. Genome scan for susceptibility loci for schizophrenia. Neuropsychobiology, 42(4): 175–182, 2000. DOI 10.1159/000026690. +
+
+J. B. Buckheit and D. L. Donoho. WaveLab and reproducible research. In Wavelets in statistics, Eds A. Antoniadis and G. Oppenheim, pages 55–82. 1995. New York: Springer-Verlag. DOI 10.1007/978-1-4612-2544-7_5. +
+
+R. M. Cech, S. Jovanovic, S. Kegley, K. Hertoge, F. Leisch and J. G. Zaller. Reducing overall herbicide use may reduce risks to humans but increase toxic loads to honeybees, earthworms and birds. Environmental Sciences Europe, 34(1): 44, 2022. DOI 10.1186/s12302-022-00622-2. +
+
+S. Dolnicar, B. Grün and F. Leisch. Market segmentation analysis: Understanding it, doing it, and making it useful. Springer-Verlag, 2018. DOI 10.1007/978-981-10-8818-6. +
+
+S. Dolnicar and F. Leisch. Evaluation of structure and reproducibility of cluster solutions using the bootstrap. Marketing Letters, 21(1): 83–101, 2010. DOI 10.1007/s11002-009-9083-4. +
+
+S. Dolnicar and F. Leisch. Using segment level stability to select target segments in data-driven market segmentation studies. Marketing Letters, 28(3): 423–436, 2017. DOI 10.1007/s11002-017-9423-8. +
+
+S. Dolnicar and F. Leisch. Winter tourist segments in Austria: Identifying stable vacation styles using bagged clustering techniques. Journal of Travel Research, 41(3): 281–292, 2003. DOI 10.1177/0047287502239037. +
+
+J. W. Eaton, D. Bateman, S. Hauberg and R. Wehbring. GNU Octave version 9.2.0 manual: A high-level interactive language for numerical computations. 2024. URL https://www.gnu.org/software/octave/doc/v9.2.0/. +
+
+H. Frick, C. Strobl, F. Leisch and A. Zeileis. Flexible Rasch mixture models with package psychomix. Journal of Statistical Software, 48(7): 1–25, 2012. DOI 10.18637/jss.v048.i07. +
+
+B. Grün and F. Leisch. FlexMix version 2: Finite mixtures with concomitant variables and varying and constant parameters. Journal of Statistical Software, 28(4): 1–35, 2008. DOI 10.18637/jss.v028.i04. +
+
+K. Hornik. The Comprehensive R Archive Network. Wiley Interdisciplinary Reviews: Computational Statistics, 4(4): 394–398, 2012. DOI 10.1002/wics.1212. +
+
+K. Hornik and F. Leisch, eds. Proceedings of the 2nd International Workshop on Distributed Statistical Computing, Vienna, Austria. 2001. URL https://www.R-project.org/conferences/DSC-2001/Proceedings/. ISSN 1609-395X. +
+
+K. Hornik and F. Leisch. Vienna and R: Love, marriage and the future. In Festschrift 50 Jahre Österreichische Statistische Gesellschaft, Ed R. Dutter, pages 61–70. 2002. Österreichische Statistische Gesellschaft. ISSN 1026-597X. +
+
+K. Hornik, F. Leisch and A. Zeileis, eds. Proceedings of the 3rd International Workshop on Distributed Statistical Computing, Vienna, Austria. 2003. URL https://www.R-project.org/conferences/DSC-2003/Proceedings/. ISSN 1609-395X. +
+
+T. Hothorn and F. Leisch. Case studies in reproducibility. Briefings in Bioinformatics, 12(3): 288–300, 2011. DOI 10.1093/bib/bbq084. +
+
+D. E. Knuth. Literate programming. The Computer Journal, 27(2): 97–111, 1984. DOI 10.1093/comjnl/27.2.97. +
+
+D. E. Knuth and S. Levy. The CWEB system of structured documentation. Reading: Addison-Wesley, 1993. +
+
+F. Leisch. A toolbox for k-centroids cluster analysis. Computational Statistics and Data Analysis, 51(2): 526–544, 2006. DOI 10.1016/j.csda.2005.10.006. +
+
+F. Leisch. FlexMix: A general framework for finite mixture models and latent class regression in R. Journal of Statistical Software, 11(8): 1–18, 2004. DOI 10.18637/jss.v011.i08. +
+
+F. Leisch. Sweave, part II: Package vignettes. R News, 3(2): 21–24, 2003. URL https://CRAN.R-project.org/doc/Rnews/. +
+
+F. Leisch. Sweave: Dynamic generation of statistical reports using literate data analysis. In COMPSTAT 2002 – proceedings in computational statistics, Eds W. Härdle and B. Rönz, pages 575–580. 2002. Heidelberg: Physica Verlag. DOI 10.1007/978-3-642-57489-4_89. +
+
+F. Leisch and A. J. Rossini. Reproducible statistical research. Chance, 16(2): 46–50, 2003. DOI 10.1080/09332480.2003.10554848. +
+
+M. Melcher, T. Scharl, M. Luchner, G. Striedner and F. Leisch. Boosted structured additive regression for Escherichia coli fed-batch fermentation modeling. Biotechnology and Bioengineering, 114(2): 321–334, 2017. DOI 10.1002/bit.26073. +
+
+N. Ramsey. Literate programming simplified. IEEE Software, 11(5): 97–105, 1994. DOI 10.1109/52.311070. +
+
+G. Sawitzki. Extensible statistical software: On a voyage to Oberon. Journal of Computational and Graphical Statistics, 5(3): 263–283, 1996. DOI 10.1080/10618600.1996.10474711. +
+
+T. Scharl, I. Voglhuber and F. Leisch. Exploratory and inferential analysis of gene cluster neighborhood graphs. BMC Bioinformatics, 10(1): 288, 2009. DOI 10.1186/1471-2105-10-288. +
+
+C. Scheidegger, C. Teague, C. Dervieux, J. J. Allaire and Y. Xie. Quarto: An open-source scientific and technical publishing system. 2024. URL https://quarto.org/. Version 1.5. +
+
+M. Solmi, T. Thompson, A. Estradé, A. Agorastos, J. Radua, S. Cortese, E. Dragioti, F. Leisch, D. Vancampfort, L. C. Thygesen, et al. Validation of the Collaborative Outcomes study on Health and Functioning during Infection Times (COH-FIT) questionnaire for adults. Journal of Affective Disorders, 326: 249–261, 2023. DOI 10.1016/j.jad.2022.12.022. +
+
+W. Steiner, F. Leisch and K. Hackländer. A review on the temporal pattern of deer-vehicle accidents: Impact of seasonal, diurnal and lunar effects in cervids. Accident Analysis & Prevention, 66: 168–181, 2014. DOI 10.1016/j.aap.2014.01.020. +
+
+V. Stodden, F. Leisch and R. D. Peng. Implementing reproducible research. Boca Raton: Chapman & Hall/CRC, 2014. +
+
+J. Symanzik, Y. Mori and P. Vieu. A memorial for the late Professor Friedrich Leisch. Computational Statistics, 39: 2024. Forthcoming. +
+
+K. Taczanowska, B. Latosinska, C. Brandenburg, F. Leisch, C. Czachs and A. Muhar. Lobbying in social media as a new source of survey bias. Journal of Outdoor Recreation and Tourism, 44(A): 100689, 2023. DOI 10.1016/j.jort.2023.100689. +
+
+Y. Xie. Dynamic documents with R and knitr. 2nd ed Boca Raton: Chapman & Hall/CRC, 2015. DOI 10.1201/9781315382487. +
+
+Y. Xie, J. J. Allaire and G. Grolemund. R Markdown: The definitive guide. Boca Raton: Chapman & Hall/CRC, 2018. DOI 10.1201/9781138359444. +
+
+A. Zeileis, F. Leisch, K. Hornik and C. Kleiber. strucchange: An R package for testing for structural change in linear regression models. Journal of Statistical Software, 7(2): 1–38, 2002. DOI 10.18637/jss.v007.i02. +
-

Bailer, Ursula, Friedrich Leisch, Kurt Meszaros, Elisabeth Lenzinger, -Ulrike Willinger, Rainer Strobl, Christian Gebhardt, et al. 2000. -“Genome Scan for Susceptibility Loci for Schizophrenia.” -Neuropsychobiology 42 (4): 175–82. -https://doi.org/10.1159/000026690.

-

Buckheit, Jonathan B., and David L. Donoho. 1995. “WaveLab and -Reproducible Research.” In Wavelets in Statistics, edited by A. -Antoniadis and G. Oppenheim, 55–82. Lecture Notes in Statistics. New -York: Springer-Verlag. https://doi.org/10.1007/978-1-4612-2544-7_5.

-

Cech, Ramona M, Suzanne Jovanovic, Susan Kegley, Koen Hertoge, Friedrich -Leisch, and Johann G Zaller. 2022. “Reducing Overall Herbicide Use May -Reduce Risks to Humans but Increase Toxic Loads to Honeybees, Earthworms -and Birds.” Environmental Sciences Europe 34 (1): 44. -https://doi.org/10.1186/s12302-022-00622-2.

-

Dolnicar, Sara, Bettina Grün, and Friedrich Leisch. 2018. Market -Segmentation Analysis: Understanding It, Doing It, and Making It -Useful. Management for Professionals. Springer-Verlag. -https://doi.org/10.1007/978-981-10-8818-6.

-

Dolnicar, Sara, and Friedrich Leisch. 2003. “Winter Tourist Segments in -Austria: Identifying Stable Vacation Styles Using Bagged Clustering -Techniques.” Journal of Travel Research 41 (3): 281–92. -https://doi.org/10.1177/0047287502239037.

-

———. 2010. “Evaluation of Structure and Reproducibility of Cluster -Solutions Using the Bootstrap.” Marketing Letters 21 (1): 83–101. -https://doi.org/10.1007/s11002-009-9083-4.

-

———. 2017. “Using Segment Level Stability to Select Target -Segments in Data-Driven Market Segmentation Studies.” Marketing -Letters 28 (3): 423–36. https://doi.org/10.1007/s11002-017-9423-8.

-

Eaton, John W., David Bateman, Søren Hauberg, and Rik Wehbring. 2024. -GNU Octave Version 9.2.0 Manual: A High-Level Interactive Language for -Numerical Computations. -https://www.gnu.org/software/octave/doc/v9.2.0/.

-

Frick, Hannah, Carolin Strobl, Friedrich Leisch, and Achim Zeileis. -2012. “Flexible Rasch Mixture Models with Package psychomix.” Journal -of Statistical Software 48 (7): 1–25. -https://doi.org/10.18637/jss.v048.i07.

-

Grün, Bettina, and Friedrich Leisch. 2008. “FlexMix Version 2: Finite -Mixtures with Concomitant Variables and Varying and Constant -Parameters.” Journal of Statistical Software 28 (4): 1–35. -https://doi.org/10.18637/jss.v028.i04.

-

Hornik, Kurt. 2012. “The Comprehensive R Archive Network.” Wiley -Interdisciplinary Reviews: Computational Statistics 4 (4): 394–98. -https://doi.org/10.1002/wics.1212.

-

Hornik, Kurt, and Friedrich Leisch, eds. 2001. Proceedings of the 2nd -International Workshop on Distributed Statistical Computing, Vienna, -Austria. https://www.R-project.org/conferences/DSC-2001/Proceedings/.

-

———. 2002. “Vienna and R: Love, Marriage and the Future.” In -Festschrift 50 Jahre Österreichische Statistische Gesellschaft, edited -by Rudolf Dutter, 61–70. Österreichische Statistische Gesellschaft.

-

Hornik, Kurt, Friedrich Leisch, and Achim Zeileis, eds. 2003. -Proceedings of the 3rd International Workshop on Distributed -Statistical Computing, Vienna, Austria. -https://www.R-project.org/conferences/DSC-2003/Proceedings/.

-

Hothorn, Torsten, and Friedrich Leisch. 2011. “Case Studies in -Reproducibility.” Briefings in Bioinformatics 12 (3): 288–300. -https://doi.org/10.1093/bib/bbq084.

-

Knuth, Donald E. 1984. “Literate Programming.” The Computer Journal 27 -(2): 97–111. https://doi.org/10.1093/comjnl/27.2.97.

-

Knuth, Donald E., and Silvio Levy. 1993. The CWEB System of Structured -Documentation. Reading: Addison-Wesley.

-

Leisch, Friedrich. 2002. “Sweave: Dynamic Generation of Statistical -Reports Using Literate Data Analysis.” In COMPSTAT 2002 – Proceedings -in Computational Statistics, edited by Wolfgang Härdle and Bernd Rönz, -575–80. Heidelberg: Physica Verlag. -https://doi.org/10.1007/978-3-642-57489-4_89.

-

———. 2003. “Sweave, Part II: Package Vignettes.” R News 3 (2): -21–24. https://CRAN.R-project.org/doc/Rnews/.

-

———. 2004. “FlexMix: A General Framework for Finite Mixture Models -and Latent Class Regression in R.” Journal of Statistical Software 11 -(8): 1–18. https://doi.org/10.18637/jss.v011.i08.

-

———. 2006. “A Toolbox for k-Centroids Cluster Analysis.” -Computational Statistics and Data Analysis 51 (2): 526–44. -https://doi.org/10.1016/j.csda.2005.10.006.

-

Leisch, Friedrich, and Anthony J. Rossini. 2003. “Reproducible -Statistical Research.” Chance 16 (2): 46–50. -https://doi.org/10.1080/09332480.2003.10554848.

-

Melcher, Michael, Theresa Scharl, Markus Luchner, Gerald Striedner, and -Friedrich Leisch. 2017. “Boosted Structured Additive Regression for -Escherichia Coli Fed-Batch Fermentation Modeling.” Biotechnology and -Bioengineering 114 (2): 321–34. https://doi.org/10.1002/bit.26073.

-

Ramsey, Norman. 1994. “Literate Programming Simplified.” IEEE Software -11 (5): 97–105. https://doi.org/10.1109/52.311070.

-

Sawitzki, Günther. 1996. “Extensible Statistical Software: On a Voyage -to Oberon.” Journal of Computational and Graphical Statistics 5 (3): -263–83. https://doi.org/10.1080/10618600.1996.10474711.

-

Scharl, Theresa, Ingo Voglhuber, and Friedrich Leisch. 2009. -“Exploratory and Inferential Analysis of Gene Cluster Neighborhood -Graphs.” BMC Bioinformatics 10 (1): 288. -https://doi.org/10.1186/1471-2105-10-288.

-

Scheidegger, Carlos, Charles Teague, Christophe Dervieux, J. J. Allaire, -and Yihui Xie. 2024. “Quarto: An Open-Source Scientific and Technical -Publishing System.” https://quarto.org/.

-

Solmi, Marco, Trevor Thompson, Andrés Estradé, Agorastos Agorastos, -Joaquim Radua, Samuele Cortese, Elena Dragioti, et al. 2023. “Validation -of the Collaborative Outcomes Study on Health and Functioning During -Infection Times (COH-FIT) Questionnaire for Adults.” Journal of -Affective Disorders 326: 249–61. -https://doi.org/10.1016/j.jad.2022.12.022.

-

Steiner, Wolfgang, Friedrich Leisch, and Klaus Hackländer. 2014. “A -Review on the Temporal Pattern of Deer-Vehicle Accidents: Impact of -Seasonal, Diurnal and Lunar Effects in Cervids.” Accident Analysis & -Prevention 66: 168–81. https://doi.org/10.1016/j.aap.2014.01.020.

-

Stodden, Victoria, Friedrich Leisch, and Roger D. Peng. 2014. -Implementing Reproducible Research. Boca Raton: Chapman & Hall/CRC.

-

Symanzik, Jürgen, Yuichi Mori, and Philippe Vieu. 2024. “A Memorial for -the Late Professor Friedrich Leisch.” Computational Statistics 39.

-

Taczanowska, Karolina, Barbara Latosinska, Christiane Brandenburg, -Friedrich Leisch, Christina Czachs, and Andreas Muhar. 2023. “Lobbying -in Social Media as a New Source of Survey Bias.” Journal of Outdoor -Recreation and Tourism 44 (A): 100689. -https://doi.org/10.1016/j.jort.2023.100689.

-

Xie, Yihui. 2015. Dynamic Documents with R and knitr. 2nd ed. Boca -Raton: Chapman & Hall/CRC. https://doi.org/10.1201/9781315382487.

-

Xie, Yihui, J. J. Allaire, and Garrett Grolemund. 2018. R Markdown: The -Definitive Guide. Boca Raton: Chapman & Hall/CRC. -https://doi.org/10.1201/9781138359444.

-

Zeileis, Achim, Friedrich Leisch, Kurt Hornik, and Christian Kleiber. -2002. “strucchange: An R Package for Testing for Structural Change in -Linear Regression Models.” Journal of Statistical Software 7 (2): -1–38. https://doi.org/10.18637/jss.v007.i02.

- +
@@ -2244,23 +3069,25 @@

References

+

References

+

Reuse

-

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

+

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

-
Grün, et al., "Remembering Friedrich "Fritz" Leisch", The R Journal, 2024
+
Grün, et al., "Remembering Friedrich "Fritz" Leisch", The R Journal, 2025

BibTeX citation

@article{RJ-2024-001,
   author = {Grün, Bettina and Hornik, Kurt and Hothorn, Torsten and Scharl, Theresa and Zeileis, Achim},
-  title = {Remembering Friedrich "Fritz" Leisch},
+  title = {Remembering Friedrich "Fritz" Leisch},
   journal = {The R Journal},
-  year = {2024},
+  year = {2025},
   note = {https://doi.org/10.32614/RJ-2024-001},
   doi = {10.32614/RJ-2024-001},
   volume = {16},
   issue = {1},
   issn = {2073-4859},
-  pages = {1}
+  pages = {5-14}
 }
diff --git a/_articles/RJ-2024-001/RJ-2024-001.pdf b/_articles/RJ-2024-001/RJ-2024-001.pdf new file mode 100644 index 0000000000..ded232856d Binary files /dev/null and b/_articles/RJ-2024-001/RJ-2024-001.pdf differ diff --git a/_articles/RJ-2024-001/fritz.knit.md b/_articles/RJ-2024-001/RJ-2024-001.tex similarity index 56% rename from _articles/RJ-2024-001/fritz.knit.md rename to _articles/RJ-2024-001/RJ-2024-001.tex index ab2fa50f76..0b85d93ab1 100644 --- a/_articles/RJ-2024-001/fritz.knit.md +++ b/_articles/RJ-2024-001/RJ-2024-001.tex @@ -1,64 +1,27 @@ ---- -title: Remembering Friedrich "Fritz" Leisch -date: 2024-09-18 -draft: yes -author: - - name: Bettina Grün - affiliation: WU Wirtschaftsuniversität Wien - address: Austria - orcid: 0000-0001-7265-4773 - email: Bettina.Gruen@wu.ac.at - - name: Kurt Hornik - affiliation: WU Wirtschaftsuniversität Wien - address: Austria - orcid: 0000-0003-4198-9911 - email: Kurt.Hornik@R-project.org - - name: Torsten Hothorn - affiliation: Universität Zürich - address: Switzerland - orcid: 0000-0001-8301-0471 - email: Torsten.Hothorn@R-project.org - - name: Theresa Scharl - affiliation: BOKU University - address: Austria - orcid: 0000-0001-8850-3312 - email: Theresa.Scharl@boku.ac.at - - name: Achim Zeileis - affiliation: Universität Innsbruck - address: Austria - url: https://www.zeileis.org/ - orcid: 0000-0003-0918-3766 - email: Achim.Zeileis@R-project.org -abstract: > - This article remembers our friend and colleague Fritz Leisch (1968--2024) who - sadly died earlier this year. Many of the readers of The R Journal will know - Fritz as a member of the R Core Team and for many of his contributions to the - R community. For us, the co-authors of this article, he was an important - companion on our journey with the R project and other scientific endeavours - over the years. 
In the following, we provide a brief synopsis of his career, - present his key contributions to the R project and to the scientific community - more generally, acknowledge his academic service, and highlight his teaching - and mentoring achievements. -preamble: | - \newcommand{\doi}[1]{\href{https://doi.org/#1}{\normalfont\texttt{doi:\discretionary{}{}{}{#1}}}} -bibliography: fritz.bib -output: - rjtools::rjournal_article: - self_contained: yes ---- - -# Career - -Friedrich Leisch (see Figure \@ref(fig:leisch)) was born 1968 in Vienna (Austria) and +% !TeX root = RJwrapper.tex +\title{Remembering Friedrich ``Fritz'' Leisch} + + +\author{by Bettina Grün, Kurt Hornik, Torsten Hothorn, Theresa Scharl, and Achim Zeileis} + +\maketitle + +\abstract{% +This article remembers our friend and colleague Fritz Leisch (1968--2024) who sadly died earlier this year. Many of the readers of The R Journal will know Fritz as a member of the R Core Team and for many of his contributions to the R community. For us, the co-authors of this article, he was an important companion on our journey with the R project and other scientific endeavours over the years. In the following, we provide a brief synopsis of his career, present his key contributions to the R project and to the scientific community more generally, acknowledge his academic service, and highlight his teaching and mentoring achievements. +} + +\section{Career}\label{career} + +Friedrich Leisch (see Figure~\ref{fig:leisch}) was born 1968 in Vienna (Austria) and died after serious illness in 2024 in Vienna. Everyone called him Fritz. \begin{figure}[h!] -{\centering \includegraphics[width=0.55\linewidth]{fritz_files/figure-latex/leisch-1} +{\centering \includegraphics[width=0.55\linewidth]{figures/img-leisch} } -\caption{Fritz Leisch at his inaugural lecture at BOKU in 2011. Source: BOKU.}(\#fig:leisch) +\caption{Fritz Leisch at his inaugural lecture at BOKU in 2011. 
Source: BOKU.}\label{fig:leisch} \end{figure} Starting in 1987, Fritz studied Applied Mathematics at Technische Universität Wien (TU Wien), @@ -71,49 +34,48 @@ In 1995, he visited the Knowledge-Based Engineering Systems Group at the University of South-Australia in Adelaide on a Kurt Gödel scholarship for postgraduate studies. From 1997 to 2004 he was a member of the SFB project -"Adaptive Information Systems and Modeling in Economics and Management Science", coordinated +``Adaptive Information Systems and Modeling in Economics and Management Science'', coordinated at Wirtschaftsuniversität Wien (WU Wien). From 2002 to 2003 he was assistant professor at the Department of Statistics and Decision Support Systems, Universität Wien. In 2006 Fritz moved to Munich, Germany, to become a professor for computational -statistics at the Department of Statistics, Ludwig-Maximilians-Universität München (LMU), see Figure \@ref(fig:lmu). -He returned to Vienna in 2011 to join the BOKU University as head of the Institute of Statistics, see Figure \@ref(fig:boku). +statistics at the Department of Statistics, Ludwig-Maximilians-Universität München (LMU), see Figure~\ref{fig:lmu}. +He returned to Vienna in 2011 to join the BOKU University as head of the Institute of Statistics, see Figure~\ref{fig:boku}. \begin{figure}[t!] -{\centering \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/lmu-1} +{\centering \includegraphics[width=0.83\linewidth]{figures/img-lmu} } -\caption{Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl.}(\#fig:lmu) +\caption{Computational statistics group at LMU in 2007 (left to right): Sebastian Kaiser, Adrian Duffner, Manuel Eugster, Fritz Leisch. Source: Carolin Strobl.}\label{fig:lmu} \end{figure} \begin{figure}[t!] 
-{\centering \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/boku-1} +{\centering \includegraphics[width=0.83\linewidth]{figures/img-boku} } -\caption{Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU.}(\#fig:boku) +\caption{Institute of Statistics at BOKU in 2022 (left to right, back to front): Johannes Laimighofer, Nur Banu Özcelik, Ursula Laa, Fritz Leisch, Bernhard Spangl, Gregor Laaha, Matthias Medl. Robert Wiedermann, Lena Ortega Menjivar, Theresa Scharl, Melati Avedis. Source: BOKU.}\label{fig:boku} \end{figure} +\section{Key contributions}\label{key-contributions} -# Key contributions - -Fritz' scientific contributions span an impressive range including -theoretical and methodological work (especially in the field of clustering -and finite mixture models) over software (mostly related to the R -programming language) to applied work and cooperations (notably in -marketing, biotechnology, and genomics, among many others). In the -following sections we try to highlight his key contributions and +Fritz' scientific contributions span an impressive range including +theoretical and methodological work (especially in the field of clustering +and finite mixture models) over software (mostly related to the R +programming language) to applied work and cooperations (notably in +marketing, biotechnology, and genomics, among many others). In the +following sections we try to highlight his key contributions and scientific legacy. -## R Core & CRAN +\subsection{R Core \& CRAN}\label{r-core-cran} During his stay in Australia, Fritz had learned about the existence of R. Back in Austria, he and Kurt started to explore this potentially good news more systematically. 
They soon stopped further work on a -statistics toolbox they had developed for Octave [@Eaton+Bateman+Hauberg:2024], +statistics toolbox they had developed for Octave \citep{Eaton+Bateman+Hauberg:2024}, and switched to R for their applied work, finding lots of room for further improvement, and thus sending polite emails with patches and more suggestions to Ross Ihaka and Robert Gentleman. Clearly these were @@ -124,68 +86,67 @@ One of the main challenges then was that the functionality provided by R was rather limited. Contributed extensions for S were available from the -Carnegie Mellon University Statlib S Archive^[Unfortunately, the Statlib -S Archive is currently not available anymore. A snapshot, including many -of the actual source code files, is available on the Internet Archive at -.], and could typically be +Carnegie Mellon University Statlib S Archive\footnote{Unfortunately, the Statlib + S Archive is currently not available anymore. A snapshot, including many + of the actual source code files, is available on the Internet Archive at + \url{https://web.archive.org/web/20000815063825/http://lib.stat.cmu.edu/S/}.}, and could typically be ported to R rather easily, but there was no mechanism for conveniently distributing or actually using these extensions. This fundamentally changed, when in 1997 Fritz and Kurt implemented the R package management system, using ideas from Debian's APT (advanced package tool, -) they had successfully employed for +\url{https://wiki.debian.org/AptCLI}) they had successfully employed for managing their computer systems. They also set up the Comprehensive R -Archive Network [CRAN, , see also -@Hornik:2012] as a means for redistributing R and its contributed +Archive Network \citep[CRAN, \url{https://CRAN.R-project.org/}, see also][]{Hornik:2012} as a means for redistributing R and its contributed extensions, and infrastructure for quality assurance of these -extensions. 
These two contributions paved the way for the amazing +extensions. These two contributions paved the way for the amazing growth and success of R through its wealth of high-quality contributed extensions. -See for +See \url{https://stat.ethz.ch/pipermail/r-announce/1997/000001.html} for the first announcement of CRAN, starting with 12 extension packages. -Currently, there are more than 21,000. See Figure \@ref(fig:cran) -for a screenshot^[This is from the earliest capture, from 1998-01-10, -available on the Internet Archive at -.] +Currently, there are more than 21,000. See Figure~\ref{fig:cran} +for a screenshot\footnote{This is from the earliest capture, from 1998-01-10, + available on the Internet Archive at + \url{https://web.archive.org/web/19980110082558/http://www.ci.tuwien.ac.at/R/contents.html}.} of the landing page of the CRAN master site at TU Wien, as last modified by Fritz on 1997-12-09. \begin{figure}[t!] -{\centering \includegraphics[width=1\linewidth]{fritz_files/figure-latex/cran-1} +{\centering \includegraphics[width=1\linewidth]{figures/img-cran} } -\caption{Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive.}(\#fig:cran) +\caption{Screenshot of the landing page of the CRAN master site at TU Wien on 1998-01-10, as last modified by Fritz on 1997-12-09. Source: Internet Archive.}\label{fig:cran} \end{figure} The first SVN commit by Fritz is from 1997-10-02, the last from 2013-10-04. Overall, there are 651 commits by Fritz, mostly from the early years of R Core, and related to the R package management and CRAN -mirror system, and the addition of the `Sweave` system -(see Section [2.3](#sec:sweave-reproducibility) for more details). +mirror system, and the addition of the \texttt{Sweave} system +(see Section~\hyperref[sec:sweave-reproducibility]{2.3} for more details). -## DSC & useR! conferences +\subsection{DSC \& useR! 
conferences}\label{dsc-user-conferences} With establishing CRAN in Vienna at TU Wien, Fritz and Kurt laid the foundation for a special relationship between Vienna and R that they -characterized as a story of "love and marriage" [@Hornik+Leisch:2002]. In the decade +characterized as a story of ``love and marriage'' \citep{Hornik+Leisch:2002}. In the decade after the creation of CRAN a number of seminal R-related meetings took place in Vienna, co-organized by Fritz as well as several of the co-authors of this paper. -The first workshop on "Distributed Statistical Computing" (DSC) took place from +The first workshop on ``Distributed Statistical Computing'' (DSC) took place from March 19-23, 1999, at TU Wien. The main motivations were bringing together the R Core Team for its first face-to-face meeting, discussing the roadmap for the release of R 1.0.0, as well as exploring potential synergies with other environments for statistical computing. There were around 30 participants and about 20 presentations, many of which were -relatively short, leaving ample time for discussions (see Figure \@ref(fig:dsc1999)). +relatively short, leaving ample time for discussions (see Figure~\ref{fig:dsc1999}). \begin{figure}[p!] -{\centering \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/dsc1999-1} \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/dsc1999-2} \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/dsc1999-3} +{\centering \includegraphics[width=0.83\linewidth]{figures/img-dsc1999a} \includegraphics[width=0.83\linewidth]{figures/img-dsc1999b} \includegraphics[width=0.83\linewidth]{figures/img-dsc1999c} } -\caption{Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. 
Source: Douglas Bates (DSC 1999 homepage).}(\#fig:dsc1999) +\caption{Discussions at DSC 1999 (top to bottom, left to right): Thomas Lumley, Fritz Leisch, Luke Tierney. Peter Dalgaard, Ross Ihaka, Paul Murrell. Brian Ripley, Martin Mächler, Robert Gentleman, Kurt Hornik. Source: Douglas Bates (DSC 1999 homepage).}\label{fig:dsc1999} \end{figure} Two more DSC workshops were organized at TU Wien @@ -195,9 +156,9 @@ presentations on R packages and their different fields of application (e.g., establishing infrastructure for spatial data). In 2001 there were around 60 participants and about 30 presentations, most with corresponding -papers in the online proceedings [@Hornik+Leisch:2001]. In 2003 this +papers in the online proceedings \citep{Hornik+Leisch:2001}. In 2003 this increased to more than 150 participants and about 60 presentations, again -with the majority in the online proceedings [@Hornik+Leisch+Zeileis:2003]. +with the majority in the online proceedings \citep{Hornik+Leisch+Zeileis:2003}. The high demand for a platform, where R users from different fields could exchange ideas, prompted the creation of a new conference series called @@ -215,93 +176,91 @@ In addition to the efforts initiated by Fritz and Kurt, another key factor to the success of these meetings was the city of Vienna with its culture, -cafes, wine and beer pubs, etc. [see @Hornik+Leisch:2002 and also -Figure \@ref(fig:user2006)]. +cafes, wine and beer pubs, etc. \citep[see][ and also +Figure~\ref{fig:user2006}]{Hornik+Leisch:2002}. \begin{figure}[t!] -{\centering \includegraphics[width=0.83\linewidth]{fritz_files/figure-latex/user2006-1} +{\centering \includegraphics[width=0.83\linewidth]{figures/img-user2006} } -\caption{Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. Source: Carolin Strobl (useR! 2006 homepage).}(\#fig:user2006) +\caption{Conference dinner at useR! 2006 (left to right): Fritz Leisch, Torsten Hothorn, Tim Hesterberg. 
Source: Carolin Strobl (useR! 2006 homepage).}\label{fig:user2006} \end{figure} +\subsection{Sweave \& reproducibility}\label{sweave-reproducibility} -## Sweave & reproducibility - -With `Sweave` [@Leisch:2002], Fritz pioneered what we now can understand as -the technical foundation of reproducible research. `Sweave` was the main -inspiration for \CRANpkg{knitr} [@Xie:2015] which in turn led to -\CRANpkg{rmarkdown} [@Xie+Allaire+Grolemund:2018] and \CRANpkg{quarto} -[@Scheidegger+Teague+Dervieux:2024]. All these systems are used today to +With \texttt{Sweave} \citep{Leisch:2002}, Fritz pioneered what we now can understand as +the technical foundation of reproducible research. \texttt{Sweave} was the main +inspiration for \CRANpkg{knitr} \citep{Xie:2015} which in turn led to +\CRANpkg{rmarkdown} \citep{Xie+Allaire+Grolemund:2018} and \CRANpkg{quarto} +\citep{Scheidegger+Teague+Dervieux:2024}. All these systems are used today to generate countless scientific articles, package vignettes, webpages, books, blogs, and much more in a dynamic and reproducible way. Of course, Fritz was not the first one going in this direction. The concept -of "literate programming" had been introduced by @Knuth:1984, allowing to +of ``literate programming'' had been introduced by \citet{Knuth:1984}, allowing to combine the source code for software and the corresponding documentation -in the same file. The concepts of "tangling", that is, extracting the code -for compilation, and "weaving", the process of generating a nicely looking +in the same file. The concepts of ``tangling'', that is, extracting the code +for compilation, and ``weaving'', the process of generating a nicely looking document containing code next to prosa and formulae, have their roots in the -`WEB` and `CWEB` systems [@Knuth+Levy:1993]. 
As these packages were specific -to code in Pascal (`WEB`) and C (`CWEB`), respectively, and documentation in -LaTeX, @Ramsey:1994 introduced his `noweb` system as a literate programming +\texttt{WEB} and \texttt{CWEB} systems \citep{Knuth+Levy:1993}. As these packages were specific +to code in Pascal (\texttt{WEB}) and C (\texttt{CWEB}), respectively, and documentation in +LaTeX, \citet{Ramsey:1994} introduced his \texttt{noweb} system as a literate programming tool that is agnostic to the programming language used and also supports HTML -in addition to LaTeX and a few other backends for documentation. The `noweb` +in addition to LaTeX and a few other backends for documentation. The \texttt{noweb} syntax for code chunks is: \pagebreak -``` +\begin{verbatim} <>= 1 + 2 @ -``` +\end{verbatim} -This will look familiar to users of `Sweave`. From this history, the naming -decisions for the software and its file format can be understood: `Sweave` +This will look familiar to users of \texttt{Sweave}. From this history, the naming +decisions for the software and its file format can be understood: \texttt{Sweave} is the function that weaves code in S (or R - both languages still existed -side by side at the time) with its output and documentation. And `Rnw` stands for files -mixing R code with `noweb` syntax. +side by side at the time) with its output and documentation. And \texttt{Rnw} stands for files +mixing R code with \texttt{noweb} syntax. Starting in the mid-1990s to the early 2000s, interests shifted from just -"literate programming" to "literate data analysis" [@Leisch:2002; @Leisch+Rossini:2003] -as a core ingredient for reproducible research [@Buckheit+Donoho:1995]. -The seminal new idea was to have dynamic documents so _outputs_ of code +``literate programming'' to ``literate data analysis'' \citep{Leisch:2002, Leisch+Rossini:2003} +as a core ingredient for reproducible research \citep{Buckheit+Donoho:1995}. 
+The seminal new idea was to have dynamic documents so \emph{outputs} of code such as figures and tables could be updated automatically when the underlying data changed, which was pioneered by the late Günter Sawitzki in his -`Voyager` system [@Sawitzki:1996]. +\texttt{Voyager} system \citep{Sawitzki:1996}. -Fritz amalgamated all of this into `Sweave` which was the first time that the +Fritz amalgamated all of this into \texttt{Sweave} which was the first time that the power of dynamic reporting became easily available in a widely-used programming language for statistics in combination with the standard textprocessing system -LaTeX. This turned out to be a "killer feature" of R at the time and the basis -for further work towards reproducible research [@Hothorn+Leisch_2011; @Stodden:2014]. +LaTeX. This turned out to be a ``killer feature'' of R at the time and the basis +for further work towards reproducible research \citep{Hothorn+Leisch_2011, Stodden:2014}. -`Sweave` was also the basis for R package vignettes [@Leisch:2003] as an +\texttt{Sweave} was also the basis for R package vignettes \citep{Leisch:2003} as an addition to the previously available technical manual pages. The first R package vignette published on CRAN in May 2002 was in the \CRANpkg{strucchange} package, providing methods for testing, monitoring, -and dating structural changes. The vignette was the `Sweave` adaptation +and dating structural changes. The vignette was the \texttt{Sweave} adaptation of an introduction to the package that had been co-authored by Fritz and -published a couple of months earlier in the _Journal of Statistical Software_ -[@Zeileis+Leisch+Hornik:2002]. See Figure \@ref(fig:sweave) for how -Fritz used it to illustrate the idea of package vignettes in @Leisch:2003 +published a couple of months earlier in the \emph{Journal of Statistical Software} +\citep{Zeileis+Leisch+Hornik:2002}. 
See Figure~\ref{fig:sweave} for how +Fritz used it to illustrate the idea of package vignettes in \citet{Leisch:2003} and that the R code from vignettes can be easily extracted (also interactively), explored, and re-run. \begin{figure}[t!] -{\centering \includegraphics[width=1\linewidth]{fritz_files/figure-latex/sweave-1} +{\centering \includegraphics[width=1\linewidth]{figures/img-sweave} } -\caption{Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2).}(\#fig:sweave) +\caption{Screenshot of the strucchange package vignette, shown in a PDF viewer (right), along with the vExplorer from Bioconductor for interactive code execution (top left) with output in the active R graphics window (bottom left). Source: Leisch (2003, Figure 2).}\label{fig:sweave} \end{figure} - -## Clustering & mixture models +\subsection{Clustering \& mixture models}\label{clustering-mixture-models} Fritz' theoretical and methodological work focused in particular on clustering and finite mixture models. Centroid-based partitioning @@ -310,23 +269,22 @@ framework, each of the steps is adapted in a modular way depending on the specific setup, e.g., the distance and centroid determining method or the component distribution used. Fritz exploited this for the -implementation of the packages \CRANpkg{flexclust} [@Leisch:2006] and -\CRANpkg{flexmix} [@Leisch:2004; @Gruen+Leisch:2008], contributing to +implementation of the packages \CRANpkg{flexclust} \citep{Leisch:2006} and +\CRANpkg{flexmix} \citep{Leisch:2004, Gruen+Leisch:2008}, contributing to the clustering tools available for R (see the CRAN Task View \ctv{Cluster}). 
Both packages provide general infrastructure for (model-based) clustering and enable rapid prototyping and the simple extension to new variants taking into account complicated data -structures or challenging model specifications [see, for example, -\pkg{psychomix}, @Frick+Strobl+Leisch:2012]. - +structures or challenging model specifications \citep[see, for example, +\pkg{psychomix},][]{Frick+Strobl+Leisch:2012}. -## Applied work +\subsection{Applied work}\label{applied-work} For many years, Fritz and Kurt actively participated in the Biological Psychiatry working group at Medizinische Universität Wien. The first paper co-authored by Fritz dates from 2000 -[@Bailer+Leisch+Meszaros:2000], the last from 2023 -[@Solmi+Thompson:2023]. The joint research was mostly +\citep{Bailer+Leisch+Meszaros:2000}, the last from 2023 +\citep{Solmi+Thompson:2023}. The joint research was mostly focused on linking genetic traits to psychiatric disorders and treatment success. This prompted many enhancements in the classical test infrastructure in base R - in surprising ways to some reviewers, who @@ -335,63 +293,59 @@ conveniently reporting the results of the statistical analyses to the medical doctors in the group that went beyond providing annotated transcripts, which Fritz eventually managed to satisfy by inventing the -`Sweave` system (see Section [2.3](#sec:sweave-reproducibility)). +\texttt{Sweave} system (see Section~\hyperref[sec:sweave-reproducibility]{2.3}). Fritz also intensively collaborated with Sara Dolnicar to advance data analytic methods for data-driven market segmentation analysis. They received the Charles R. Goeldner Article of Excellence Award for their work on extracting stable Winter tourist segments in Austria with -bagged clustering [@Dolnicar+Leisch:2003]. They focused on the +bagged clustering \citep{Dolnicar+Leisch:2003}. 
They focused on the evaluation of data structure and the selection of suitable segments -based on segment stability as a key criterion [@Dolnicar+Leisch:2010; -@Dolnicar+Leisch:2017]. Finally, this joint work resulted in -@Dolnicar+Gruen+Leisch:2018 which provides practical guidance for +based on segment stability as a key criterion \citep{Dolnicar+Leisch:2010, Dolnicar+Leisch:2017}. Finally, this joint work resulted in +\citet{Dolnicar+Gruen+Leisch:2018} which provides practical guidance for users of market segmentation solutions and for data analysts with respect to the technical and statistical aspects of market segmentation analysis. -As head of the Institute of Statistics, Fritz was involved -in various interdisciplinary research projects covering almost the whole -range of core areas of research at BOKU. He was key researcher at the +As head of the Institute of Statistics, Fritz was involved +in various interdisciplinary research projects covering almost the whole +range of core areas of research at BOKU. He was key researcher at the Austrian Centre of Industrial Biotechnology (acib) -[@Scharl+Voglhuber+Leisch:2009; @Melcher+Scharl+Leisch:2017] and -faculty member of the doctoral schools on agricultural genomics and -bioprocess engineering. Among others he contributed to the fields of -zoology [@Cech:2022], forestry, transportation and tourism -[@Taczanowska:2023] as well as chemistry, genomics and wildlife -biology [@Steiner:2014]. - +\citep{Scharl+Voglhuber+Leisch:2009, Melcher+Scharl+Leisch:2017} and +faculty member of the doctoral schools on agricultural genomics and +bioprocess engineering. Among others he contributed to the fields of +zoology \citep{Cech:2022}, forestry, transportation and tourism +\citep{Taczanowska:2023} as well as chemistry, genomics and wildlife +biology \citep{Steiner:2014}. 
-# Academic service +\section{Academic service}\label{academic-service} In addition to the services for the various conferences and proceedings already described above, he served the scientific community in various ways. -In January 2001, he co-created _R News_ which evolved into -_The R Journal_ eight years later. For the journal _Computational Statistics_ +In January 2001, he co-created \emph{R News} which evolved into +\emph{The R Journal} eight years later. For the journal \emph{Computational Statistics} he was an associate editor from 2005 to 2006 before he became editor-in-chief -from 2007 to 2011 [see @Symanzik+Mori+Vieu:2024 for more details]. -Other notable contributions include being -editor for the _Journal of Statistical Software_, core member of the -_Bioconductor_ project for statistical software in bioinformatics, and -first secretary general of the _R Foundation for Statistical Computing_ when -it was formed in 2002. +from 2007 to 2011 \citep[see][ for more details]{Symanzik+Mori+Vieu:2024}. +Other notable contributions include being +editor for the \emph{Journal of Statistical Software}, core member of the +\emph{Bioconductor} project for statistical software in bioinformatics, and +first secretary general of the \emph{R Foundation for Statistical Computing} when +it was formed in 2002. - - -# Teaching & mentoring +\section{Teaching \& mentoring}\label{teaching-mentoring} Fritz taught generations of students at bachelor, master, and PhD level and -introduced hundreds of useRs to proper R development in his "Introduction to -R Programming" short course. At TU Wien, LMU, and BOKU, he taught courses in applied -statistics, statistical computing and computational statistics. He had the -ability to explain even difficult content in a simple way and to inspire students +introduced hundreds of useRs to proper R development in his ``Introduction to +R Programming'' short course. 
At TU Wien, LMU, and BOKU, he taught courses in applied +statistics, statistical computing and computational statistics. He had the +ability to explain even difficult content in a simple way and to inspire students with statistics and programming with R. He -co-founded the "Munich R Courses" lecture series and was part of a group -aiming to initiate a formal PhD program in statistics at LMU. +co-founded the ``Munich R Courses'' lecture series and was part of a group +aiming to initiate a formal PhD program in statistics at LMU. Fritz supervised -Bettina Grün, Theresa Scharl, -Sebastian Kaiser, Manuel Eugster, +Bettina Grün, Theresa Scharl, +Sebastian Kaiser, Manuel Eugster, Christina Yassouridis, Rainer Dangl, Weksi Budiaji, Muhammad Atif and Simona Jokubauskaite as his PhD students. @@ -400,19 +354,68 @@ innocent errors that have a tendency to pile up and invalidate reported statistical results, with potentially devastating consequences, as we all know. -# Odds & ends +\section{Odds \& ends}\label{odds-ends} Fritz loved cooking, music, motorbike riding, playing cards with his friends, skiing and hiking. A late afternoon call to his office asking him to go along for a beer in Munich's English Garden almost never went -unanswered, positively. Back in Vienna at BOKU, colleagues got to know Fritz as a very -structured, thoughtful, calm person who involved everyone, listened to +unanswered, positively. Back in Vienna at BOKU, colleagues got to know Fritz as a very +structured, thoughtful, calm person who involved everyone, listened to everyone and always endeavored to balance interests and ensure fairness. -He strengthened cooperation and cohesion with his leadership style. -Fritz was a friendly, always modest person who was free of airs and graces or +He strengthened cooperation and cohesion with his leadership style. +Fritz was a friendly, always modest person who was free of airs and graces or vanity, despite or perhaps because of his great scientific successes. 
The R Core Team and the R community at large miss a contributor, collaborator, teacher, colleague, and friend. -# References +\bibliography{fritz.bib} + +\address{% +Bettina Grün\\ +WU Wirtschaftsuniversität Wien\\% +Austria\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0001-7265-4773}{0000-0001-7265-4773}}\\% +\href{mailto:Bettina.Gruen@wu.ac.at}{\nolinkurl{Bettina.Gruen@wu.ac.at}}% +} + +\address{% +Kurt Hornik\\ +WU Wirtschaftsuniversität Wien\\% +Austria\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0003-4198-9911}{0000-0003-4198-9911}}\\% +\href{mailto:Kurt.Hornik@R-project.org}{\nolinkurl{Kurt.Hornik@R-project.org}}% +} +\address{% +Torsten Hothorn\\ +Universität Zürich\\% +Switzerland\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0001-8301-0471}{0000-0001-8301-0471}}\\% +\href{mailto:Torsten.Hothorn@R-project.org}{\nolinkurl{Torsten.Hothorn@R-project.org}}% +} + +\address{% +Theresa Scharl\\ +BOKU University\\% +Austria\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0001-8850-3312}{0000-0001-8850-3312}}\\% +\href{mailto:Theresa.Scharl@boku.ac.at}{\nolinkurl{Theresa.Scharl@boku.ac.at}}% +} + +\address{% +Achim Zeileis\\ +Universität Innsbruck\\% +Austria\\ +% +\url{https://www.zeileis.org/}\\% +\textit{ORCiD: \href{https://orcid.org/0000-0003-0918-3766}{0000-0003-0918-3766}}\\% +\href{mailto:Achim.Zeileis@R-project.org}{\nolinkurl{Achim.Zeileis@R-project.org}}% +} diff --git a/_articles/RJ-2024-001/RJ-2024-001_files/anchor-4.2.2/anchor.min.js b/_articles/RJ-2024-001/RJ-2024-001_files/anchor-4.2.2/anchor.min.js deleted file mode 100644 index 26908ec13f..0000000000 --- a/_articles/RJ-2024-001/RJ-2024-001_files/anchor-4.2.2/anchor.min.js +++ /dev/null @@ -1,9 +0,0 @@ -// @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&dn=expat.txt Expat -// -// AnchorJS - v4.2.2 - 2019-11-14 -// https://www.bryanbraun.com/anchorjs/ -// Copyright (c) 2019 Bryan Braun; Licensed MIT -// -// @license 
magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&dn=expat.txt Expat -!function(A,e){"use strict";"function"==typeof define&&define.amd?define([],e):"object"==typeof module&&module.exports?module.exports=e():(A.AnchorJS=e(),A.anchors=new A.AnchorJS)}(this,function(){"use strict";return function(A){function f(A){A.icon=A.hasOwnProperty("icon")?A.icon:"",A.visible=A.hasOwnProperty("visible")?A.visible:"hover",A.placement=A.hasOwnProperty("placement")?A.placement:"right",A.ariaLabel=A.hasOwnProperty("ariaLabel")?A.ariaLabel:"Anchor",A.class=A.hasOwnProperty("class")?A.class:"",A.base=A.hasOwnProperty("base")?A.base:"",A.truncate=A.hasOwnProperty("truncate")?Math.floor(A.truncate):64,A.titleText=A.hasOwnProperty("titleText")?A.titleText:""}function p(A){var e;if("string"==typeof A||A instanceof String)e=[].slice.call(document.querySelectorAll(A));else{if(!(Array.isArray(A)||A instanceof NodeList))throw new Error("The selector provided to AnchorJS was invalid.");e=[].slice.call(A)}return e}this.options=A||{},this.elements=[],f(this.options),this.isTouchDevice=function(){return!!("ontouchstart"in window||window.DocumentTouch&&document instanceof DocumentTouch)},this.add=function(A){var e,t,i,n,o,s,a,r,c,h,l,u,d=[];if(f(this.options),"touch"===(l=this.options.visible)&&(l=this.isTouchDevice()?"always":"hover"),0===(e=p(A=A||"h2, h3, h4, h5, h6")).length)return this;for(!function(){if(null!==document.head.querySelector("style.anchorjs"))return;var A,e=document.createElement("style");e.className="anchorjs",e.appendChild(document.createTextNode("")),void 0===(A=document.head.querySelector('[rel="stylesheet"], style'))?document.head.appendChild(e):document.head.insertBefore(e,A);e.sheet.insertRule(" .anchorjs-link { opacity: 0; text-decoration: none; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; }",e.sheet.cssRules.length),e.sheet.insertRule(" *:hover > .anchorjs-link, .anchorjs-link:focus { opacity: 1; 
}",e.sheet.cssRules.length),e.sheet.insertRule(" [data-anchorjs-icon]::after { content: attr(data-anchorjs-icon); }",e.sheet.cssRules.length),e.sheet.insertRule(' @font-face { font-family: "anchorjs-icons"; src: url(data:n/a;base64,AAEAAAALAIAAAwAwT1MvMg8yG2cAAAE4AAAAYGNtYXDp3gC3AAABpAAAAExnYXNwAAAAEAAAA9wAAAAIZ2x5ZlQCcfwAAAH4AAABCGhlYWQHFvHyAAAAvAAAADZoaGVhBnACFwAAAPQAAAAkaG10eASAADEAAAGYAAAADGxvY2EACACEAAAB8AAAAAhtYXhwAAYAVwAAARgAAAAgbmFtZQGOH9cAAAMAAAAAunBvc3QAAwAAAAADvAAAACAAAQAAAAEAAHzE2p9fDzz1AAkEAAAAAADRecUWAAAAANQA6R8AAAAAAoACwAAAAAgAAgAAAAAAAAABAAADwP/AAAACgAAA/9MCrQABAAAAAAAAAAAAAAAAAAAAAwABAAAAAwBVAAIAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAMCQAGQAAUAAAKZAswAAACPApkCzAAAAesAMwEJAAAAAAAAAAAAAAAAAAAAARAAAAAAAAAAAAAAAAAAAAAAQAAg//0DwP/AAEADwABAAAAAAQAAAAAAAAAAAAAAIAAAAAAAAAIAAAACgAAxAAAAAwAAAAMAAAAcAAEAAwAAABwAAwABAAAAHAAEADAAAAAIAAgAAgAAACDpy//9//8AAAAg6cv//f///+EWNwADAAEAAAAAAAAAAAAAAAAACACEAAEAAAAAAAAAAAAAAAAxAAACAAQARAKAAsAAKwBUAAABIiYnJjQ3NzY2MzIWFxYUBwcGIicmNDc3NjQnJiYjIgYHBwYUFxYUBwYGIwciJicmNDc3NjIXFhQHBwYUFxYWMzI2Nzc2NCcmNDc2MhcWFAcHBgYjARQGDAUtLXoWOR8fORYtLTgKGwoKCjgaGg0gEhIgDXoaGgkJBQwHdR85Fi0tOAobCgoKOBoaDSASEiANehoaCQkKGwotLXoWOR8BMwUFLYEuehYXFxYugC44CQkKGwo4GkoaDQ0NDXoaShoKGwoFBe8XFi6ALjgJCQobCjgaShoNDQ0NehpKGgobCgoKLYEuehYXAAAADACWAAEAAAAAAAEACAAAAAEAAAAAAAIAAwAIAAEAAAAAAAMACAAAAAEAAAAAAAQACAAAAAEAAAAAAAUAAQALAAEAAAAAAAYACAAAAAMAAQQJAAEAEAAMAAMAAQQJAAIABgAcAAMAAQQJAAMAEAAMAAMAAQQJAAQAEAAMAAMAAQQJAAUAAgAiAAMAAQQJAAYAEAAMYW5jaG9yanM0MDBAAGEAbgBjAGgAbwByAGoAcwA0ADAAMABAAAAAAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAH//wAP) format("truetype"); }',e.sheet.cssRules.length)}(),t=document.querySelectorAll("[id]"),i=[].map.call(t,function(A){return A.id}),o=0;o\]\.\/\(\)\*\\\n\t\b\v]/g,"-").replace(/-{2,}/g,"-").substring(0,this.options.truncate).replace(/^-+|-+$/gm,"").toLowerCase()},this.hasAnchorJSLink=function(A){var e=A.firstChild&&-1<(" "+A.firstChild.className+" ").indexOf(" anchorjs-link "),t=A.lastChild&&-1<(" "+A.lastChild.className+" ").indexOf(" 
anchorjs-link ");return e||t||!1}}}); -// @license-end \ No newline at end of file diff --git a/_articles/RJ-2024-001/RJ-2024-001_files/bowser-1.9.3/bowser.min.js b/_articles/RJ-2024-001/RJ-2024-001_files/bowser-1.9.3/bowser.min.js deleted file mode 100644 index 5866337b8d..0000000000 --- a/_articles/RJ-2024-001/RJ-2024-001_files/bowser-1.9.3/bowser.min.js +++ /dev/null @@ -1,6 +0,0 @@ -/*! - * Bowser - a browser detector - * https://github.com/ded/bowser - * MIT License | (c) Dustin Diaz 2015 - */ -!function(e,t,n){typeof module!="undefined"&&module.exports?module.exports=n():typeof define=="function"&&define.amd?define(t,n):e[t]=n()}(this,"bowser",function(){function t(t){function n(e){var n=t.match(e);return n&&n.length>1&&n[1]||""}function r(e){var n=t.match(e);return n&&n.length>1&&n[2]||""}function N(e){switch(e){case"NT":return"NT";case"XP":return"XP";case"NT 5.0":return"2000";case"NT 5.1":return"XP";case"NT 5.2":return"2003";case"NT 6.0":return"Vista";case"NT 6.1":return"7";case"NT 6.2":return"8";case"NT 6.3":return"8.1";case"NT 10.0":return"10";default:return undefined}}var i=n(/(ipod|iphone|ipad)/i).toLowerCase(),s=/like android/i.test(t),o=!s&&/android/i.test(t),u=/nexus\s*[0-6]\s*/i.test(t),a=!u&&/nexus\s*[0-9]+/i.test(t),f=/CrOS/.test(t),l=/silk/i.test(t),c=/sailfish/i.test(t),h=/tizen/i.test(t),p=/(web|hpw)os/i.test(t),d=/windows phone/i.test(t),v=/SamsungBrowser/i.test(t),m=!d&&/windows/i.test(t),g=!i&&!l&&/macintosh/i.test(t),y=!o&&!c&&!h&&!p&&/linux/i.test(t),b=r(/edg([ea]|ios)\/(\d+(\.\d+)?)/i),w=n(/version\/(\d+(\.\d+)?)/i),E=/tablet/i.test(t)&&!/tablet pc/i.test(t),S=!E&&/[^-]mobi/i.test(t),x=/xbox/i.test(t),T;/opera/i.test(t)?T={name:"Opera",opera:e,version:w||n(/(?:opera|opr|opios)[\s\/](\d+(\.\d+)?)/i)}:/opr\/|opios/i.test(t)?T={name:"Opera",opera:e,version:n(/(?:opr|opios)[\s\/](\d+(\.\d+)?)/i)||w}:/SamsungBrowser/i.test(t)?T={name:"Samsung Internet for 
Android",samsungBrowser:e,version:w||n(/(?:SamsungBrowser)[\s\/](\d+(\.\d+)?)/i)}:/coast/i.test(t)?T={name:"Opera Coast",coast:e,version:w||n(/(?:coast)[\s\/](\d+(\.\d+)?)/i)}:/yabrowser/i.test(t)?T={name:"Yandex Browser",yandexbrowser:e,version:w||n(/(?:yabrowser)[\s\/](\d+(\.\d+)?)/i)}:/ucbrowser/i.test(t)?T={name:"UC Browser",ucbrowser:e,version:n(/(?:ucbrowser)[\s\/](\d+(?:\.\d+)+)/i)}:/mxios/i.test(t)?T={name:"Maxthon",maxthon:e,version:n(/(?:mxios)[\s\/](\d+(?:\.\d+)+)/i)}:/epiphany/i.test(t)?T={name:"Epiphany",epiphany:e,version:n(/(?:epiphany)[\s\/](\d+(?:\.\d+)+)/i)}:/puffin/i.test(t)?T={name:"Puffin",puffin:e,version:n(/(?:puffin)[\s\/](\d+(?:\.\d+)?)/i)}:/sleipnir/i.test(t)?T={name:"Sleipnir",sleipnir:e,version:n(/(?:sleipnir)[\s\/](\d+(?:\.\d+)+)/i)}:/k-meleon/i.test(t)?T={name:"K-Meleon",kMeleon:e,version:n(/(?:k-meleon)[\s\/](\d+(?:\.\d+)+)/i)}:d?(T={name:"Windows Phone",osname:"Windows Phone",windowsphone:e},b?(T.msedge=e,T.version=b):(T.msie=e,T.version=n(/iemobile\/(\d+(\.\d+)?)/i))):/msie|trident/i.test(t)?T={name:"Internet Explorer",msie:e,version:n(/(?:msie |rv:)(\d+(\.\d+)?)/i)}:f?T={name:"Chrome",osname:"Chrome OS",chromeos:e,chromeBook:e,chrome:e,version:n(/(?:chrome|crios|crmo)\/(\d+(\.\d+)?)/i)}:/edg([ea]|ios)/i.test(t)?T={name:"Microsoft Edge",msedge:e,version:b}:/vivaldi/i.test(t)?T={name:"Vivaldi",vivaldi:e,version:n(/vivaldi\/(\d+(\.\d+)?)/i)||w}:c?T={name:"Sailfish",osname:"Sailfish OS",sailfish:e,version:n(/sailfish\s?browser\/(\d+(\.\d+)?)/i)}:/seamonkey\//i.test(t)?T={name:"SeaMonkey",seamonkey:e,version:n(/seamonkey\/(\d+(\.\d+)?)/i)}:/firefox|iceweasel|fxios/i.test(t)?(T={name:"Firefox",firefox:e,version:n(/(?:firefox|iceweasel|fxios)[ \/](\d+(\.\d+)?)/i)},/\((mobile|tablet);[^\)]*rv:[\d\.]+\)/i.test(t)&&(T.firefoxos=e,T.osname="Firefox OS")):l?T={name:"Amazon 
Silk",silk:e,version:n(/silk\/(\d+(\.\d+)?)/i)}:/phantom/i.test(t)?T={name:"PhantomJS",phantom:e,version:n(/phantomjs\/(\d+(\.\d+)?)/i)}:/slimerjs/i.test(t)?T={name:"SlimerJS",slimer:e,version:n(/slimerjs\/(\d+(\.\d+)?)/i)}:/blackberry|\bbb\d+/i.test(t)||/rim\stablet/i.test(t)?T={name:"BlackBerry",osname:"BlackBerry OS",blackberry:e,version:w||n(/blackberry[\d]+\/(\d+(\.\d+)?)/i)}:p?(T={name:"WebOS",osname:"WebOS",webos:e,version:w||n(/w(?:eb)?osbrowser\/(\d+(\.\d+)?)/i)},/touchpad\//i.test(t)&&(T.touchpad=e)):/bada/i.test(t)?T={name:"Bada",osname:"Bada",bada:e,version:n(/dolfin\/(\d+(\.\d+)?)/i)}:h?T={name:"Tizen",osname:"Tizen",tizen:e,version:n(/(?:tizen\s?)?browser\/(\d+(\.\d+)?)/i)||w}:/qupzilla/i.test(t)?T={name:"QupZilla",qupzilla:e,version:n(/(?:qupzilla)[\s\/](\d+(?:\.\d+)+)/i)||w}:/chromium/i.test(t)?T={name:"Chromium",chromium:e,version:n(/(?:chromium)[\s\/](\d+(?:\.\d+)?)/i)||w}:/chrome|crios|crmo/i.test(t)?T={name:"Chrome",chrome:e,version:n(/(?:chrome|crios|crmo)\/(\d+(\.\d+)?)/i)}:o?T={name:"Android",version:w}:/safari|applewebkit/i.test(t)?(T={name:"Safari",safari:e},w&&(T.version=w)):i?(T={name:i=="iphone"?"iPhone":i=="ipad"?"iPad":"iPod"},w&&(T.version=w)):/googlebot/i.test(t)?T={name:"Googlebot",googlebot:e,version:n(/googlebot\/(\d+(\.\d+))/i)||w}:T={name:n(/^(.*)\/(.*) /),version:r(/^(.*)\/(.*) /)},!T.msedge&&/(apple)?webkit/i.test(t)?(/(apple)?webkit\/537\.36/i.test(t)?(T.name=T.name||"Blink",T.blink=e):(T.name=T.name||"Webkit",T.webkit=e),!T.version&&w&&(T.version=w)):!T.opera&&/gecko\//i.test(t)&&(T.name=T.name||"Gecko",T.gecko=e,T.version=T.version||n(/gecko\/(\d+(\.\d+)?)/i)),!T.windowsphone&&(o||T.silk)?(T.android=e,T.osname="Android"):!T.windowsphone&&i?(T[i]=e,T.ios=e,T.osname="iOS"):g?(T.mac=e,T.osname="macOS"):x?(T.xbox=e,T.osname="Xbox"):m?(T.windows=e,T.osname="Windows"):y&&(T.linux=e,T.osname="Linux");var C="";T.windows?C=N(n(/Windows ((NT|XP)( \d\d?.\d)?)/i)):T.windowsphone?C=n(/windows phone 
(?:os)?\s?(\d+(\.\d+)*)/i):T.mac?(C=n(/Mac OS X (\d+([_\.\s]\d+)*)/i),C=C.replace(/[_\s]/g,".")):i?(C=n(/os (\d+([_\s]\d+)*) like mac os x/i),C=C.replace(/[_\s]/g,".")):o?C=n(/android[ \/-](\d+(\.\d+)*)/i):T.webos?C=n(/(?:web|hpw)os\/(\d+(\.\d+)*)/i):T.blackberry?C=n(/rim\stablet\sos\s(\d+(\.\d+)*)/i):T.bada?C=n(/bada\/(\d+(\.\d+)*)/i):T.tizen&&(C=n(/tizen[\/\s](\d+(\.\d+)*)/i)),C&&(T.osversion=C);var k=!T.windows&&C.split(".")[0];if(E||a||i=="ipad"||o&&(k==3||k>=4&&!S)||T.silk)T.tablet=e;else if(S||i=="iphone"||i=="ipod"||o||u||T.blackberry||T.webos||T.bada)T.mobile=e;return T.msedge||T.msie&&T.version>=10||T.yandexbrowser&&T.version>=15||T.vivaldi&&T.version>=1||T.chrome&&T.version>=20||T.samsungBrowser&&T.version>=4||T.firefox&&T.version>=20||T.safari&&T.version>=6||T.opera&&T.version>=10||T.ios&&T.osversion&&T.osversion.split(".")[0]>=6||T.blackberry&&T.version>=10.1||T.chromium&&T.version>=20?T.a=e:T.msie&&T.version<10||T.chrome&&T.version<20||T.firefox&&T.version<20||T.safari&&T.version<6||T.opera&&T.version<10||T.ios&&T.osversion&&T.osversion.split(".")[0]<6||T.chromium&&T.version<20?T.c=e:T.x=e,T}function r(e){return e.split(".").length}function i(e,t){var n=[],r;if(Array.prototype.map)return Array.prototype.map.call(e,t);for(r=0;r=0){if(n[0][t]>n[1][t])return 1;if(n[0][t]!==n[1][t])return-1;if(t===0)return 0}}function o(e,r,i){var o=n;typeof r=="string"&&(i=r,r=void 0),r===void 0&&(r=!1),i&&(o=t(i));var u=""+o.version;for(var a in e)if(e.hasOwnProperty(a)&&o[a]){if(typeof e[a]!="string")throw new Error("Browser version in the minVersion map should be a string: "+a+": "+String(e));return s([u,e[a]])<0}return r}function u(e,t,n){return!o(e,t,n)}var e=!0,n=t(typeof navigator!="undefined"?navigator.userAgent||"":"");return n.test=function(e){for(var t=0;tnew Qn(e)),e.katex=t.katex,e.password=t.password}function t(e=document){const t=new Set,n=e.querySelectorAll('d-cite');for(const i of n){const e=i.getAttribute('key').split(',');for(const n of 
e)t.add(n)}return[...t]}function n(e,t,n,i){if(null==e.author)return'';var a=e.author.split(' and ');let d=a.map((e)=>{if(e=e.trim(),e.match(/\{.+\}/)){var n=/\{([^}]+)\}/,i=n.exec(e);return i[1]}if(-1!=e.indexOf(','))var a=e.split(',')[0].trim(),d=e.split(',')[1];else var a=e.split(' ').slice(-1)[0].trim(),d=e.split(' ').slice(0,-1).join(' ');var r='';return void 0!=d&&(r=d.trim().split(' ').map((e)=>e.trim()[0]),r=r.join('.')+'.'),t.replace('${F}',d).replace('${L}',a).replace('${I}',r)});if(1[${i||'link'}]`}return''}function d(e,t){return'doi'in e?`${t?'
':''} DOI: ${e.doi}`:''}function r(e){return''+e.title+' '}function o(e){if(e){var t=r(e);return t+=a(e)+'
',e.author&&(t+=n(e,'${L}, ${I}',', ',' and '),(e.year||e.date)&&(t+=', ')),t+=e.year||e.date?(e.year||e.date)+'. ':'. ',t+=i(e),t+=d(e),t}return'?'}function l(e){if(e){var t='';t+=''+e.title+'',t+=a(e),t+='
';var r=n(e,'${I} ${L}',', ')+'.',o=i(e).trim()+' '+e.year+'. '+d(e,!0);return t+=(r+o).length'+o,t}return'?'}function s(e){for(let t of e.authors){const e=!!t.affiliation,n=!!t.affiliations;if(e)if(n)console.warn(`Author ${t.author} has both old-style ("affiliation" & "affiliationURL") and new style ("affiliations") affiliation information!`);else{let e={name:t.affiliation};t.affiliationURL&&(e.url=t.affiliationURL),t.affiliations=[e]}}return console.log(e),e}function c(e){const t=e.querySelector('script');if(t){const e=t.getAttribute('type');if('json'==e.split('/')[1]){const e=t.textContent,n=JSON.parse(e);return s(n)}console.error('Distill only supports JSON frontmatter tags anymore; no more YAML.')}else console.error('You added a frontmatter tag but did not provide a script tag with front matter data in it. Please take a look at our templates.');return{}}function u(){return-1!==['interactive','complete'].indexOf(document.readyState)}function p(e){const t='distill-prerendered-styles',n=e.getElementById(t);if(!n){const n=e.createElement('style');n.id=t,n.type='text/css';const i=e.createTextNode(bi);n.appendChild(i);const a=e.head.querySelector('script');e.head.insertBefore(n,a)}}function g(e,t){console.info('Runlevel 0: Polyfill required: '+e.name);const n=document.createElement('script');n.src=e.url,n.async=!1,t&&(n.onload=function(){t(e)}),n.onerror=function(){new Error('Runlevel 0: Polyfills failed to load script '+e.name)},document.head.appendChild(n)}function f(e,t){return t={exports:{}},e(t,t.exports),t.exports}function h(e){return e.replace(/[\t\n ]+/g,' ').replace(/{\\["^`.'acu~Hvs]( )?([a-zA-Z])}/g,(e,t,n)=>n).replace(/{\\([a-zA-Z])}/g,(e,t)=>t)}function b(e){const t=new Map,n=_i.toJSON(e);for(const i of n){for(const[e,t]of Object.entries(i.entryTags))i.entryTags[e.toLowerCase()]=h(t);i.entryTags.type=i.entryType,t.set(i.citationKey,i.entryTags)}return t}function m(e){return`@article{${e.slug}, - author = {${e.bibtexAuthors}}, - title = {${e.title}}, - 
journal = {${e.journal.title}}, - year = {${e.publishedYear}}, - note = {${e.url}}, - doi = {${e.doi}} -}`}function y(e){return` - -`}function x(e,t,n=document){if(0 - - d-toc { - contain: layout style; - display: block; - } - - d-toc ul { - padding-left: 0; - } - - d-toc ul > ul { - padding-left: 24px; - } - - d-toc a { - border-bottom: none; - text-decoration: none; - } - - - -

Table of contents

-
    `;for(const i of t){const e='D-TITLE'==i.parentElement.tagName,t=i.getAttribute('no-toc');if(e||t)continue;const a=i.textContent,d='#'+i.getAttribute('id');let r='
  • '+a+'
  • ';'H3'==i.tagName?r='
      '+r+'
    ':r+='
    ',n+=r}n+='
',e.innerHTML=n}function v(e){return function(t,n){return Xi(e(t),n)}}function w(e,t,n){var i=(t-e)/Rn(0,n),a=Fn(jn(i)/Nn),d=i/In(10,a);return 0<=a?(d>=Gi?10:d>=ea?5:d>=ta?2:1)*In(10,a):-In(10,-a)/(d>=Gi?10:d>=ea?5:d>=ta?2:1)}function S(e,t,n){var i=Un(t-e)/Rn(0,n),a=In(10,Fn(jn(i)/Nn)),d=i/a;return d>=Gi?a*=10:d>=ea?a*=5:d>=ta&&(a*=2),t>8|240&t>>4,15&t>>4|240&t,(15&t)<<4|15&t,1)):(t=ca.exec(e))?O(parseInt(t[1],16)):(t=ua.exec(e))?new j(t[1],t[2],t[3],1):(t=pa.exec(e))?new j(255*t[1]/100,255*t[2]/100,255*t[3]/100,1):(t=ga.exec(e))?U(t[1],t[2],t[3],t[4]):(t=fa.exec(e))?U(255*t[1]/100,255*t[2]/100,255*t[3]/100,t[4]):(t=ha.exec(e))?R(t[1],t[2]/100,t[3]/100,1):(t=ba.exec(e))?R(t[1],t[2]/100,t[3]/100,t[4]):ma.hasOwnProperty(e)?O(ma[e]):'transparent'===e?new j(NaN,NaN,NaN,0):null}function O(e){return new j(255&e>>16,255&e>>8,255&e,1)}function U(e,t,n,i){return 0>=i&&(e=t=n=NaN),new j(e,t,n,i)}function I(e){return(e instanceof L||(e=M(e)),!e)?new j:(e=e.rgb(),new j(e.r,e.g,e.b,e.opacity))}function N(e,t,n,i){return 1===arguments.length?I(e):new j(e,t,n,null==i?1:i)}function j(e,t,n,i){this.r=+e,this.g=+t,this.b=+n,this.opacity=+i}function R(e,t,n,i){return 0>=i?e=t=n=NaN:0>=n||1<=n?e=t=NaN:0>=t&&(e=NaN),new F(e,t,n,i)}function q(e){if(e instanceof F)return new F(e.h,e.s,e.l,e.opacity);if(e instanceof L||(e=M(e)),!e)return new F;if(e instanceof F)return e;e=e.rgb();var t=e.r/255,n=e.g/255,i=e.b/255,a=Hn(t,n,i),d=Rn(t,n,i),r=NaN,c=d-a,s=(d+a)/2;return c?(r=t===d?(n-i)/c+6*(ns?d+a:2-d-a,r*=60):c=0s?0:r,new F(r,c,s,e.opacity)}function F(e,t,n,i){this.h=+e,this.s=+t,this.l=+n,this.opacity=+i}function P(e,t,n){return 255*(60>e?t+(n-t)*e/60:180>e?n:240>e?t+(n-t)*(240-e)/60:t)}function H(e){if(e instanceof Y)return new Y(e.l,e.a,e.b,e.opacity);if(e instanceof X){var t=e.h*ya;return new Y(e.l,Mn(t)*e.c,Dn(t)*e.c,e.opacity)}e instanceof j||(e=I(e));var 
n=$(e.r),i=$(e.g),a=$(e.b),d=W((0.4124564*n+0.3575761*i+0.1804375*a)/Kn),r=W((0.2126729*n+0.7151522*i+0.072175*a)/Xn),o=W((0.0193339*n+0.119192*i+0.9503041*a)/Yn);return new Y(116*r-16,500*(d-r),200*(r-o),e.opacity)}function Y(e,t,n,i){this.l=+e,this.a=+t,this.b=+n,this.opacity=+i}function W(e){return e>Sa?In(e,1/3):e/wa+Zn}function V(e){return e>va?e*e*e:wa*(e-Zn)}function K(e){return 255*(0.0031308>=e?12.92*e:1.055*In(e,1/2.4)-0.055)}function $(e){return 0.04045>=(e/=255)?e/12.92:In((e+0.055)/1.055,2.4)}function z(e){if(e instanceof X)return new X(e.h,e.c,e.l,e.opacity);e instanceof Y||(e=H(e));var t=En(e.b,e.a)*xa;return new X(0>t?t+360:t,An(e.a*e.a+e.b*e.b),e.l,e.opacity)}function X(e,t,n,i){this.h=+e,this.c=+t,this.l=+n,this.opacity=+i}function J(e){if(e instanceof Z)return new Z(e.h,e.s,e.l,e.opacity);e instanceof j||(e=I(e));var t=e.r/255,n=e.g/255,i=e.b/255,a=(_a*i+E*t-Ta*n)/(_a+E-Ta),d=i-a,r=(D*(n-a)-B*d)/C,o=An(r*r+d*d)/(D*a*(1-a)),l=o?En(r,d)*xa-120:NaN;return new Z(0>l?l+360:l,o,a,e.opacity)}function Q(e,t,n,i){return 1===arguments.length?J(e):new Z(e,t,n,null==i?1:i)}function Z(e,t,n,i){this.h=+e,this.s=+t,this.l=+n,this.opacity=+i}function G(e,n){return function(i){return e+i*n}}function ee(e,n,i){return e=In(e,i),n=In(n,i)-e,i=1/i,function(a){return In(e+a*n,i)}}function te(e){return 1==(e=+e)?ne:function(t,n){return n-t?ee(t,n,e):La(isNaN(t)?n:t)}}function ne(e,t){var n=t-e;return n?G(e,n):La(isNaN(e)?t:e)}function ie(e){return function(){return e}}function ae(e){return function(n){return e(n)+''}}function de(e){return function t(n){function i(i,t){var a=e((i=Q(i)).h,(t=Q(t)).h),d=ne(i.s,t.s),r=ne(i.l,t.l),o=ne(i.opacity,t.opacity);return function(e){return i.h=a(e),i.s=d(e),i.l=r(In(e,n)),i.opacity=o(e),i+''}}return n=+n,i.gamma=t,i}(1)}function oe(e,t){return(t-=e=+e)?function(n){return(n-e)/t}:Pa(t)}function le(e){return function(t,n){var i=e(t=+t,n=+n);return function(e){return e<=t?0:e>=n?1:i(e)}}}function se(e){return function(n,i){var 
d=e(n=+n,i=+i);return function(e){return 0>=e?n:1<=e?i:d(e)}}}function ce(e,t,n,i){var a=e[0],d=e[1],r=t[0],o=t[1];return d',a=t[3]||'-',d=t[4]||'',r=!!t[5],o=t[6]&&+t[6],l=!!t[7],s=t[8]&&+t[8].slice(1),c=t[9]||'';'n'===c?(l=!0,c='g'):!$a[c]&&(c=''),(r||'0'===n&&'='===i)&&(r=!0,n='0',i='='),this.fill=n,this.align=i,this.sign=a,this.symbol=d,this.zero=r,this.width=o,this.comma=l,this.precision=s,this.type=c}function be(e){var t=e.domain;return e.ticks=function(e){var n=t();return na(n[0],n[n.length-1],null==e?10:e)},e.tickFormat=function(e,n){return ad(t(),e,n)},e.nice=function(n){null==n&&(n=10);var i,a=t(),d=0,r=a.length-1,o=a[d],l=a[r];return li&&(o=qn(o*i)/i,l=Fn(l*i)/i,i=w(o,l,n)),0i&&(a[d]=qn(o*i)/i,a[r]=Fn(l*i)/i,t(a)),e},e}function me(){var e=ge(oe,Ma);return e.copy=function(){return pe(e,me())},be(e)}function ye(e,t,n,i){function a(t){return e(t=new Date(+t)),t}return a.floor=a,a.ceil=function(n){return e(n=new Date(n-1)),t(n,1),e(n),n},a.round=function(e){var t=a(e),n=a.ceil(e);return e-t=t)for(;e(t),!n(t);)t.setTime(t-1)},function(e,i){if(e>=e)if(0>i)for(;0>=++i;)for(;t(e,-1),!n(e););else for(;0<=--i;)for(;t(e,1),!n(e););})},n&&(a.count=function(t,i){return dd.setTime(+t),rd.setTime(+i),e(dd),e(rd),Fn(n(dd,rd))},a.every=function(e){return e=Fn(e),isFinite(e)&&0e.y){var t=new Date(-1,e.m,e.d,e.H,e.M,e.S,e.L);return t.setFullYear(e.y),t}return new Date(e.y,e.m,e.d,e.H,e.M,e.S,e.L)}function we(e){if(0<=e.y&&100>e.y){var t=new Date(Date.UTC(-1,e.m,e.d,e.H,e.M,e.S,e.L));return t.setUTCFullYear(e.y),t}return new Date(Date.UTC(e.y,e.m,e.d,e.H,e.M,e.S,e.L))}function Se(e){return{y:e,m:0,d:1,H:0,M:0,S:0,L:0}}function Ce(e){function t(e,t){return function(a){var d,r,o,l=[],s=-1,i=0,c=e.length;for(a instanceof Date||(a=new Date(+a));++s=n)return-1;if(r=t.charCodeAt(l++),37===r){if(r=t.charAt(l++),o=C[r in Hd?t.charAt(l++):r],!o||0>(d=o(e,a,d)))return-1;}else if(r!=a.charCodeAt(d++))return-1}return d}var 
r=e.dateTime,o=e.date,l=e.time,i=e.periods,s=e.days,c=e.shortDays,u=e.months,p=e.shortMonths,g=Le(i),f=Ae(i),h=Le(s),b=Ae(s),m=Le(c),y=Ae(c),x=Le(u),k=Ae(u),v=Le(p),w=Ae(p),d={a:function(e){return c[e.getDay()]},A:function(e){return s[e.getDay()]},b:function(e){return p[e.getMonth()]},B:function(e){return u[e.getMonth()]},c:null,d:Ye,e:Ye,H:Be,I:We,j:Ve,L:Ke,m:$e,M:Xe,p:function(e){return i[+(12<=e.getHours())]},S:Je,U:Qe,w:Ze,W:Ge,x:null,X:null,y:et,Y:tt,Z:nt,"%":mt},S={a:function(e){return c[e.getUTCDay()]},A:function(e){return s[e.getUTCDay()]},b:function(e){return p[e.getUTCMonth()]},B:function(e){return u[e.getUTCMonth()]},c:null,d:it,e:it,H:at,I:dt,j:rt,L:ot,m:lt,M:st,p:function(e){return i[+(12<=e.getUTCHours())]},S:ct,U:ut,w:pt,W:gt,x:null,X:null,y:ft,Y:ht,Z:bt,"%":mt},C={a:function(e,t,a){var i=m.exec(t.slice(a));return i?(e.w=y[i[0].toLowerCase()],a+i[0].length):-1},A:function(e,t,a){var i=h.exec(t.slice(a));return i?(e.w=b[i[0].toLowerCase()],a+i[0].length):-1},b:function(e,t,a){var i=v.exec(t.slice(a));return i?(e.m=w[i[0].toLowerCase()],a+i[0].length):-1},B:function(e,t,a){var i=x.exec(t.slice(a));return i?(e.m=k[i[0].toLowerCase()],a+i[0].length):-1},c:function(e,t,n){return a(e,r,t,n)},d:je,e:je,H:qe,I:qe,j:Re,L:He,m:Ne,M:Fe,p:function(e,t,a){var i=g.exec(t.slice(a));return i?(e.p=f[i[0].toLowerCase()],a+i[0].length):-1},S:Pe,U:De,w:Ee,W:Me,x:function(e,t,n){return a(e,o,t,n)},X:function(e,t,n){return a(e,l,t,n)},y:Ue,Y:Oe,Z:Ie,"%":ze};return d.x=t(o,d),d.X=t(l,d),d.c=t(r,d),S.x=t(o,S),S.X=t(l,S),S.c=t(r,S),{format:function(e){var n=t(e+='',d);return n.toString=function(){return e},n},parse:function(e){var t=n(e+='',ve);return t.toString=function(){return e},t},utcFormat:function(e){var n=t(e+='',S);return n.toString=function(){return e},n},utcParse:function(e){var t=n(e,we);return t.toString=function(){return e},t}}}function Te(e,t,n){var i=0>e?'-':'',a=(i?-e:e)+'',d=a.length;return i+(dt?1:e>=t?0:NaN}function qt(e){return 
function(){this.removeAttribute(e)}}function Ft(e){return function(){this.removeAttributeNS(e.space,e.local)}}function Pt(e,t){return function(){this.setAttribute(e,t)}}function Ht(e,t){return function(){this.setAttributeNS(e.space,e.local,t)}}function zt(e,t){return function(){var n=t.apply(this,arguments);null==n?this.removeAttribute(e):this.setAttribute(e,n)}}function Yt(e,t){return function(){var n=t.apply(this,arguments);null==n?this.removeAttributeNS(e.space,e.local):this.setAttributeNS(e.space,e.local,n)}}function Bt(e){return function(){this.style.removeProperty(e)}}function Wt(e,t,n){return function(){this.style.setProperty(e,t,n)}}function Vt(e,t,n){return function(){var i=t.apply(this,arguments);null==i?this.style.removeProperty(e):this.style.setProperty(e,i,n)}}function Kt(e,t){return e.style.getPropertyValue(t)||vr(e).getComputedStyle(e,null).getPropertyValue(t)}function $t(e){return function(){delete this[e]}}function Xt(e,t){return function(){this[e]=t}}function Jt(e,t){return function(){var n=t.apply(this,arguments);null==n?delete this[e]:this[e]=n}}function Qt(e){return e.trim().split(/^|\s+/)}function Zt(e){return e.classList||new Gt(e)}function Gt(e){this._node=e,this._names=Qt(e.getAttribute('class')||'')}function en(e,t){for(var a=Zt(e),d=-1,i=t.length;++dUpdates and Corrections

-

`,e.githubCompareUpdatesUrl&&(t+=`View all changes to this article since it was first published.`),t+=` - If you see mistakes or want to suggest changes, please create an issue on GitHub.

- `);const n=e.journal;return'undefined'!=typeof n&&'Distill'===n.title&&(t+=` -

Reuse

-

Diagrams and text are licensed under Creative Commons Attribution CC-BY 4.0 with the source available on GitHub, unless noted otherwise. The figures that have been reused from other sources don’t fall under this license and can be recognized by a note in their caption: “Figure from …”.

- `),'undefined'!=typeof e.publishedDate&&(t+=` -

Citation

-

For attribution in academic contexts, please cite this work as

-
${e.concatenatedAuthors}, "${e.title}", Distill, ${e.publishedYear}.
-

BibTeX citation

-
${m(e)}
- `),t}var An=Math.sqrt,En=Math.atan2,Dn=Math.sin,Mn=Math.cos,On=Math.PI,Un=Math.abs,In=Math.pow,Nn=Math.LN10,jn=Math.log,Rn=Math.max,qn=Math.ceil,Fn=Math.floor,Pn=Math.round,Hn=Math.min;const zn=['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday'],Bn=['Jan.','Feb.','March','April','May','June','July','Aug.','Sept.','Oct.','Nov.','Dec.'],Wn=(e)=>10>e?'0'+e:e,Vn=function(e){const t=zn[e.getDay()].substring(0,3),n=Wn(e.getDate()),i=Bn[e.getMonth()].substring(0,3),a=e.getFullYear().toString(),d=e.getUTCHours().toString(),r=e.getUTCMinutes().toString(),o=e.getUTCSeconds().toString();return`${t}, ${n} ${i} ${a} ${d}:${r}:${o} Z`},$n=function(e){const t=Array.from(e).reduce((e,[t,n])=>Object.assign(e,{[t]:n}),{});return t},Jn=function(e){const t=new Map;for(var n in e)e.hasOwnProperty(n)&&t.set(n,e[n]);return t};class Qn{constructor(e){this.name=e.author,this.personalURL=e.authorURL,this.affiliation=e.affiliation,this.affiliationURL=e.affiliationURL,this.affiliations=e.affiliations||[]}get firstName(){const e=this.name.split(' ');return e.slice(0,e.length-1).join(' ')}get lastName(){const e=this.name.split(' ');return e[e.length-1]}}class Gn{constructor(){this.title='unnamed article',this.description='',this.authors=[],this.bibliography=new Map,this.bibliographyParsed=!1,this.citations=[],this.citationsCollected=!1,this.journal={},this.katex={},this.publishedDate=void 0}set url(e){this._url=e}get url(){if(this._url)return this._url;return this.distillPath&&this.journal.url?this.journal.url+'/'+this.distillPath:this.journal.url?this.journal.url:void 0}get githubUrl(){return this.githubPath?'https://github.com/'+this.githubPath:void 0}set previewURL(e){this._previewURL=e}get previewURL(){return this._previewURL?this._previewURL:this.url+'/thumbnail.jpg'}get publishedDateRFC(){return Vn(this.publishedDate)}get updatedDateRFC(){return Vn(this.updatedDate)}get publishedYear(){return this.publishedDate.getFullYear()}get publishedMonth(){return 
Bn[this.publishedDate.getMonth()]}get publishedDay(){return this.publishedDate.getDate()}get publishedMonthPadded(){return Wn(this.publishedDate.getMonth()+1)}get publishedDayPadded(){return Wn(this.publishedDate.getDate())}get publishedISODateOnly(){return this.publishedDate.toISOString().split('T')[0]}get volume(){const e=this.publishedYear-2015;if(1>e)throw new Error('Invalid publish date detected during computing volume');return e}get issue(){return this.publishedDate.getMonth()+1}get concatenatedAuthors(){if(2{return e.lastName+', '+e.firstName}).join(' and ')}get slug(){let e='';return this.authors.length&&(e+=this.authors[0].lastName.toLowerCase(),e+=this.publishedYear,e+=this.title.split(' ')[0].toLowerCase()),e||'Untitled'}get bibliographyEntries(){return new Map(this.citations.map((e)=>{const t=this.bibliography.get(e);return[e,t]}))}set bibliography(e){e instanceof Map?this._bibliography=e:'object'==typeof e&&(this._bibliography=Jn(e))}get bibliography(){return this._bibliography}static fromObject(e){const t=new Gn;return Object.assign(t,e),t}assignToObject(e){Object.assign(e,this),e.bibliography=$n(this.bibliographyEntries),e.url=this.url,e.githubUrl=this.githubUrl,e.previewURL=this.previewURL,this.publishedDate&&(e.volume=this.volume,e.issue=this.issue,e.publishedDateRFC=this.publishedDateRFC,e.publishedYear=this.publishedYear,e.publishedMonth=this.publishedMonth,e.publishedDay=this.publishedDay,e.publishedMonthPadded=this.publishedMonthPadded,e.publishedDayPadded=this.publishedDayPadded),this.updatedDate&&(e.updatedDateRFC=this.updatedDateRFC),e.concatenatedAuthors=this.concatenatedAuthors,e.bibtexAuthors=this.bibtexAuthors,e.slug=this.slug}}const ei=(e)=>{return class extends e{constructor(){super();const e={childList:!0,characterData:!0,subtree:!0},t=new 
MutationObserver(()=>{t.disconnect(),this.renderIfPossible(),t.observe(this,e)});t.observe(this,e)}connectedCallback(){super.connectedCallback(),this.renderIfPossible()}renderIfPossible(){this.textContent&&this.root&&this.renderContent()}renderContent(){console.error(`Your class ${this.constructor.name} must provide a custom renderContent() method!`)}}},ti=(e,t,n=!0)=>{return(i)=>{const a=document.createElement('template');return a.innerHTML=t,n&&'ShadyCSS'in window&&ShadyCSS.prepareTemplate(a,e),class extends i{static get is(){return e}constructor(){super(),this.clone=document.importNode(a.content,!0),n&&(this.attachShadow({mode:'open'}),this.shadowRoot.appendChild(this.clone))}connectedCallback(){n?'ShadyCSS'in window&&ShadyCSS.styleElement(this):this.insertBefore(this.clone,this.firstChild)}get root(){return n?this.shadowRoot:this}$(e){return this.root.querySelector(e)}$$(e){return this.root.querySelectorAll(e)}}}};var ni='/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nspan.katex-display {\n text-align: left;\n padding: 8px 0 8px 0;\n margin: 0.5em 0 0.5em 1em;\n}\n\nspan.katex {\n -webkit-font-smoothing: antialiased;\n color: rgba(0, 0, 0, 0.8);\n font-size: 1.18em;\n}\n';const ii=function(e,t,n){let i=n,a=0;for(const d=e.length;i=a&&t.slice(i,i+d)===e)return i;'\\'===n?i++:'{'===n?a++:'}'===n&&a--;i++}return-1},ai=function(e,t,n,i){const a=[];for(let d=0;d',ui=ti('d-math',` -${ci} - - 
-`);class T extends ei(ui(HTMLElement)){static set katexOptions(e){T._katexOptions=e,T.katexOptions.delimiters&&(T.katexAdded?T.katexLoadedCallback():T.addKatex())}static get katexOptions(){return T._katexOptions||(T._katexOptions={delimiters:[{left:'$$',right:'$$',display:!1}]}),T._katexOptions}static katexLoadedCallback(){const e=document.querySelectorAll('d-math');for(const t of e)t.renderContent();if(T.katexOptions.delimiters){const e=document.querySelector('d-article');si(e,T.katexOptions)}}static addKatex(){document.head.insertAdjacentHTML('beforeend',ci);const e=document.createElement('script');e.src='https://distill.pub/third-party/katex/katex.min.js',e.async=!0,e.onload=T.katexLoadedCallback,e.crossorigin='anonymous',document.head.appendChild(e),T.katexAdded=!0}get options(){const e={displayMode:this.hasAttribute('block')};return Object.assign(e,T.katexOptions)}connectedCallback(){super.connectedCallback(),T.katexAdded||T.addKatex()}renderContent(){if('undefined'!=typeof katex){const e=this.root.querySelector('#katex-container');katex.render(this.textContent,e,this.options)}}}T.katexAdded=!1,T.inlineMathRendered=!1,window.DMath=T;class pi extends HTMLElement{static get is(){return'd-front-matter'}constructor(){super();const e=new MutationObserver((e)=>{for(const t of e)if('SCRIPT'===t.target.nodeName||'characterData'===t.type){const e=c(this);this.notify(e)}});e.observe(this,{childList:!0,characterData:!0,subtree:!0})}notify(e){const t=new CustomEvent('onFrontMatterChanged',{detail:e,bubbles:!0});document.dispatchEvent(t)}}var gi=function(e,t){const n=e.body,i=n.querySelector('d-article');if(!i)return void console.warn('No d-article tag found; skipping adding optional components!');let a=e.querySelector('d-byline');a||(t.authors?(a=e.createElement('d-byline'),n.insertBefore(a,i)):console.warn('No authors found in front matter; please add them before submission!'));let d=e.querySelector('d-title');d||(d=e.createElement('d-title'),n.insertBefore(d,a));let 
r=d.querySelector('h1');r||(r=e.createElement('h1'),r.textContent=t.title,d.insertBefore(r,d.firstChild));const o='undefined'!=typeof t.password;let l=n.querySelector('d-interstitial');if(o&&!l){const i='undefined'!=typeof window,a=i&&window.location.hostname.includes('localhost');i&&a||(l=e.createElement('d-interstitial'),l.password=t.password,n.insertBefore(l,n.firstChild))}else!o&&l&&l.parentElement.removeChild(this);let s=e.querySelector('d-appendix');s||(s=e.createElement('d-appendix'),e.body.appendChild(s));let c=e.querySelector('d-footnote-list');c||(c=e.createElement('d-footnote-list'),s.appendChild(c));let u=e.querySelector('d-citation-list');u||(u=e.createElement('d-citation-list'),s.appendChild(u))};const fi=new Gn,hi={frontMatter:fi,waitingOn:{bibliography:[],citations:[]},listeners:{onCiteKeyCreated(e){const[t,n]=e.detail;if(!fi.citationsCollected)return void hi.waitingOn.citations.push(()=>hi.listeners.onCiteKeyCreated(e));if(!fi.bibliographyParsed)return void hi.waitingOn.bibliography.push(()=>hi.listeners.onCiteKeyCreated(e));const i=n.map((e)=>fi.citations.indexOf(e));t.numbers=i;const a=n.map((e)=>fi.bibliography.get(e));t.entries=a},onCiteKeyChanged(){fi.citations=t(),fi.citationsCollected=!0;for(const e of hi.waitingOn.citations.slice())e();const e=document.querySelector('d-citation-list'),n=new Map(fi.citations.map((e)=>{return[e,fi.bibliography.get(e)]}));e.citations=n;const i=document.querySelectorAll('d-cite');for(const e of i){const t=e.keys,n=t.map((e)=>fi.citations.indexOf(e));e.numbers=n;const i=t.map((e)=>fi.bibliography.get(e));e.entries=i}},onCiteKeyRemoved(e){hi.listeners.onCiteKeyChanged(e)},onBibliographyChanged(e){const t=document.querySelector('d-citation-list'),n=e.detail;fi.bibliography=n,fi.bibliographyParsed=!0;for(const t of hi.waitingOn.bibliography.slice())t();if(!fi.citationsCollected)return void 
hi.waitingOn.citations.push(function(){hi.listeners.onBibliographyChanged({target:e.target,detail:e.detail})});if(t.hasAttribute('distill-prerendered'))console.info('Citation list was prerendered; not updating it.');else{const e=new Map(fi.citations.map((e)=>{return[e,fi.bibliography.get(e)]}));t.citations=e}},onFootnoteChanged(){const e=document.querySelector('d-footnote-list');if(e){const t=document.querySelectorAll('d-footnote');e.footnotes=t}},onFrontMatterChanged(t){const n=t.detail;e(fi,n);const i=document.querySelector('d-interstitial');i&&('undefined'==typeof fi.password?i.parentElement.removeChild(i):i.password=fi.password);const a=document.body.hasAttribute('distill-prerendered');if(!a&&u()){gi(document,fi);const e=document.querySelector('distill-appendix');e&&(e.frontMatter=fi);const t=document.querySelector('d-byline');t&&(t.frontMatter=fi),n.katex&&(T.katexOptions=n.katex)}},DOMContentLoaded(){if(hi.loaded)return void console.warn('Controller received DOMContentLoaded but was already loaded!');if(!u())return void console.warn('Controller received DOMContentLoaded before appropriate document.readyState!');hi.loaded=!0,console.log('Runlevel 4: Controller running DOMContentLoaded');const e=document.querySelector('d-front-matter'),n=c(e);hi.listeners.onFrontMatterChanged({detail:n}),fi.citations=t(),fi.citationsCollected=!0;for(const e of hi.waitingOn.citations.slice())e();if(fi.bibliographyParsed)for(const e of hi.waitingOn.bibliography.slice())e();const i=document.querySelector('d-footnote-list');if(i){const e=document.querySelectorAll('d-footnote');i.footnotes=e}}}};const bi='/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under 
the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nhtml {\n font-size: 14px;\n\tline-height: 1.6em;\n /* font-family: "Libre Franklin", "Helvetica Neue", sans-serif; */\n font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Fira Sans", "Droid Sans", "Helvetica Neue", Arial, sans-serif;\n /*, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";*/\n text-size-adjust: 100%;\n -ms-text-size-adjust: 100%;\n -webkit-text-size-adjust: 100%;\n}\n\n@media(min-width: 768px) {\n html {\n font-size: 16px;\n }\n}\n\nbody {\n margin: 0;\n}\n\na {\n color: #004276;\n}\n\nfigure {\n margin: 0;\n}\n\ntable {\n\tborder-collapse: collapse;\n\tborder-spacing: 0;\n}\n\ntable th {\n\ttext-align: left;\n}\n\ntable thead {\n border-bottom: 1px solid rgba(0, 0, 0, 0.05);\n}\n\ntable thead th {\n padding-bottom: 0.5em;\n}\n\ntable tbody :first-child td {\n padding-top: 0.5em;\n}\n\npre {\n overflow: auto;\n max-width: 100%;\n}\n\np {\n margin-top: 0;\n margin-bottom: 1em;\n}\n\nsup, sub {\n vertical-align: baseline;\n position: relative;\n top: -0.4em;\n line-height: 1em;\n}\n\nsub {\n top: 0.4em;\n}\n\n.kicker,\n.marker {\n font-size: 15px;\n font-weight: 600;\n color: rgba(0, 0, 0, 0.5);\n}\n\n\n/* Headline */\n\n@media(min-width: 1024px) {\n d-title h1 span {\n display: block;\n }\n}\n\n/* Figure */\n\nfigure {\n position: relative;\n margin-bottom: 2.5em;\n margin-top: 1.5em;\n}\n\nfigcaption+figure {\n\n}\n\nfigure img {\n width: 100%;\n}\n\nfigure svg text,\nfigure svg tspan {\n}\n\nfigcaption,\n.figcaption {\n color: rgba(0, 0, 0, 0.6);\n font-size: 12px;\n line-height: 1.5em;\n}\n\n@media(min-width: 1024px) {\nfigcaption,\n.figcaption {\n font-size: 13px;\n }\n}\n\nfigure.external img {\n background: white;\n border: 1px solid rgba(0, 0, 0, 0.1);\n 
box-shadow: 0 1px 8px rgba(0, 0, 0, 0.1);\n padding: 18px;\n box-sizing: border-box;\n}\n\nfigcaption a {\n color: rgba(0, 0, 0, 0.6);\n}\n\nfigcaption b,\nfigcaption strong, {\n font-weight: 600;\n color: rgba(0, 0, 0, 1.0);\n}\n'+'/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n@supports not (display: grid) {\n .base-grid,\n distill-header,\n d-title,\n d-abstract,\n d-article,\n d-appendix,\n distill-appendix,\n d-byline,\n d-footnote-list,\n d-citation-list,\n distill-footer {\n display: block;\n padding: 8px;\n }\n}\n\n.base-grid,\ndistill-header,\nd-title,\nd-abstract,\nd-article,\nd-appendix,\ndistill-appendix,\nd-byline,\nd-footnote-list,\nd-citation-list,\ndistill-footer {\n display: grid;\n justify-items: stretch;\n grid-template-columns: [screen-start] 8px [page-start kicker-start text-start gutter-start middle-start] 1fr 1fr 1fr 1fr 1fr 1fr 1fr 1fr [text-end page-end gutter-end kicker-end middle-end] 8px [screen-end];\n grid-column-gap: 8px;\n}\n\n.grid {\n display: grid;\n grid-column-gap: 8px;\n}\n\n@media(min-width: 768px) {\n .base-grid,\n distill-header,\n d-title,\n d-abstract,\n d-article,\n d-appendix,\n distill-appendix,\n d-byline,\n d-footnote-list,\n d-citation-list,\n distill-footer {\n grid-template-columns: [screen-start] 1fr [page-start kicker-start middle-start text-start] 45px 45px 45px 45px 45px 45px 45px 45px [ kicker-end text-end gutter-start] 45px [middle-end] 45px 
[page-end gutter-end] 1fr [screen-end];\n grid-column-gap: 16px;\n }\n\n .grid {\n grid-column-gap: 16px;\n }\n}\n\n@media(min-width: 1000px) {\n .base-grid,\n distill-header,\n d-title,\n d-abstract,\n d-article,\n d-appendix,\n distill-appendix,\n d-byline,\n d-footnote-list,\n d-citation-list,\n distill-footer {\n grid-template-columns: [screen-start] 1fr [page-start kicker-start] 50px [middle-start] 50px [text-start kicker-end] 50px 50px 50px 50px 50px 50px 50px 50px [text-end gutter-start] 50px [middle-end] 50px [page-end gutter-end] 1fr [screen-end];\n grid-column-gap: 16px;\n }\n\n .grid {\n grid-column-gap: 16px;\n }\n}\n\n@media(min-width: 1180px) {\n .base-grid,\n distill-header,\n d-title,\n d-abstract,\n d-article,\n d-appendix,\n distill-appendix,\n d-byline,\n d-footnote-list,\n d-citation-list,\n distill-footer {\n grid-template-columns: [screen-start] 1fr [page-start kicker-start] 60px [middle-start] 60px [text-start kicker-end] 60px 60px 60px 60px 60px 60px 60px 60px [text-end gutter-start] 60px [middle-end] 60px [page-end gutter-end] 1fr [screen-end];\n grid-column-gap: 32px;\n }\n\n .grid {\n grid-column-gap: 32px;\n }\n}\n\n\n\n\n.base-grid {\n grid-column: screen;\n}\n\n/* .l-body,\nd-article > * {\n grid-column: text;\n}\n\n.l-page,\nd-title > *,\nd-figure {\n grid-column: page;\n} */\n\n.l-gutter {\n grid-column: gutter;\n}\n\n.l-text,\n.l-body {\n grid-column: text;\n}\n\n.l-page {\n grid-column: page;\n}\n\n.l-body-outset {\n grid-column: middle;\n}\n\n.l-page-outset {\n grid-column: page;\n}\n\n.l-screen {\n grid-column: screen;\n}\n\n.l-screen-inset {\n grid-column: screen;\n padding-left: 16px;\n padding-left: 16px;\n}\n\n\n/* Aside */\n\nd-article aside {\n grid-column: gutter;\n font-size: 12px;\n line-height: 1.6em;\n color: rgba(0, 0, 0, 0.6)\n}\n\n@media(min-width: 768px) {\n aside {\n grid-column: gutter;\n }\n\n .side {\n grid-column: gutter;\n }\n}\n'+'/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the 
Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nd-title {\n padding: 2rem 0 1.5rem;\n contain: layout style;\n overflow-x: hidden;\n}\n\n@media(min-width: 768px) {\n d-title {\n padding: 4rem 0 1.5rem;\n }\n}\n\nd-title h1 {\n grid-column: text;\n font-size: 40px;\n font-weight: 700;\n line-height: 1.1em;\n margin: 0 0 0.5rem;\n}\n\n@media(min-width: 768px) {\n d-title h1 {\n font-size: 50px;\n }\n}\n\nd-title p {\n font-weight: 300;\n font-size: 1.2rem;\n line-height: 1.55em;\n grid-column: text;\n}\n\nd-title .status {\n margin-top: 0px;\n font-size: 12px;\n color: #009688;\n opacity: 0.8;\n grid-column: kicker;\n}\n\nd-title .status span {\n line-height: 1;\n display: inline-block;\n padding: 6px 0;\n border-bottom: 1px solid #80cbc4;\n font-size: 11px;\n text-transform: uppercase;\n}\n'+'/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nd-byline {\n contain: content;\n overflow: hidden;\n border-top: 1px 
solid rgba(0, 0, 0, 0.1);\n font-size: 0.8rem;\n line-height: 1.8em;\n padding: 1.5rem 0;\n min-height: 1.8em;\n}\n\n\nd-byline .byline {\n grid-template-columns: 1fr 1fr;\n grid-column: text;\n}\n\n@media(min-width: 768px) {\n d-byline .byline {\n grid-template-columns: 1fr 1fr 1fr 1fr;\n }\n}\n\nd-byline .authors-affiliations {\n grid-column-end: span 2;\n grid-template-columns: 1fr 1fr;\n margin-bottom: 1em;\n}\n\n@media(min-width: 768px) {\n d-byline .authors-affiliations {\n margin-bottom: 0;\n }\n}\n\nd-byline h3 {\n font-size: 0.6rem;\n font-weight: 400;\n color: rgba(0, 0, 0, 0.5);\n margin: 0;\n text-transform: uppercase;\n}\n\nd-byline p {\n margin: 0;\n}\n\nd-byline a,\nd-article d-byline a {\n color: rgba(0, 0, 0, 0.8);\n text-decoration: none;\n border-bottom: none;\n}\n\nd-article d-byline a:hover {\n text-decoration: underline;\n border-bottom: none;\n}\n\nd-byline p.author {\n font-weight: 500;\n}\n\nd-byline .affiliations {\n\n}\n'+'/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nd-article {\n contain: layout style;\n overflow-x: hidden;\n border-top: 1px solid rgba(0, 0, 0, 0.1);\n padding-top: 2rem;\n color: rgba(0, 0, 0, 0.8);\n}\n\nd-article > * {\n grid-column: text;\n}\n\n@media(min-width: 768px) {\n d-article {\n font-size: 16px;\n }\n}\n\n@media(min-width: 1024px) {\n d-article {\n font-size: 1.06rem;\n line-height: 1.7em;\n }\n}\n\n\n/* H2 */\n\n\nd-article .marker {\n 
text-decoration: none;\n border: none;\n counter-reset: section;\n grid-column: kicker;\n line-height: 1.7em;\n}\n\nd-article .marker:hover {\n border: none;\n}\n\nd-article .marker span {\n padding: 0 3px 4px;\n border-bottom: 1px solid rgba(0, 0, 0, 0.2);\n position: relative;\n top: 4px;\n}\n\nd-article .marker:hover span {\n color: rgba(0, 0, 0, 0.7);\n border-bottom: 1px solid rgba(0, 0, 0, 0.7);\n}\n\nd-article h2 {\n font-weight: 600;\n font-size: 24px;\n line-height: 1.25em;\n margin: 2rem 0 1.5rem 0;\n border-bottom: 1px solid rgba(0, 0, 0, 0.1);\n padding-bottom: 1rem;\n}\n\n@media(min-width: 1024px) {\n d-article h2 {\n font-size: 36px;\n }\n}\n\n/* H3 */\n\nd-article h3 {\n font-weight: 700;\n font-size: 18px;\n line-height: 1.4em;\n margin-bottom: 1em;\n margin-top: 2em;\n}\n\n@media(min-width: 1024px) {\n d-article h3 {\n font-size: 20px;\n }\n}\n\n/* H4 */\n\nd-article h4 {\n font-weight: 600;\n text-transform: uppercase;\n font-size: 14px;\n line-height: 1.4em;\n}\n\nd-article a {\n color: inherit;\n}\n\nd-article p,\nd-article ul,\nd-article ol,\nd-article blockquote {\n margin-top: 0;\n margin-bottom: 1em;\n margin-left: 0;\n margin-right: 0;\n}\n\nd-article blockquote {\n border-left: 2px solid rgba(0, 0, 0, 0.2);\n padding-left: 2em;\n font-style: italic;\n color: rgba(0, 0, 0, 0.6);\n}\n\nd-article a {\n border-bottom: 1px solid rgba(0, 0, 0, 0.4);\n text-decoration: none;\n}\n\nd-article a:hover {\n border-bottom: 1px solid rgba(0, 0, 0, 0.8);\n}\n\nd-article .link {\n text-decoration: underline;\n cursor: pointer;\n}\n\nd-article ul,\nd-article ol {\n padding-left: 24px;\n}\n\nd-article li {\n margin-bottom: 1em;\n margin-left: 0;\n padding-left: 0;\n}\n\nd-article li:last-child {\n margin-bottom: 0;\n}\n\nd-article pre {\n font-size: 14px;\n margin-bottom: 20px;\n}\n\nd-article hr {\n grid-column: screen;\n width: 100%;\n border: none;\n border-bottom: 1px solid rgba(0, 0, 0, 0.1);\n margin-top: 60px;\n margin-bottom: 60px;\n}\n\nd-article 
section {\n margin-top: 60px;\n margin-bottom: 60px;\n}\n\nd-article span.equation-mimic {\n font-family: georgia;\n font-size: 115%;\n font-style: italic;\n}\n\nd-article > d-code,\nd-article section > d-code {\n display: block;\n}\n\nd-article > d-math[block],\nd-article section > d-math[block] {\n display: block;\n}\n\n@media (max-width: 768px) {\n d-article > d-code,\n d-article section > d-code,\n d-article > d-math[block],\n d-article section > d-math[block] {\n overflow-x: scroll;\n -ms-overflow-style: none; // IE 10+\n overflow: -moz-scrollbars-none; // Firefox\n }\n\n d-article > d-code::-webkit-scrollbar,\n d-article section > d-code::-webkit-scrollbar,\n d-article > d-math[block]::-webkit-scrollbar,\n d-article section > d-math[block]::-webkit-scrollbar {\n display: none; // Safari and Chrome\n }\n}\n\nd-article .citation {\n color: #668;\n cursor: pointer;\n}\n\nd-include {\n width: auto;\n display: block;\n}\n\nd-figure {\n contain: layout style;\n}\n\n/* KaTeX */\n\n.katex, .katex-prerendered {\n contain: style;\n display: inline-block;\n}\n\n/* Tables */\n\nd-article table {\n border-collapse: collapse;\n margin-bottom: 1.5rem;\n border-bottom: 1px solid rgba(0, 0, 0, 0.2);\n}\n\nd-article table th {\n border-bottom: 1px solid rgba(0, 0, 0, 0.2);\n}\n\nd-article table td {\n border-bottom: 1px solid rgba(0, 0, 0, 0.05);\n}\n\nd-article table tr:last-of-type td {\n border-bottom: none;\n}\n\nd-article table th,\nd-article table td {\n font-size: 15px;\n padding: 2px 8px;\n}\n\nd-article table tbody :first-child td {\n padding-top: 2px;\n}\n'+ni+'/*\n * Copyright 2018 The Distill Template Authors\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed 
on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n@media print {\n\n @page {\n size: 8in 11in;\n @bottom-right {\n content: counter(page) " of " counter(pages);\n }\n }\n\n html {\n /* no general margins -- CSS Grid takes care of those */\n }\n\n p, code {\n page-break-inside: avoid;\n }\n\n h2, h3 {\n page-break-after: avoid;\n }\n\n d-header {\n visibility: hidden;\n }\n\n d-footer {\n display: none!important;\n }\n\n}\n',mi=[{name:'WebComponents',support:function(){return'customElements'in window&&'attachShadow'in Element.prototype&&'getRootNode'in Element.prototype&&'content'in document.createElement('template')&&'Promise'in window&&'from'in Array},url:'https://distill.pub/third-party/polyfills/webcomponents-lite.js'},{name:'IntersectionObserver',support:function(){return'IntersectionObserver'in window&&'IntersectionObserverEntry'in window},url:'https://distill.pub/third-party/polyfills/intersection-observer.js'}];class yi{static browserSupportsAllFeatures(){return mi.every((e)=>e.support())}static load(e){const t=function(t){t.loaded=!0,console.info('Runlevel 0: Polyfill has finished loading: '+t.name),yi.neededPolyfills.every((e)=>e.loaded)&&(console.info('Runlevel 0: All required polyfills have finished loading.'),console.info('Runlevel 0->1.'),window.distillRunlevel=1,e())};for(const n of yi.neededPolyfills)g(n,t)}static get neededPolyfills(){return yi._neededPolyfills||(yi._neededPolyfills=mi.filter((e)=>!e.support())),yi._neededPolyfills}}const xi=ti('d-abstract',` - - - -`);class ki extends xi(HTMLElement){}const vi=ti('d-appendix',` - - -`,!1);class wi extends vi(HTMLElement){}const Si=/^\s*$/;class Ci extends HTMLElement{static get is(){return'd-article'}constructor(){super(),new MutationObserver((e)=>{for(const t of e)for(const e of t.addedNodes)switch(e.nodeName){case'#text':{const 
t=e.nodeValue;if(!Si.test(t)){console.warn('Use of unwrapped text in distill articles is discouraged as it breaks layout! Please wrap any text in a or

tag. We found the following text: '+t);const n=document.createElement('span');n.innerHTML=e.nodeValue,e.parentNode.insertBefore(n,e),e.parentNode.removeChild(e)}}}}).observe(this,{childList:!0})}}var Ti='undefined'==typeof window?'undefined'==typeof global?'undefined'==typeof self?{}:self:global:window,_i=f(function(e,t){(function(e){function t(){this.months=['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],this.notKey=[',','{','}',' ','='],this.pos=0,this.input='',this.entries=[],this.currentEntry='',this.setInput=function(e){this.input=e},this.getEntries=function(){return this.entries},this.isWhitespace=function(e){return' '==e||'\r'==e||'\t'==e||'\n'==e},this.match=function(e,t){if((void 0==t||null==t)&&(t=!0),this.skipWhitespace(t),this.input.substring(this.pos,this.pos+e.length)==e)this.pos+=e.length;else throw'Token mismatch, expected '+e+', found '+this.input.substring(this.pos);this.skipWhitespace(t)},this.tryMatch=function(e,t){return(void 0==t||null==t)&&(t=!0),this.skipWhitespace(t),this.input.substring(this.pos,this.pos+e.length)==e},this.matchAt=function(){for(;this.input.length>this.pos&&'@'!=this.input[this.pos];)this.pos++;return!('@'!=this.input[this.pos])},this.skipWhitespace=function(e){for(;this.isWhitespace(this.input[this.pos]);)this.pos++;if('%'==this.input[this.pos]&&!0==e){for(;'\n'!=this.input[this.pos];)this.pos++;this.skipWhitespace(e)}},this.value_braces=function(){var e=0;this.match('{',!1);for(var t=this.pos,n=!1;;){if(!n)if('}'==this.input[this.pos]){if(0=this.input.length-1)throw'Unterminated value';n='\\'==this.input[this.pos]&&!1==n,this.pos++}},this.value_comment=function(){for(var e='',t=0;!(this.tryMatch('}',!1)&&0==t);){if(e+=this.input[this.pos],'{'==this.input[this.pos]&&t++,'}'==this.input[this.pos]&&t--,this.pos>=this.input.length-1)throw'Unterminated value:'+this.input.substring(start);this.pos++}return e},this.value_quotes=function(){this.match('"',!1);for(var 
e=this.pos,t=!1;;){if(!t){if('"'==this.input[this.pos]){var n=this.pos;return this.match('"',!1),this.input.substring(e,n)}if(this.pos>=this.input.length-1)throw'Unterminated value:'+this.input.substring(e)}t='\\'==this.input[this.pos]&&!1==t,this.pos++}},this.single_value=function(){var e=this.pos;if(this.tryMatch('{'))return this.value_braces();if(this.tryMatch('"'))return this.value_quotes();var t=this.key();if(t.match('^[0-9]+$'))return t;if(0<=this.months.indexOf(t.toLowerCase()))return t.toLowerCase();throw'Value expected:'+this.input.substring(e)+' for key: '+t},this.value=function(){for(var e=[this.single_value()];this.tryMatch('#');)this.match('#'),e.push(this.single_value());return e.join('')},this.key=function(){for(var e=this.pos;;){if(this.pos>=this.input.length)throw'Runaway key';if(0<=this.notKey.indexOf(this.input[this.pos]))return this.input.substring(e,this.pos);this.pos++}},this.key_equals_value=function(){var e=this.key();if(this.tryMatch('=')){this.match('=');var t=this.value();return[e,t]}throw'... 
= value expected, equals sign missing:'+this.input.substring(this.pos)},this.key_value_list=function(){var e=this.key_equals_value();for(this.currentEntry.entryTags={},this.currentEntry.entryTags[e[0]]=e[1];this.tryMatch(',')&&(this.match(','),!this.tryMatch('}'));)e=this.key_equals_value(),this.currentEntry.entryTags[e[0]]=e[1]},this.entry_body=function(e){this.currentEntry={},this.currentEntry.citationKey=this.key(),this.currentEntry.entryType=e.substring(1),this.match(','),this.key_value_list(),this.entries.push(this.currentEntry)},this.directive=function(){return this.match('@'),'@'+this.key()},this.preamble=function(){this.currentEntry={},this.currentEntry.entryType='PREAMBLE',this.currentEntry.entry=this.value_comment(),this.entries.push(this.currentEntry)},this.comment=function(){this.currentEntry={},this.currentEntry.entryType='COMMENT',this.currentEntry.entry=this.value_comment(),this.entries.push(this.currentEntry)},this.entry=function(e){this.entry_body(e)},this.bibtex=function(){for(;this.matchAt();){var e=this.directive();this.match('{'),'@STRING'==e?this.string():'@PREAMBLE'==e?this.preamble():'@COMMENT'==e?this.comment():this.entry(e),this.match('}')}}}e.toJSON=function(e){var n=new t;return n.setInput(e),n.bibtex(),n.entries},e.toBibtex=function(e){var t='';for(var n in e){if(t+='@'+e[n].entryType,t+='{',e[n].citationKey&&(t+=e[n].citationKey+', '),e[n].entry&&(t+=e[n].entry),e[n].entryTags){var i='';for(var a in e[n].entryTags)0!=i.length&&(i+=', '),i+=a+'= {'+e[n].entryTags[a]+'}';t+=i}t+='}\n\n'}return t}})(t)});class Li extends HTMLElement{static get is(){return'd-bibliography'}constructor(){super();const e=new MutationObserver((e)=>{for(const t of e)('SCRIPT'===t.target.nodeName||'characterData'===t.type)&&this.parseIfPossible()});e.observe(this,{childList:!0,characterData:!0,subtree:!0})}connectedCallback(){requestAnimationFrame(()=>{this.parseIfPossible()})}parseIfPossible(){const 
e=this.querySelector('script');if(e)if('text/bibtex'==e.type){const t=e.textContent;if(this.bibtex!==t){this.bibtex=t;const e=b(this.bibtex);this.notify(e)}}else if('text/json'==e.type){const t=new Map(JSON.parse(e.textContent));this.notify(t)}else console.warn('Unsupported bibliography script tag type: '+e.type)}notify(e){const t=new CustomEvent('onBibliographyChanged',{detail:e,bubbles:!0});this.dispatchEvent(t)}static get observedAttributes(){return['src']}receivedBibtex(e){const t=b(e.target.response);this.notify(t)}attributeChangedCallback(e,t,n){var i=new XMLHttpRequest;i.onload=(t)=>this.receivedBibtex(t),i.onerror=()=>console.warn(`Could not load Bibtex! (tried ${n})`),i.responseType='text',i.open('GET',n,!0),i.send()}}class Ai extends HTMLElement{static get is(){return'd-byline'}set frontMatter(e){this.innerHTML=y(e)}}const Ei=ti('d-cite',` - - - - -

- - -
-`);class Di extends Ei(HTMLElement){connectedCallback(){this.outerSpan=this.root.querySelector('#citation-'),this.innerSpan=this.root.querySelector('.citation-number'),this.hoverBox=this.root.querySelector('d-hover-box'),window.customElements.whenDefined('d-hover-box').then(()=>{this.hoverBox.listen(this)})}static get observedAttributes(){return['key']}attributeChangedCallback(e,t,n){const i=t?'onCiteKeyChanged':'onCiteKeyCreated',a=n.split(','),d={detail:[this,a],bubbles:!0},r=new CustomEvent(i,d);document.dispatchEvent(r)}set key(e){this.setAttribute('key',e)}get key(){return this.getAttribute('key')}get keys(){return this.getAttribute('key').split(',')}set numbers(e){const t=e.map((e)=>{return-1==e?'?':e+1+''}),n='['+t.join(', ')+']';this.innerSpan&&(this.innerSpan.textContent=n)}set entries(e){this.hoverBox&&(this.hoverBox.innerHTML=`
    - ${e.map(l).map((e)=>`
  • ${e}
  • `).join('\n')} -
`)}}const Mi=` -d-citation-list { - contain: layout style; -} - -d-citation-list .references { - grid-column: text; -} - -d-citation-list .references .title { - font-weight: 500; -} -`;class Oi extends HTMLElement{static get is(){return'd-citation-list'}connectedCallback(){this.hasAttribute('distill-prerendered')||(this.style.display='none')}set citations(e){x(this,e)}}var Ui=f(function(e){var t='undefined'==typeof window?'undefined'!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?self:{}:window,n=function(){var e=/\blang(?:uage)?-(\w+)\b/i,n=0,a=t.Prism={util:{encode:function(e){return e instanceof i?new i(e.type,a.util.encode(e.content),e.alias):'Array'===a.util.type(e)?e.map(a.util.encode):e.replace(/&/g,'&').replace(/e.length)break tokenloop;if(!(y instanceof n)){c.lastIndex=0;var v=c.exec(y),w=1;if(!v&&f&&x!=d.length-1){if(c.lastIndex=i,v=c.exec(e),!v)break;for(var S=v.index+(g?v[1].length:0),C=v.index+v[0].length,T=x,k=i,p=d.length;T=k&&(++x,i=k);if(d[x]instanceof n||d[T-1].greedy)continue;w=T-x,y=e.slice(i,k),v.index-=i}if(v){g&&(h=v[1].length);var S=v.index+h,v=v[0].slice(h),C=S+v.length,_=y.slice(0,S),L=y.slice(C),A=[x,w];_&&A.push(_);var E=new n(o,u?a.tokenize(v,u):v,b,v,f);A.push(E),L&&A.push(L),Array.prototype.splice.apply(d,A)}}}}}return d},hooks:{all:{},add:function(e,t){var n=a.hooks.all;n[e]=n[e]||[],n[e].push(t)},run:function(e,t){var n=a.hooks.all[e];if(n&&n.length)for(var d,r=0;d=n[r++];)d(t)}}},i=a.Token=function(e,t,n,i,a){this.type=e,this.content=t,this.alias=n,this.length=0|(i||'').length,this.greedy=!!a};if(i.stringify=function(e,t,n){if('string'==typeof e)return e;if('Array'===a.util.type(e))return e.map(function(n){return i.stringify(n,t,e)}).join('');var d={type:e.type,content:i.stringify(e.content,t,n),tag:'span',classes:['token',e.type],attributes:{},language:t,parent:n};if('comment'==d.type&&(d.attributes.spellcheck='true'),e.alias){var 
r='Array'===a.util.type(e.alias)?e.alias:[e.alias];Array.prototype.push.apply(d.classes,r)}a.hooks.run('wrap',d);var l=Object.keys(d.attributes).map(function(e){return e+'="'+(d.attributes[e]||'').replace(/"/g,'"')+'"'}).join(' ');return'<'+d.tag+' class="'+d.classes.join(' ')+'"'+(l?' '+l:'')+'>'+d.content+''},!t.document)return t.addEventListener?(t.addEventListener('message',function(e){var n=JSON.parse(e.data),i=n.language,d=n.code,r=n.immediateClose;t.postMessage(a.highlight(d,a.languages[i],i)),r&&t.close()},!1),t.Prism):t.Prism;var d=document.currentScript||[].slice.call(document.getElementsByTagName('script')).pop();return d&&(a.filename=d.src,document.addEventListener&&!d.hasAttribute('data-manual')&&('loading'===document.readyState?document.addEventListener('DOMContentLoaded',a.highlightAll):window.requestAnimationFrame?window.requestAnimationFrame(a.highlightAll):window.setTimeout(a.highlightAll,16))),t.Prism}();e.exports&&(e.exports=n),'undefined'!=typeof Ti&&(Ti.Prism=n),n.languages.markup={comment://,prolog:/<\?[\w\W]+?\?>/,doctype://i,cdata://i,tag:{pattern:/<\/?(?!\d)[^\s>\/=$<]+(?:\s+[^\s>\/=]+(?:=(?:("|')(?:\\\1|\\?(?!\1)[\w\W])*\1|[^\s'">=]+))?)*\s*\/?>/i,inside:{tag:{pattern:/^<\/?[^\s>\/]+/i,inside:{punctuation:/^<\/?/,namespace:/^[^\s>\/:]+:/}},"attr-value":{pattern:/=(?:('|")[\w\W]*?(\1)|[^\s>]+)/i,inside:{punctuation:/[=>"']/}},punctuation:/\/?>/,"attr-name":{pattern:/[^\s>\/]+/,inside:{namespace:/^[^\s>\/:]+:/}}}},entity:/&#?[\da-z]{1,8};/i},n.hooks.add('wrap',function(e){'entity'===e.type&&(e.attributes.title=e.content.replace(/&/,'&'))}),n.languages.xml=n.languages.markup,n.languages.html=n.languages.markup,n.languages.mathml=n.languages.markup,n.languages.svg=n.languages.markup,n.languages.css={comment:/\/\*[\w\W]*?\*\//,atrule:{pattern:/@[\w-]+?.*?(;|(?=\s*\{))/i,inside:{rule:/@[\w-]+/}},url:/url\((?:(["'])(\\(?:\r\n|[\w\W])|(?!\1)[^\\\r\n])*\1|.*?)\)/i,selector:/[^\{\}\s][^\{\};]*?(?=\s*\{)/,string:{pattern:/("|')(\\(?:\r\n|[\w\W])|(?!\
1)[^\\\r\n])*\1/,greedy:!0},property:/(\b|\B)[\w-]+(?=\s*:)/i,important:/\B!important\b/i,function:/[-a-z0-9]+(?=\()/i,punctuation:/[(){};:]/},n.languages.css.atrule.inside.rest=n.util.clone(n.languages.css),n.languages.markup&&(n.languages.insertBefore('markup','tag',{style:{pattern:/()[\w\W]*?(?=<\/style>)/i,lookbehind:!0,inside:n.languages.css,alias:'language-css'}}),n.languages.insertBefore('inside','attr-value',{"style-attr":{pattern:/\s*style=("|').*?\1/i,inside:{"attr-name":{pattern:/^\s*style/i,inside:n.languages.markup.tag.inside},punctuation:/^\s*=\s*['"]|['"]\s*$/,"attr-value":{pattern:/.+/i,inside:n.languages.css}},alias:'language-css'}},n.languages.markup.tag)),n.languages.clike={comment:[{pattern:/(^|[^\\])#.*/,lookbehind:!0},{pattern:/(^|[^\\])\/\*[\w\W]*?\*\//,lookbehind:!0},{pattern:/(^|[^\\:])\/\/.*/,lookbehind:!0}],string:{pattern:/(["'])(\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/,greedy:!0},"class-name":{pattern:/((?:\b(?:class|interface|extends|implements|trait|instanceof|new)\s+)|(?:catch\s+\())[a-z0-9_\.\\]+/i,lookbehind:!0,inside:{punctuation:/(\.|\\)/}},keyword:/\b(if|else|while|do|for|return|in|instanceof|function|new|try|throw|catch|finally|null|break|continue)\b/,boolean:/\b(true|false)\b/,function:/[a-z\.0-9_]+(?=\()/i,number:/\b-?(?:0x[\da-f]+|\d*\.?\d+(?:e[+-]?\d+)?)\b/i,operator:/--?|\+\+?|!=?=?|<=?|>=?|==?=?|&&?|\|\|?|\?|\*|\/|~|\^|%/,punctuation:/[{}[\];(),.:]/},n.languages.javascript=n.languages.extend('clike',{keyword:/\b(as|async|await|break|case|catch|class|const|continue|debugger|default|delete|do|else|enum|export|extends|finally|for|from|function|get|if|implements|import|in|instanceof|interface|let|new|null|of|package|private|protected|public|return|set|static|super|switch|this|throw|try|typeof|var|void|while|with|yield)\b/,number:/\b-?(0x[\dA-Fa-f]+|0b[01]+|0o[0-7]+|\d*\.?\d+([Ee][+-]?\d+)?|NaN|Infinity)\b/,function:/[_$a-zA-Z\xA0-\uFFFF][_$a-zA-Z0-9\xA0-\uFFFF]*(?=\()/i,operator:/--?|\+\+?|!=?=?|<=?|>=?|==?=?|&&?|\|\|?|\?|\*\*?|
\/|~|\^|%|\.{3}/}),n.languages.insertBefore('javascript','keyword',{regex:{pattern:/(^|[^/])\/(?!\/)(\[.+?]|\\.|[^/\\\r\n])+\/[gimyu]{0,5}(?=\s*($|[\r\n,.;})]))/,lookbehind:!0,greedy:!0}}),n.languages.insertBefore('javascript','string',{"template-string":{pattern:/`(?:\\\\|\\?[^\\])*?`/,greedy:!0,inside:{interpolation:{pattern:/\$\{[^}]+\}/,inside:{"interpolation-punctuation":{pattern:/^\$\{|\}$/,alias:'punctuation'},rest:n.languages.javascript}},string:/[\s\S]+/}}}),n.languages.markup&&n.languages.insertBefore('markup','tag',{script:{pattern:/()[\w\W]*?(?=<\/script>)/i,lookbehind:!0,inside:n.languages.javascript,alias:'language-javascript'}}),n.languages.js=n.languages.javascript,function(){'undefined'!=typeof self&&self.Prism&&self.document&&document.querySelector&&(self.Prism.fileHighlight=function(){var e={js:'javascript',py:'python',rb:'ruby',ps1:'powershell',psm1:'powershell',sh:'bash',bat:'batch',h:'c',tex:'latex'};Array.prototype.forEach&&Array.prototype.slice.call(document.querySelectorAll('pre[data-src]')).forEach(function(t){for(var i,a=t.getAttribute('data-src'),d=t,r=/\blang(?:uage)?-(?!\*)(\w+)\b/i;d&&!r.test(d.className);)d=d.parentNode;if(d&&(i=(t.className.match(r)||[,''])[1]),!i){var o=(a.match(/\.(\w+)$/)||[,''])[1];i=e[o]||o}var l=document.createElement('code');l.className='language-'+i,t.textContent='',l.textContent='Loading\u2026',t.appendChild(l);var s=new XMLHttpRequest;s.open('GET',a,!0),s.onreadystatechange=function(){4==s.readyState&&(400>s.status&&s.responseText?(l.textContent=s.responseText,n.highlightElement(l)):400<=s.status?l.textContent='\u2716 Error '+s.status+' while fetching file: '+s.statusText:l.textContent='\u2716 Error: File does not exist or is 
empty')},s.send(null)})},document.addEventListener('DOMContentLoaded',self.Prism.fileHighlight))}()});Prism.languages.python={"triple-quoted-string":{pattern:/"""[\s\S]+?"""|'''[\s\S]+?'''/,alias:'string'},comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:{pattern:/("|')(?:\\\\|\\?[^\\\r\n])*?\1/,greedy:!0},function:{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_][a-zA-Z0-9_]*(?=\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)[a-z0-9_]+/i,lookbehind:!0},keyword:/\b(?:as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|pass|print|raise|return|try|while|with|yield)\b/,boolean:/\b(?:True|False)\b/,number:/\b-?(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*\.?\d*|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]|\b(?:or|and|not)\b/,punctuation:/[{}[\];(),.:]/},Prism.languages.clike={comment:[{pattern:/(^|[^\\])#.*/,lookbehind:!0},{pattern:/(^|[^\\])\/\*[\w\W]*?\*\//,lookbehind:!0},{pattern:/(^|[^\\:])\/\/.*/,lookbehind:!0}],string:{pattern:/(["'])(\\(?:\r\n|[\s\S])|(?!\1)[^\\\r\n])*\1/,greedy:!0},"class-name":{pattern:/((?:\b(?:class|interface|extends|implements|trait|instanceof|new)\s+)|(?:catch\s+\())[a-z0-9_\.\\]+/i,lookbehind:!0,inside:{punctuation:/(\.|\\)/}},keyword:/\b(if|else|while|do|for|return|in|instanceof|function|new|try|throw|catch|finally|null|break|continue)\b/,boolean:/\b(true|false)\b/,function:/[a-z\.0-9_]+(?=\()/i,number:/\b-?(?:0x[\da-f]+|\d*\.?\d+(?:e[+-]?\d+)?)\b/i,operator:/--?|\+\+?|!=?=?|<=?|>=?|==?=?|&&?|\|\|?|\?|\*|\/|~|\^|%/,punctuation:/[{}[\];(),.:]/},Prism.languages.lua={comment:/^#!.+|--(?:\[(=*)\[[\s\S]*?\]\1\]|.*)/m,string:{pattern:/(["'])(?:(?!\1)[^\\\r\n]|\\z(?:\r\n|\s)|\\(?:\r\n|[\s\S]))*\1|\[(=*)\[[\s\S]*?\]\2\]/,greedy:!0},number:/\b0x[a-f\d]+\.?[a-f\d]*(?:p[+-]?\d+)?\b|\b\d+(?:\.\B|\.?\d*(?:e[+-]?\d+)?\b)|\B\.\d+(?:e[+-]?\d+)?\b/i,keyword:/\b(?:and|break|do|else|elseif|end|false|for|function|goto|if|in|local|nil|not|or|repeat|return|t
hen|true|until|while)\b/,function:/(?!\d)\w+(?=\s*(?:[({]))/,operator:[/[-+*%^&|#]|\/\/?|<[<=]?|>[>=]?|[=~]=?/,{pattern:/(^|[^.])\.\.(?!\.)/,lookbehind:!0}],punctuation:/[\[\](){},;]|\.+|:+/},function(e){var t={variable:[{pattern:/\$?\(\([\w\W]+?\)\)/,inside:{variable:[{pattern:/(^\$\(\([\w\W]+)\)\)/,lookbehind:!0},/^\$\(\(/],number:/\b-?(?:0x[\dA-Fa-f]+|\d*\.?\d+(?:[Ee]-?\d+)?)\b/,operator:/--?|-=|\+\+?|\+=|!=?|~|\*\*?|\*=|\/=?|%=?|<<=?|>>=?|<=?|>=?|==?|&&?|&=|\^=?|\|\|?|\|=|\?|:/,punctuation:/\(\(?|\)\)?|,|;/}},{pattern:/\$\([^)]+\)|`[^`]+`/,inside:{variable:/^\$\(|^`|\)$|`$/}},/\$(?:[a-z0-9_#\?\*!@]+|\{[^}]+\})/i]};e.languages.bash={shebang:{pattern:/^#!\s*\/bin\/bash|^#!\s*\/bin\/sh/,alias:'important'},comment:{pattern:/(^|[^"{\\])#.*/,lookbehind:!0},string:[{pattern:/((?:^|[^<])<<\s*)(?:"|')?(\w+?)(?:"|')?\s*\r?\n(?:[\s\S])*?\r?\n\2/g,lookbehind:!0,greedy:!0,inside:t},{pattern:/(["'])(?:\\\\|\\?[^\\])*?\1/g,greedy:!0,inside:t}],variable:t.variable,function:{pattern:/(^|\s|;|\||&)(?:alias|apropos|apt-get|aptitude|aspell|awk|basename|bash|bc|bg|builtin|bzip2|cal|cat|cd|cfdisk|chgrp|chmod|chown|chroot|chkconfig|cksum|clear|cmp|comm|command|cp|cron|crontab|csplit|cut|date|dc|dd|ddrescue|df|diff|diff3|dig|dir|dircolors|dirname|dirs|dmesg|du|egrep|eject|enable|env|ethtool|eval|exec|expand|expect|export|expr|fdformat|fdisk|fg|fgrep|file|find|fmt|fold|format|free|fsck|ftp|fuser|gawk|getopts|git|grep|groupadd|groupdel|groupmod|groups|gzip|hash|head|help|hg|history|hostname|htop|iconv|id|ifconfig|ifdown|ifup|import|install|jobs|join|kill|killall|less|link|ln|locate|logname|logout|look|lpc|lpr|lprint|lprintd|lprintq|lprm|ls|lsof|make|man|mkdir|mkfifo|mkisofs|mknod|more|most|mount|mtools|mtr|mv|mmv|nano|netstat|nice|nl|nohup|notify-send|npm|nslookup|open|op|passwd|paste|pathchk|ping|pkill|popd|pr|printcap|printenv|printf|ps|pushd|pv|pwd|quota|quotacheck|quotactl|ram|rar|rcp|read|readarray|readonly|reboot|rename|renice|remsync|rev|rm|rmdir|rsync|screen|scp|sdiff|sed|seq|ser
vice|sftp|shift|shopt|shutdown|sleep|slocate|sort|source|split|ssh|stat|strace|su|sudo|sum|suspend|sync|tail|tar|tee|test|time|timeout|times|touch|top|traceroute|trap|tr|tsort|tty|type|ulimit|umask|umount|unalias|uname|unexpand|uniq|units|unrar|unshar|uptime|useradd|userdel|usermod|users|uuencode|uudecode|v|vdir|vi|vmstat|wait|watch|wc|wget|whereis|which|who|whoami|write|xargs|xdg-open|yes|zip)(?=$|\s|;|\||&)/,lookbehind:!0},keyword:{pattern:/(^|\s|;|\||&)(?:let|:|\.|if|then|else|elif|fi|for|break|continue|while|in|case|function|select|do|done|until|echo|exit|return|set|declare)(?=$|\s|;|\||&)/,lookbehind:!0},boolean:{pattern:/(^|\s|;|\||&)(?:true|false)(?=$|\s|;|\||&)/,lookbehind:!0},operator:/&&?|\|\|?|==?|!=?|<<>|<=?|>=?|=~/,punctuation:/\$?\(\(?|\)\)?|\.\.|[{}[\];]/};var n=t.variable[1].inside;n['function']=e.languages.bash['function'],n.keyword=e.languages.bash.keyword,n.boolean=e.languages.bash.boolean,n.operator=e.languages.bash.operator,n.punctuation=e.languages.bash.punctuation}(Prism),Prism.languages.go=Prism.languages.extend('clike',{keyword:/\b(break|case|chan|const|continue|default|defer|else|fallthrough|for|func|go(to)?|if|import|interface|map|package|range|return|select|struct|switch|type|var)\b/,builtin:/\b(bool|byte|complex(64|128)|error|float(32|64)|rune|string|u?int(8|16|32|64|)|uintptr|append|cap|close|complex|copy|delete|imag|len|make|new|panic|print(ln)?|real|recover)\b/,boolean:/\b(_|iota|nil|true|false)\b/,operator:/[*\/%^!=]=?|\+[=+]?|-[=-]?|\|[=|]?|&(?:=|&|\^=?)?|>(?:>=?|=)?|<(?:<=?|=|-)?|:=|\.\.\./,number:/\b(-?(0x[a-f\d]+|(\d+\.?\d*|\.\d+)(e[-+]?\d+)?)i?)\b/i,string:/("|'|`)(\\?.|\r|\n)*?\1/}),delete Prism.languages.go['class-name'],Prism.languages.markdown=Prism.languages.extend('markup',{}),Prism.languages.insertBefore('markdown','prolog',{blockquote:{pattern:/^>(?:[\t ]*>)*/m,alias:'punctuation'},code:[{pattern:/^(?: 
{4}|\t).+/m,alias:'keyword'},{pattern:/``.+?``|`[^`\n]+`/,alias:'keyword'}],title:[{pattern:/\w+.*(?:\r?\n|\r)(?:==+|--+)/,alias:'important',inside:{punctuation:/==+$|--+$/}},{pattern:/(^\s*)#+.+/m,lookbehind:!0,alias:'important',inside:{punctuation:/^#+|#+$/}}],hr:{pattern:/(^\s*)([*-])([\t ]*\2){2,}(?=\s*$)/m,lookbehind:!0,alias:'punctuation'},list:{pattern:/(^\s*)(?:[*+-]|\d+\.)(?=[\t ].)/m,lookbehind:!0,alias:'punctuation'},"url-reference":{pattern:/!?\[[^\]]+\]:[\t ]+(?:\S+|<(?:\\.|[^>\\])+>)(?:[\t ]+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\)))?/,inside:{variable:{pattern:/^(!?\[)[^\]]+/,lookbehind:!0},string:/(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\((?:\\.|[^)\\])*\))$/,punctuation:/^[\[\]!:]|[<>]/},alias:'url'},bold:{pattern:/(^|[^\\])(\*\*|__)(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^\*\*|^__|\*\*$|__$/}},italic:{pattern:/(^|[^\\])([*_])(?:(?:\r?\n|\r)(?!\r?\n|\r)|.)+?\2/,lookbehind:!0,inside:{punctuation:/^[*_]|[*_]$/}},url:{pattern:/!?\[[^\]]+\](?:\([^\s)]+(?:[\t ]+"(?:\\.|[^"\\])*")?\)| 
?\[[^\]\n]*\])/,inside:{variable:{pattern:/(!?\[)[^\]]+(?=\]$)/,lookbehind:!0},string:{pattern:/"(?:\\.|[^"\\])*"(?=\)$)/}}}}),Prism.languages.markdown.bold.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.italic.inside.url=Prism.util.clone(Prism.languages.markdown.url),Prism.languages.markdown.bold.inside.italic=Prism.util.clone(Prism.languages.markdown.italic),Prism.languages.markdown.italic.inside.bold=Prism.util.clone(Prism.languages.markdown.bold),Prism.languages.julia={comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},string:/"""[\s\S]+?"""|'''[\s\S]+?'''|("|')(\\?.)*?\1/,keyword:/\b(abstract|baremodule|begin|bitstype|break|catch|ccall|const|continue|do|else|elseif|end|export|finally|for|function|global|if|immutable|import|importall|let|local|macro|module|print|println|quote|return|try|type|typealias|using|while)\b/,boolean:/\b(true|false)\b/,number:/\b-?(0[box])?(?:[\da-f]+\.?\d*|\.\d+)(?:[efp][+-]?\d+)?j?\b/i,operator:/\+=?|-=?|\*=?|\/[\/=]?|\\=?|\^=?|%=?|÷=?|!=?=?|&=?|\|[=>]?|\$=?|<(?:<=?|[=:])?|>(?:=|>>?=?)?|==?=?|[~≠≤≥]/,punctuation:/[{}[\];(),.:]/};const Ii=ti('d-code',` - - - - -`);class Ni extends ei(Ii(HTMLElement)){renderContent(){if(this.languageName=this.getAttribute('language'),!this.languageName)return void console.warn('You need to provide a language attribute to your block to let us know how to highlight your code; e.g.:\n zeros = np.zeros(shape).');const e=Ui.languages[this.languageName];if(void 0==e)return void console.warn(`Distill does not yet support highlighting your code block in "${this.languageName}'.`);let t=this.textContent;const n=this.shadowRoot.querySelector('#code-container');if(this.hasAttribute('block')){t=t.replace(/\n/,'');const e=t.match(/\s*/);if(t=t.replace(new RegExp('\n'+e,'g'),'\n'),t=t.trim(),n.parentNode instanceof ShadowRoot){const 
e=document.createElement('pre');this.shadowRoot.removeChild(n),e.appendChild(n),this.shadowRoot.appendChild(e)}}n.className=`language-${this.languageName}`,n.innerHTML=Ui.highlight(t,e)}}const ji=ti('d-footnote',` - - - -
- -
-
- - - - - -`);class Ri extends ji(HTMLElement){constructor(){super();const e=new MutationObserver(this.notify);e.observe(this,{childList:!0,characterData:!0,subtree:!0})}notify(){const e={detail:this,bubbles:!0},t=new CustomEvent('onFootnoteChanged',e);document.dispatchEvent(t)}connectedCallback(){this.hoverBox=this.root.querySelector('d-hover-box'),window.customElements.whenDefined('d-hover-box').then(()=>{this.hoverBox.listen(this)}),Ri.currentFootnoteId+=1;const e=Ri.currentFootnoteId.toString();this.root.host.id='d-footnote-'+e;const t='dt-fn-hover-box-'+e;this.hoverBox.id=t;const n=this.root.querySelector('#fn-');n.setAttribute('id','fn-'+e),n.setAttribute('data-hover-ref',t),n.textContent=e}}Ri.currentFootnoteId=0;const qi=ti('d-footnote-list',` - - -

Footnotes

-
    -`,!1);class Fi extends qi(HTMLElement){connectedCallback(){super.connectedCallback(),this.list=this.root.querySelector('ol'),this.root.style.display='none'}set footnotes(e){if(this.list.innerHTML='',e.length){this.root.style.display='';for(const t of e){const e=document.createElement('li');e.id=t.id+'-listing',e.innerHTML=t.innerHTML;const n=document.createElement('a');n.setAttribute('class','footnote-backlink'),n.textContent='[\u21A9]',n.href='#'+t.id,e.appendChild(n),this.list.appendChild(e)}}else this.root.style.display='none'}}const Pi=ti('d-hover-box',` - - -
    -
    - -
    -
    -`);class Hi extends Pi(HTMLElement){constructor(){super()}connectedCallback(){}listen(e){this.bindDivEvents(this),this.bindTriggerEvents(e)}bindDivEvents(e){e.addEventListener('mouseover',()=>{this.visible||this.showAtNode(e),this.stopTimeout()}),e.addEventListener('mouseout',()=>{this.extendTimeout(500)}),e.addEventListener('touchstart',(e)=>{e.stopPropagation()},{passive:!0}),document.body.addEventListener('touchstart',()=>{this.hide()},{passive:!0})}bindTriggerEvents(e){e.addEventListener('mouseover',()=>{this.visible||this.showAtNode(e),this.stopTimeout()}),e.addEventListener('mouseout',()=>{this.extendTimeout(300)}),e.addEventListener('touchstart',(t)=>{this.visible?this.hide():this.showAtNode(e),t.stopPropagation()},{passive:!0})}show(e){this.visible=!0,this.style.display='block',this.style.top=Pn(e[1]+10)+'px'}showAtNode(e){const t=e.getBoundingClientRect();this.show([e.offsetLeft+t.width,e.offsetTop+t.height])}hide(){this.visible=!1,this.style.display='none',this.stopTimeout()}stopTimeout(){this.timeout&&clearTimeout(this.timeout)}extendTimeout(e){this.stopTimeout(),this.timeout=setTimeout(()=>{this.hide()},e)}}class zi extends HTMLElement{static get is(){return'd-title'}}const Yi=ti('d-references',` - -`,!1);class Bi extends Yi(HTMLElement){}class Wi extends HTMLElement{static get is(){return'd-toc'}connectedCallback(){this.getAttribute('prerendered')||(window.onload=()=>{const e=document.querySelector('d-article'),t=e.querySelectorAll('h2, h3');k(this,t)})}}class Vi extends HTMLElement{static get is(){return'd-figure'}static get readyQueue(){return Vi._readyQueue||(Vi._readyQueue=[]),Vi._readyQueue}static addToReadyQueue(e){-1===Vi.readyQueue.indexOf(e)&&(Vi.readyQueue.push(e),Vi.runReadyQueue())}static runReadyQueue(){const 
e=Vi.readyQueue.sort((e,t)=>e._seenOnScreen-t._seenOnScreen).filter((e)=>!e._ready).pop();e&&(e.ready(),requestAnimationFrame(Vi.runReadyQueue))}constructor(){super(),this._ready=!1,this._onscreen=!1,this._offscreen=!0}connectedCallback(){this.loadsWhileScrolling=this.hasAttribute('loadsWhileScrolling'),Vi.marginObserver.observe(this),Vi.directObserver.observe(this)}disconnectedCallback(){Vi.marginObserver.unobserve(this),Vi.directObserver.unobserve(this)}static get marginObserver(){if(!Vi._marginObserver){const e=window.innerHeight,t=Fn(2*e),n=Vi.didObserveMarginIntersection,i=new IntersectionObserver(n,{rootMargin:t+'px 0px '+t+'px 0px',threshold:0.01});Vi._marginObserver=i}return Vi._marginObserver}static didObserveMarginIntersection(e){for(const t of e){const e=t.target;t.isIntersecting&&!e._ready&&Vi.addToReadyQueue(e)}}static get directObserver(){return Vi._directObserver||(Vi._directObserver=new IntersectionObserver(Vi.didObserveDirectIntersection,{rootMargin:'0px',threshold:[0,1]})),Vi._directObserver}static didObserveDirectIntersection(e){for(const t of e){const e=t.target;t.isIntersecting?(e._seenOnScreen=new Date,e._offscreen&&e.onscreen()):e._onscreen&&e.offscreen()}}addEventListener(e,t){super.addEventListener(e,t),'ready'===e&&-1!==Vi.readyQueue.indexOf(this)&&(this._ready=!1,Vi.runReadyQueue()),'onscreen'===e&&this.onscreen()}ready(){this._ready=!0,Vi.marginObserver.unobserve(this);const e=new CustomEvent('ready');this.dispatchEvent(e)}onscreen(){this._onscreen=!0,this._offscreen=!1;const e=new CustomEvent('onscreen');this.dispatchEvent(e)}offscreen(){this._onscreen=!1,this._offscreen=!0;const e=new CustomEvent('offscreen');this.dispatchEvent(e)}}if('undefined'!=typeof window){Vi.isScrolling=!1;let e;window.addEventListener('scroll',()=>{Vi.isScrolling=!0,clearTimeout(e),e=setTimeout(()=>{Vi.isScrolling=!1,Vi.runReadyQueue()},500)},!0)}const Ki=ti('d-interstitial',` - - -
    -
    -

    This article is in review.

    -

    Do not share this URL or the contents of this article. Thank you!

    - -

    Enter the password we shared with you as part of the review process to view the article.

    -
    -
    -`);class $i extends Ki(HTMLElement){connectedCallback(){if(this.shouldRemoveSelf())this.parentElement.removeChild(this);else{const e=this.root.querySelector('#interstitial-password-input');e.oninput=(e)=>this.passwordChanged(e)}}passwordChanged(e){const t=e.target.value;t===this.password&&(console.log('Correct password entered.'),this.parentElement.removeChild(this),'undefined'!=typeof Storage&&(console.log('Saved that correct password was entered.'),localStorage.setItem(this.localStorageIdentifier(),'true')))}shouldRemoveSelf(){return window&&window.location.hostname==='distill.pub'?(console.warn('Interstitial found on production, hiding it.'),!0):'undefined'!=typeof Storage&&'true'===localStorage.getItem(this.localStorageIdentifier())&&(console.log('Loaded that correct password was entered before; skipping interstitial.'),!0)}localStorageIdentifier(){return'distill-drafts'+(window?window.location.pathname:'-')+'interstitial-password-correct'}}var Xi=function(e,t){return et?1:e>=t?0:NaN},Ji=function(e){return 1===e.length&&(e=v(e)),{left:function(t,n,i,a){for(null==i&&(i=0),null==a&&(a=t.length);i>>1;0>e(t[d],n)?i=d+1:a=d}return i},right:function(t,n,i,a){for(null==i&&(i=0),null==a&&(a=t.length);i>>1;0(i=arguments.length)?(t=e,e=0,1):3>i?1:+a;for(var d=-1,i=0|Rn(0,qn((t-e)/a)),n=Array(i);++d=this.r&&0<=this.g&&255>=this.g&&0<=this.b&&255>=this.b&&0<=this.opacity&&1>=this.opacity},toString:function(){var e=this.opacity;return e=isNaN(e)?1:Rn(0,Hn(1,e)),(1===e?'rgb(':'rgba(')+Rn(0,Hn(255,Pn(this.r)||0))+', '+Rn(0,Hn(255,Pn(this.g)||0))+', '+Rn(0,Hn(255,Pn(this.b)||0))+(1===e?')':', '+e+')')}})),ra(F,function(e,t,n,i){return 1===arguments.length?q(e):new F(e,t,n,null==i?1:i)},_(L,{brighter:function(e){return e=null==e?la:In(la,e),new F(this.h,this.s,this.l*e,this.opacity)},darker:function(e){return e=null==e?oa:In(oa,e),new F(this.h,this.s,this.l*e,this.opacity)},rgb:function(){var 
e=this.h%360+360*(0>this.h),t=isNaN(e)||isNaN(this.s)?0:this.s,n=this.l,i=n+(0.5>n?n:1-n)*t,a=2*n-i;return new j(P(240<=e?e-240:e+120,a,i),P(e,a,i),P(120>e?e+240:e-120,a,i),this.opacity)},displayable:function(){return(0<=this.s&&1>=this.s||isNaN(this.s))&&0<=this.l&&1>=this.l&&0<=this.opacity&&1>=this.opacity}}));var ya=On/180,xa=180/On,ka=18,Kn=0.95047,Xn=1,Yn=1.08883,Zn=4/29,va=6/29,wa=3*va*va,Sa=va*va*va;ra(Y,function(e,t,n,i){return 1===arguments.length?H(e):new Y(e,t,n,null==i?1:i)},_(L,{brighter:function(e){return new Y(this.l+ka*(null==e?1:e),this.a,this.b,this.opacity)},darker:function(e){return new Y(this.l-ka*(null==e?1:e),this.a,this.b,this.opacity)},rgb:function(){var e=(this.l+16)/116,t=isNaN(this.a)?e:e+this.a/500,n=isNaN(this.b)?e:e-this.b/200;return e=Xn*V(e),t=Kn*V(t),n=Yn*V(n),new j(K(3.2404542*t-1.5371385*e-0.4985314*n),K(-0.969266*t+1.8760108*e+0.041556*n),K(0.0556434*t-0.2040259*e+1.0572252*n),this.opacity)}})),ra(X,function(e,t,n,i){return 1===arguments.length?z(e):new X(e,t,n,null==i?1:i)},_(L,{brighter:function(e){return new X(this.h,this.c,this.l+ka*(null==e?1:e),this.opacity)},darker:function(e){return new X(this.h,this.c,this.l-ka*(null==e?1:e),this.opacity)},rgb:function(){return H(this).rgb()}}));var Ca=-0.14861,A=+1.78277,B=-0.29227,C=-0.90649,D=+1.97294,E=D*C,Ta=D*A,_a=A*B-C*Ca;ra(Z,Q,_(L,{brighter:function(e){return e=null==e?la:In(la,e),new Z(this.h,this.s,this.l*e,this.opacity)},darker:function(e){return e=null==e?oa:In(oa,e),new Z(this.h,this.s,this.l*e,this.opacity)},rgb:function(){var e=isNaN(this.h)?0:(this.h+120)*ya,t=+this.l,n=isNaN(this.s)?0:this.s*t*(1-t),i=Mn(e),a=Dn(e);return new j(255*(t+n*(Ca*i+A*a)),255*(t+n*(B*i+C*a)),255*(t+n*(D*i)),this.opacity)}}));var La=function(e){return function(){return e}},Aa=function e(t){function n(e,t){var n=i((e=N(e)).r,(t=N(t)).r),a=i(e.g,t.g),d=i(e.b,t.b),r=ne(e.opacity,t.opacity);return function(i){return e.r=n(i),e.g=a(i),e.b=d(i),e.opacity=r(i),e+''}}var i=te(t);return 
n.gamma=e,n}(1),Ea=function(e,t){var n,i=t?t.length:0,a=e?Hn(i,e.length):0,d=Array(i),r=Array(i);for(n=0;nr&&(d=n.slice(r,d),l[o]?l[o]+=d:l[++o]=d),(t=t[0])===(a=a[0])?l[o]?l[o]+=a:l[++o]=a:(l[++o]=null,s.push({i:o,x:Ma(t,a)})),r=Ia.lastIndex;return rl.length?s[0]?ae(s[0].x):ie(n):(n=s.length,function(e){for(var t,a=0;an?n-360*Pn(n/360):n):La(isNaN(e)?t:e)});var qa,Fa=de(ne),Pa=function(e){return function(){return e}},Ha=function(e){return+e},za=[0,1],Ya=function(e,t){if(0>(n=(e=t?e.toExponential(t-1):e.toExponential()).indexOf('e')))return null;var n,i=e.slice(0,n);return[1d&&(o=Rn(1,d-l)),i.push(a.substring(r-=o,r+o)),!((l+=o+1)>d));)o=e[t=(t+1)%e.length];return i.reverse().join(n)}},Va=function(e){return function(t){return t.replace(/[0-9]/g,function(t){return e[+t]})}},Ka=function(e,t){var n=Ya(e,t);if(!n)return e+'';var i=n[0],a=n[1];return 0>a?'0.'+Array(-a).join('0')+i:i.length>a+1?i.slice(0,a+1)+'.'+i.slice(a+1):i+Array(a-i.length+2).join('0')},$a={"":function(e,t){e=e.toPrecision(t);out:for(var a,d=e.length,n=1,i=-1;ni?r+Array(l-i+1).join('0'):0=^]))?([+\-\( ])?([$#])?(0)?(\d+)?(,)?(\.\d+)?([a-z%])?$/i;fe.prototype=he.prototype,he.prototype.toString=function(){return this.fill+this.align+this.sign+this.symbol+(this.zero?'0':'')+(null==this.width?'':Rn(1,0|this.width))+(this.comma?',':'')+(null==this.precision?'':'.'+Rn(0,0|this.precision))+this.type};var re,Ja,Qa,Za=function(e){return e},Ga=['y','z','a','f','p','n','\xB5','m','','k','M','G','T','P','E','Z','Y'],ed=function(e){function t(e){function t(e){var t,i,n,c=b,k=m;if('c'===h)k=y(e)+k,e='';else{e=+e;var v=0>e;if(e=y(Un(e),f),v&&0==+e&&(v=!1),c=(v?'('===s?s:'-':'-'===s||'('===s?'':s)+c,k=k+('s'===h?Ga[8+qa/3]:'')+(v&&'('===s?')':''),x)for(t=-1,i=e.length;++tn||57>1)+c+e+k+S.slice(w);break;default:e=S+c+e+k;}return r(e)}e=fe(e);var 
o=e.fill,l=e.align,s=e.sign,c=e.symbol,u=e.zero,p=e.width,g=e.comma,f=e.precision,h=e.type,b='$'===c?n[0]:'#'===c&&/[boxX]/.test(h)?'0'+h.toLowerCase():'',m='$'===c?n[1]:/[%p]/.test(h)?i:'',y=$a[h],x=!h||/[defgprs%]/.test(h);return f=null==f?h?6:12:/[gprs]/.test(h)?Rn(1,Hn(21,f)):Rn(0,Hn(20,f)),t.toString=function(){return e+''},t}var a=e.grouping&&e.thousands?Wa(e.grouping,e.thousands):Za,n=e.currency,d=e.decimal,r=e.numerals?Va(e.numerals):Za,i=e.percent||'%';return{format:t,formatPrefix:function(n,i){var a=t((n=fe(n),n.type='f',n)),d=3*Rn(-8,Hn(8,Fn(Ba(i)/3))),r=In(10,-d),o=Ga[8+d/3];return function(e){return a(r*e)+o}}}};(function(e){return re=ed(e),Ja=re.format,Qa=re.formatPrefix,re})({decimal:'.',thousands:',',grouping:[3],currency:['$','']});var td=function(e){return Rn(0,-Ba(Un(e)))},nd=function(e,t){return Rn(0,3*Rn(-8,Hn(8,Fn(Ba(t)/3)))-Ba(Un(e)))},id=function(e,t){return e=Un(e),t=Un(t)-e,Rn(0,Ba(t)-Ba(e))+1},ad=function(e,t,n){var i,a=e[0],d=e[e.length-1],r=S(a,d,null==t?10:t);switch(n=fe(null==n?',f':n),n.type){case's':{var o=Rn(Un(a),Un(d));return null!=n.precision||isNaN(i=nd(r,o))||(n.precision=i),Qa(n,o)}case'':case'e':case'g':case'p':case'r':{null!=n.precision||isNaN(i=id(r,Rn(Un(a),Un(d))))||(n.precision=i-('e'===n.type));break}case'f':case'%':{null!=n.precision||isNaN(i=td(r))||(n.precision=i-2*('%'===n.type));break}}return Ja(n)},dd=new Date,rd=new Date,od=ye(function(){},function(e,t){e.setTime(+e+t)},function(e,t){return t-e});od.every=function(e){return e=Fn(e),isFinite(e)&&0t&&(t+=cd),e.setTime(Fn((+e-t)/cd)*cd+t)},function(e,t){e.setTime(+e+t*cd)},function(e,t){return(t-e)/cd},function(e){return e.getHours()}),bd=ye(function(e){e.setHours(0,0,0,0)},function(e,t){e.setDate(e.getDate()+t)},function(e,t){return(t-e-(t.getTimezoneOffset()-e.getTimezoneOffset())*sd)/ud},function(e){return 
e.getDate()-1}),md=xe(0),yd=xe(1),xd=xe(2),kd=xe(3),vd=xe(4),wd=xe(5),Sd=xe(6),Cd=ye(function(e){e.setDate(1),e.setHours(0,0,0,0)},function(e,t){e.setMonth(e.getMonth()+t)},function(e,t){return t.getMonth()-e.getMonth()+12*(t.getFullYear()-e.getFullYear())},function(e){return e.getMonth()}),Td=ye(function(e){e.setMonth(0,1),e.setHours(0,0,0,0)},function(e,t){e.setFullYear(e.getFullYear()+t)},function(e,t){return t.getFullYear()-e.getFullYear()},function(e){return e.getFullYear()});Td.every=function(e){return isFinite(e=Fn(e))&&0arguments.length){for(;++ot&&(this._names.push(e),this._node.setAttribute('class',this._names.join(' ')))},remove:function(e){var t=this._names.indexOf(e);0<=t&&(this._names.splice(t,1),this._node.setAttribute('class',this._names.join(' ')))},contains:function(e){return 0<=this._names.indexOf(e)}};var wr=[null];xn.prototype=function(){return new xn([[document.documentElement]],wr)}.prototype={constructor:xn,select:function(e){'function'!=typeof e&&(e=br(e));for(var t=this._groups,a=t.length,d=Array(a),r=0;r=v&&(v=k+1);!(x=b[v])&&++varguments.length){var i=this.node();return n.local?i.getAttributeNS(n.space,n.local):i.getAttribute(n)}return this.each((null==t?n.local?Ft:qt:'function'==typeof t?n.local?Yt:zt:n.local?Ht:Pt)(n,t))},style:function(e,t,n){return 1arguments.length){for(var d=Zt(this.node()),r=-1,i=a.length;++rarguments.length){var n=this.node().__on;if(n)for(var s,o=0,c=n.length;oarguments.length&&(a=t,t=gr().changedTouches);for(var d,r=0,i=t?t.length:0;rx}b.mouse('drag')}function i(){Sr(ur.view).on('mousemove.drag mouseup.drag',null),vn(ur.view,c),Tr(),b.mouse('end')}function a(){if(p.apply(this,arguments)){var e,t,i=ur.changedTouches,a=g.apply(this,arguments),d=i.length;for(e=0;e - :host { - position: relative; - display: inline-block; - } - - :host(:focus) { - outline: none; - } - - .background { - padding: 9px 0; - color: white; - position: relative; - } - - .track { - height: 3px; - width: 100%; - border-radius: 2px; - 
background-color: hsla(0, 0%, 0%, 0.2); - } - - .track-fill { - position: absolute; - top: 9px; - height: 3px; - border-radius: 4px; - background-color: hsl(24, 100%, 50%); - } - - .knob-container { - position: absolute; - top: 10px; - } - - .knob { - position: absolute; - top: -6px; - left: -6px; - width: 13px; - height: 13px; - background-color: hsl(24, 100%, 50%); - border-radius: 50%; - transition-property: transform; - transition-duration: 0.18s; - transition-timing-function: ease; - } - .mousedown .knob { - transform: scale(1.5); - } - - .knob-highlight { - position: absolute; - top: -6px; - left: -6px; - width: 13px; - height: 13px; - background-color: hsla(0, 0%, 0%, 0.1); - border-radius: 50%; - transition-property: transform; - transition-duration: 0.18s; - transition-timing-function: ease; - } - - .focus .knob-highlight { - transform: scale(2); - } - - .ticks { - position: absolute; - top: 16px; - height: 4px; - width: 100%; - z-index: -1; - } - - .ticks .tick { - position: absolute; - height: 100%; - border-left: 1px solid hsla(0, 0%, 0%, 0.2); - } - - - -
    -
    -
    -
    -
    -
    -
    -
    -
    -`),Dr={left:37,up:38,right:39,down:40,pageUp:33,pageDown:34,end:35,home:36};class Mr extends Er(HTMLElement){connectedCallback(){this.connected=!0,this.setAttribute('role','slider'),this.hasAttribute('tabindex')||this.setAttribute('tabindex',0),this.mouseEvent=!1,this.knob=this.root.querySelector('.knob-container'),this.background=this.root.querySelector('.background'),this.trackFill=this.root.querySelector('.track-fill'),this.track=this.root.querySelector('.track'),this.min=this.min?this.min:0,this.max=this.max?this.max:100,this.scale=me().domain([this.min,this.max]).range([0,1]).clamp(!0),this.origin=this.origin===void 0?this.min:this.origin,this.step=this.step?this.step:1,this.update(this.value?this.value:0),this.ticks=!!this.ticks&&this.ticks,this.renderTicks(),this.drag=Ar().container(this.background).on('start',()=>{this.mouseEvent=!0,this.background.classList.add('mousedown'),this.changeValue=this.value,this.dragUpdate()}).on('drag',()=>{this.dragUpdate()}).on('end',()=>{this.mouseEvent=!1,this.background.classList.remove('mousedown'),this.dragUpdate(),this.changeValue!==this.value&&this.dispatchChange(),this.changeValue=this.value}),this.drag(Sr(this.background)),this.addEventListener('focusin',()=>{this.mouseEvent||this.background.classList.add('focus')}),this.addEventListener('focusout',()=>{this.background.classList.remove('focus')}),this.addEventListener('keydown',this.onKeyDown)}static get observedAttributes(){return['min','max','value','step','ticks','origin','tickValues','tickLabels']}attributeChangedCallback(e,t,n){isNaN(n)||void 0===n||null===n||('min'==e&&(this.min=+n,this.setAttribute('aria-valuemin',this.min)),'max'==e&&(this.max=+n,this.setAttribute('aria-valuemax',this.max)),'value'==e&&this.update(+n),'origin'==e&&(this.origin=+n),'step'==e&&0{const n=document.createElement('div');n.classList.add('tick'),n.style.left=100*this.scale(t)+'%',e.appendChild(n)})}else e.style.display='none'}}var Or='\n \n\n';const Ur=ti('distill-header',` - - 
-`,!1);class Ir extends Ur(HTMLElement){}const Nr=` - -`;class jr extends HTMLElement{static get is(){return'distill-appendix'}set frontMatter(e){this.innerHTML=Ln(e)}}const Rr=ti('distill-footer',` - - -
    - - is dedicated to clear explanations of machine learning - - - -
    - -`);class qr extends Rr(HTMLElement){}const Fr=function(){if(1>window.distillRunlevel)throw new Error('Insufficient Runlevel for Distill Template!');if('distillTemplateIsLoading'in window&&window.distillTemplateIsLoading)throw new Error('Runlevel 1: Distill Template is getting loaded more than once, aborting!');else window.distillTemplateIsLoading=!0,console.info('Runlevel 1: Distill Template has started loading.');p(document),console.info('Runlevel 1: Static Distill styles have been added.'),console.info('Runlevel 1->2.'),window.distillRunlevel+=1;for(const[e,t]of Object.entries(hi.listeners))'function'==typeof t?document.addEventListener(e,t):console.error('Runlevel 2: Controller listeners need to be functions!');console.info('Runlevel 2: We can now listen to controller events.'),console.info('Runlevel 2->3.'),window.distillRunlevel+=1;if(2>window.distillRunlevel)throw new Error('Insufficient Runlevel for adding custom elements!');const e=[ki,wi,Ci,Li,Ai,Di,Oi,Ni,Ri,Fi,pi,Hi,zi,T,Bi,Wi,Vi,Mr,$i].concat([Ir,jr,qr]);for(const t of e)console.info('Runlevel 2: Registering custom element: '+t.is),customElements.define(t.is,t);console.info('Runlevel 3: Distill Template finished registering custom elements.'),console.info('Runlevel 3->4.'),window.distillRunlevel+=1,hi.listeners.DOMContentLoaded(),console.info('Runlevel 4: Distill Template initialisation complete.')};window.distillRunlevel=0,yi.browserSupportsAllFeatures()?(console.info('Runlevel 0: No need for polyfills.'),console.info('Runlevel 0->1.'),window.distillRunlevel+=1,Fr()):(console.info('Runlevel 0: Distill Template is loading polyfills.'),yi.load(Fr))}); -//# sourceMappingURL=template.v2.js.map -} diff --git a/_articles/RJ-2024-001/RJ-2024-001_files/header-attrs-2.29/header-attrs.js b/_articles/RJ-2024-001/RJ-2024-001_files/header-attrs-2.29/header-attrs.js deleted file mode 100644 index dd57d92e02..0000000000 --- a/_articles/RJ-2024-001/RJ-2024-001_files/header-attrs-2.29/header-attrs.js +++ 
/dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/_articles/RJ-2024-001/RJ-2024-001_files/jquery-3.6.0/jquery-3.6.0.js b/_articles/RJ-2024-001/RJ-2024-001_files/jquery-3.6.0/jquery-3.6.0.js deleted file mode 100644 index fc6c299b73..0000000000 --- a/_articles/RJ-2024-001/RJ-2024-001_files/jquery-3.6.0/jquery-3.6.0.js +++ /dev/null @@ -1,10881 +0,0 @@ -/*! - * jQuery JavaScript Library v3.6.0 - * https://jquery.com/ - * - * Includes Sizzle.js - * https://sizzlejs.com/ - * - * Copyright OpenJS Foundation and other contributors - * Released under the MIT license - * https://jquery.org/license - * - * Date: 2021-03-02T17:08Z - */ -( function( global, factory ) { - - "use strict"; - - if ( typeof module === "object" && typeof module.exports === "object" ) { - - // For CommonJS and CommonJS-like environments where a proper `window` - // is present, execute the factory and get jQuery. - // For environments that do not have a `window` with a `document` - // (such as Node.js), expose a factory as module.exports. - // This accentuates the need for the creation of a real `window`. - // e.g. var jQuery = require("jquery")(window); - // See ticket #14549 for more info. - module.exports = global.document ? - factory( global, true ) : - function( w ) { - if ( !w.document ) { - throw new Error( "jQuery requires a window with a document" ); - } - return factory( w ); - }; - } else { - factory( global ); - } - -// Pass this if window is not defined yet -} )( typeof window !== "undefined" ? 
window : this, function( window, noGlobal ) { - -// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 -// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode -// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common -// enough that all such attempts are guarded in a try block. -"use strict"; - -var arr = []; - -var getProto = Object.getPrototypeOf; - -var slice = arr.slice; - -var flat = arr.flat ? function( array ) { - return arr.flat.call( array ); -} : function( array ) { - return arr.concat.apply( [], array ); -}; - - -var push = arr.push; - -var indexOf = arr.indexOf; - -var class2type = {}; - -var toString = class2type.toString; - -var hasOwn = class2type.hasOwnProperty; - -var fnToString = hasOwn.toString; - -var ObjectFunctionString = fnToString.call( Object ); - -var support = {}; - -var isFunction = function isFunction( obj ) { - - // Support: Chrome <=57, Firefox <=52 - // In some browsers, typeof returns "function" for HTML elements - // (i.e., `typeof document.createElement( "object" ) === "function"`). - // We don't want to classify *any* DOM node as a function. - // Support: QtWeb <=3.8.5, WebKit <=534.34, wkhtmltopdf tool <=0.12.5 - // Plus for old WebKit, typeof returns "function" for HTML collections - // (e.g., `typeof document.getElementsByTagName("div") === "function"`). 
(gh-4756) - return typeof obj === "function" && typeof obj.nodeType !== "number" && - typeof obj.item !== "function"; - }; - - -var isWindow = function isWindow( obj ) { - return obj != null && obj === obj.window; - }; - - -var document = window.document; - - - - var preservedScriptAttributes = { - type: true, - src: true, - nonce: true, - noModule: true - }; - - function DOMEval( code, node, doc ) { - doc = doc || document; - - var i, val, - script = doc.createElement( "script" ); - - script.text = code; - if ( node ) { - for ( i in preservedScriptAttributes ) { - - // Support: Firefox 64+, Edge 18+ - // Some browsers don't support the "nonce" property on scripts. - // On the other hand, just using `getAttribute` is not enough as - // the `nonce` attribute is reset to an empty string whenever it - // becomes browsing-context connected. - // See https://github.com/whatwg/html/issues/2369 - // See https://html.spec.whatwg.org/#nonce-attributes - // The `node.getAttribute` check was added for the sake of - // `jQuery.globalEval` so that it can fake a nonce-containing node - // via an object. - val = node[ i ] || node.getAttribute && node.getAttribute( i ); - if ( val ) { - script.setAttribute( i, val ); - } - } - } - doc.head.appendChild( script ).parentNode.removeChild( script ); - } - - -function toType( obj ) { - if ( obj == null ) { - return obj + ""; - } - - // Support: Android <=2.3 only (functionish RegExp) - return typeof obj === "object" || typeof obj === "function" ? 
- class2type[ toString.call( obj ) ] || "object" : - typeof obj; -} -/* global Symbol */ -// Defining this global in .eslintrc.json would create a danger of using the global -// unguarded in another place, it seems safer to define global only for this module - - - -var - version = "3.6.0", - - // Define a local copy of jQuery - jQuery = function( selector, context ) { - - // The jQuery object is actually just the init constructor 'enhanced' - // Need init if jQuery is called (just allow error to be thrown if not included) - return new jQuery.fn.init( selector, context ); - }; - -jQuery.fn = jQuery.prototype = { - - // The current version of jQuery being used - jquery: version, - - constructor: jQuery, - - // The default length of a jQuery object is 0 - length: 0, - - toArray: function() { - return slice.call( this ); - }, - - // Get the Nth element in the matched element set OR - // Get the whole matched element set as a clean array - get: function( num ) { - - // Return all the elements in a clean array - if ( num == null ) { - return slice.call( this ); - } - - // Return just the one element from the set - return num < 0 ? this[ num + this.length ] : this[ num ]; - }, - - // Take an array of elements and push it onto the stack - // (returning the new matched element set) - pushStack: function( elems ) { - - // Build a new jQuery matched element set - var ret = jQuery.merge( this.constructor(), elems ); - - // Add the old object onto the stack (as a reference) - ret.prevObject = this; - - // Return the newly-formed element set - return ret; - }, - - // Execute a callback for every element in the matched set. 
- each: function( callback ) { - return jQuery.each( this, callback ); - }, - - map: function( callback ) { - return this.pushStack( jQuery.map( this, function( elem, i ) { - return callback.call( elem, i, elem ); - } ) ); - }, - - slice: function() { - return this.pushStack( slice.apply( this, arguments ) ); - }, - - first: function() { - return this.eq( 0 ); - }, - - last: function() { - return this.eq( -1 ); - }, - - even: function() { - return this.pushStack( jQuery.grep( this, function( _elem, i ) { - return ( i + 1 ) % 2; - } ) ); - }, - - odd: function() { - return this.pushStack( jQuery.grep( this, function( _elem, i ) { - return i % 2; - } ) ); - }, - - eq: function( i ) { - var len = this.length, - j = +i + ( i < 0 ? len : 0 ); - return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); - }, - - end: function() { - return this.prevObject || this.constructor(); - }, - - // For internal use only. - // Behaves like an Array's method, not like a jQuery method. - push: push, - sort: arr.sort, - splice: arr.splice -}; - -jQuery.extend = jQuery.fn.extend = function() { - var options, name, src, copy, copyIsArray, clone, - target = arguments[ 0 ] || {}, - i = 1, - length = arguments.length, - deep = false; - - // Handle a deep copy situation - if ( typeof target === "boolean" ) { - deep = target; - - // Skip the boolean and the target - target = arguments[ i ] || {}; - i++; - } - - // Handle case when target is a string or something (possible in deep copy) - if ( typeof target !== "object" && !isFunction( target ) ) { - target = {}; - } - - // Extend jQuery itself if only one argument is passed - if ( i === length ) { - target = this; - i--; - } - - for ( ; i < length; i++ ) { - - // Only deal with non-null/undefined values - if ( ( options = arguments[ i ] ) != null ) { - - // Extend the base object - for ( name in options ) { - copy = options[ name ]; - - // Prevent Object.prototype pollution - // Prevent never-ending loop - if ( name === "__proto__" || 
target === copy ) { - continue; - } - - // Recurse if we're merging plain objects or arrays - if ( deep && copy && ( jQuery.isPlainObject( copy ) || - ( copyIsArray = Array.isArray( copy ) ) ) ) { - src = target[ name ]; - - // Ensure proper type for the source value - if ( copyIsArray && !Array.isArray( src ) ) { - clone = []; - } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { - clone = {}; - } else { - clone = src; - } - copyIsArray = false; - - // Never move original objects, clone them - target[ name ] = jQuery.extend( deep, clone, copy ); - - // Don't bring in undefined values - } else if ( copy !== undefined ) { - target[ name ] = copy; - } - } - } - } - - // Return the modified object - return target; -}; - -jQuery.extend( { - - // Unique for each copy of jQuery on the page - expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), - - // Assume jQuery is ready without the ready module - isReady: true, - - error: function( msg ) { - throw new Error( msg ); - }, - - noop: function() {}, - - isPlainObject: function( obj ) { - var proto, Ctor; - - // Detect obvious negatives - // Use toString instead of jQuery.type to catch host objects - if ( !obj || toString.call( obj ) !== "[object Object]" ) { - return false; - } - - proto = getProto( obj ); - - // Objects with no prototype (e.g., `Object.create( null )`) are plain - if ( !proto ) { - return true; - } - - // Objects with prototype are plain iff they were constructed by a global Object function - Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; - return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; - }, - - isEmptyObject: function( obj ) { - var name; - - for ( name in obj ) { - return false; - } - return true; - }, - - // Evaluates a script in a provided context; falls back to the global one - // if not specified. 
- globalEval: function( code, options, doc ) { - DOMEval( code, { nonce: options && options.nonce }, doc ); - }, - - each: function( obj, callback ) { - var length, i = 0; - - if ( isArrayLike( obj ) ) { - length = obj.length; - for ( ; i < length; i++ ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } else { - for ( i in obj ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } - - return obj; - }, - - // results is for internal usage only - makeArray: function( arr, results ) { - var ret = results || []; - - if ( arr != null ) { - if ( isArrayLike( Object( arr ) ) ) { - jQuery.merge( ret, - typeof arr === "string" ? - [ arr ] : arr - ); - } else { - push.call( ret, arr ); - } - } - - return ret; - }, - - inArray: function( elem, arr, i ) { - return arr == null ? -1 : indexOf.call( arr, elem, i ); - }, - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - merge: function( first, second ) { - var len = +second.length, - j = 0, - i = first.length; - - for ( ; j < len; j++ ) { - first[ i++ ] = second[ j ]; - } - - first.length = i; - - return first; - }, - - grep: function( elems, callback, invert ) { - var callbackInverse, - matches = [], - i = 0, - length = elems.length, - callbackExpect = !invert; - - // Go through the array, only saving the items - // that pass the validator function - for ( ; i < length; i++ ) { - callbackInverse = !callback( elems[ i ], i ); - if ( callbackInverse !== callbackExpect ) { - matches.push( elems[ i ] ); - } - } - - return matches; - }, - - // arg is for internal usage only - map: function( elems, callback, arg ) { - var length, value, - i = 0, - ret = []; - - // Go through the array, translating each of the items to their new values - if ( isArrayLike( elems ) ) { - length = elems.length; - for ( ; i < length; i++ ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - 
} - - // Go through every key on the object, - } else { - for ( i in elems ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - } - - // Flatten any nested arrays - return flat( ret ); - }, - - // A global GUID counter for objects - guid: 1, - - // jQuery.support is not used in Core but other projects attach their - // properties to it so it needs to exist. - support: support -} ); - -if ( typeof Symbol === "function" ) { - jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; -} - -// Populate the class2type map -jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), - function( _i, name ) { - class2type[ "[object " + name + "]" ] = name.toLowerCase(); - } ); - -function isArrayLike( obj ) { - - // Support: real iOS 8.2 only (not reproducible in simulator) - // `in` check used to prevent JIT error (gh-2145) - // hasOwn isn't used here due to false negatives - // regarding Nodelist length in IE - var length = !!obj && "length" in obj && obj.length, - type = toType( obj ); - - if ( isFunction( obj ) || isWindow( obj ) ) { - return false; - } - - return type === "array" || length === 0 || - typeof length === "number" && length > 0 && ( length - 1 ) in obj; -} -var Sizzle = -/*! 
- * Sizzle CSS Selector Engine v2.3.6 - * https://sizzlejs.com/ - * - * Copyright JS Foundation and other contributors - * Released under the MIT license - * https://js.foundation/ - * - * Date: 2021-02-16 - */ -( function( window ) { -var i, - support, - Expr, - getText, - isXML, - tokenize, - compile, - select, - outermostContext, - sortInput, - hasDuplicate, - - // Local document vars - setDocument, - document, - docElem, - documentIsHTML, - rbuggyQSA, - rbuggyMatches, - matches, - contains, - - // Instance-specific data - expando = "sizzle" + 1 * new Date(), - preferredDoc = window.document, - dirruns = 0, - done = 0, - classCache = createCache(), - tokenCache = createCache(), - compilerCache = createCache(), - nonnativeSelectorCache = createCache(), - sortOrder = function( a, b ) { - if ( a === b ) { - hasDuplicate = true; - } - return 0; - }, - - // Instance methods - hasOwn = ( {} ).hasOwnProperty, - arr = [], - pop = arr.pop, - pushNative = arr.push, - push = arr.push, - slice = arr.slice, - - // Use a stripped-down indexOf as it's faster than native - // https://jsperf.com/thor-indexof-vs-for/5 - indexOf = function( list, elem ) { - var i = 0, - len = list.length; - for ( ; i < len; i++ ) { - if ( list[ i ] === elem ) { - return i; - } - } - return -1; - }, - - booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + - "ismap|loop|multiple|open|readonly|required|scoped", - - // Regular expressions - - // http://www.w3.org/TR/css3-selectors/#whitespace - whitespace = "[\\x20\\t\\r\\n\\f]", - - // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram - identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + - "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", - - // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors - attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + - - // Operator (capture 2) - "*([*^$|!~]?=)" + whitespace + - - // "Attribute values must be CSS identifiers [capture 5] 
- // or strings [capture 3 or capture 4]" - "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + - whitespace + "*\\]", - - pseudos = ":(" + identifier + ")(?:\\((" + - - // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: - // 1. quoted (capture 3; capture 4 or capture 5) - "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + - - // 2. simple (capture 6) - "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + - - // 3. anything else (capture 2) - ".*" + - ")\\)|)", - - // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter - rwhitespace = new RegExp( whitespace + "+", "g" ), - rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + - whitespace + "+$", "g" ), - - rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), - rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + - "*" ), - rdescend = new RegExp( whitespace + "|>" ), - - rpseudo = new RegExp( pseudos ), - ridentifier = new RegExp( "^" + identifier + "$" ), - - matchExpr = { - "ID": new RegExp( "^#(" + identifier + ")" ), - "CLASS": new RegExp( "^\\.(" + identifier + ")" ), - "TAG": new RegExp( "^(" + identifier + "|[*])" ), - "ATTR": new RegExp( "^" + attributes ), - "PSEUDO": new RegExp( "^" + pseudos ), - "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + - whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + - whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), - "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), - - // For use in libraries implementing .is() - // We use this for POS matching in `select` - "needsContext": new RegExp( "^" + whitespace + - "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + - "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) - }, - - rhtml = /HTML$/i, - rinputs = /^(?:input|select|textarea|button)$/i, - 
rheader = /^h\d$/i, - - rnative = /^[^{]+\{\s*\[native \w/, - - // Easily-parseable/retrievable ID or TAG or CLASS selectors - rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, - - rsibling = /[+~]/, - - // CSS escapes - // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters - runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), - funescape = function( escape, nonHex ) { - var high = "0x" + escape.slice( 1 ) - 0x10000; - - return nonHex ? - - // Strip the backslash prefix from a non-hex escape sequence - nonHex : - - // Replace a hexadecimal escape sequence with the encoded Unicode code point - // Support: IE <=11+ - // For values outside the Basic Multilingual Plane (BMP), manually construct a - // surrogate pair - high < 0 ? - String.fromCharCode( high + 0x10000 ) : - String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); - }, - - // CSS string/identifier serialization - // https://drafts.csswg.org/cssom/#common-serializing-idioms - rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, - fcssescape = function( ch, asCodePoint ) { - if ( asCodePoint ) { - - // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER - if ( ch === "\0" ) { - return "\uFFFD"; - } - - // Control characters and (dependent upon position) numbers get escaped as code points - return ch.slice( 0, -1 ) + "\\" + - ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; - } - - // Other potentially-special ASCII characters get backslash-escaped - return "\\" + ch; - }, - - // Used for iframes - // See setDocument() - // Removing the function wrapper causes a "Permission Denied" - // error in IE - unloadHandler = function() { - setDocument(); - }, - - inDisabledFieldset = addCombinator( - function( elem ) { - return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; - }, - { dir: "parentNode", next: "legend" } - ); - -// Optimize for push.apply( _, NodeList ) -try { - push.apply( - ( arr = slice.call( 
preferredDoc.childNodes ) ), - preferredDoc.childNodes - ); - - // Support: Android<4.0 - // Detect silently failing push.apply - // eslint-disable-next-line no-unused-expressions - arr[ preferredDoc.childNodes.length ].nodeType; -} catch ( e ) { - push = { apply: arr.length ? - - // Leverage slice if possible - function( target, els ) { - pushNative.apply( target, slice.call( els ) ); - } : - - // Support: IE<9 - // Otherwise append directly - function( target, els ) { - var j = target.length, - i = 0; - - // Can't trust NodeList.length - while ( ( target[ j++ ] = els[ i++ ] ) ) {} - target.length = j - 1; - } - }; -} - -function Sizzle( selector, context, results, seed ) { - var m, i, elem, nid, match, groups, newSelector, - newContext = context && context.ownerDocument, - - // nodeType defaults to 9, since context defaults to document - nodeType = context ? context.nodeType : 9; - - results = results || []; - - // Return early from calls with invalid selector or context - if ( typeof selector !== "string" || !selector || - nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { - - return results; - } - - // Try to shortcut find operations (as opposed to filters) in HTML documents - if ( !seed ) { - setDocument( context ); - context = context || document; - - if ( documentIsHTML ) { - - // If the selector is sufficiently simple, try using a "get*By*" DOM method - // (excepting DocumentFragment context, where the methods don't exist) - if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { - - // ID selector - if ( ( m = match[ 1 ] ) ) { - - // Document context - if ( nodeType === 9 ) { - if ( ( elem = context.getElementById( m ) ) ) { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( elem.id === m ) { - results.push( elem ); - return results; - } - } else { - return results; - } - - // Element context - } else { - - // Support: IE, Opera, Webkit - // TODO: identify 
versions - // getElementById can match elements by name instead of ID - if ( newContext && ( elem = newContext.getElementById( m ) ) && - contains( context, elem ) && - elem.id === m ) { - - results.push( elem ); - return results; - } - } - - // Type selector - } else if ( match[ 2 ] ) { - push.apply( results, context.getElementsByTagName( selector ) ); - return results; - - // Class selector - } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && - context.getElementsByClassName ) { - - push.apply( results, context.getElementsByClassName( m ) ); - return results; - } - } - - // Take advantage of querySelectorAll - if ( support.qsa && - !nonnativeSelectorCache[ selector + " " ] && - ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && - - // Support: IE 8 only - // Exclude object elements - ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { - - newSelector = selector; - newContext = context; - - // qSA considers elements outside a scoping root when evaluating child or - // descendant combinators, which is not what we want. - // In such cases, we work around the behavior by prefixing every selector in the - // list with an ID selector referencing the scope context. - // The technique has to be used as well when a leading combinator is used - // as such selectors are not recognized by querySelectorAll. - // Thanks to Andrew Dupont for this technique. - if ( nodeType === 1 && - ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { - - // Expand context for sibling selectors - newContext = rsibling.test( selector ) && testContext( context.parentNode ) || - context; - - // We can use :scope instead of the ID hack if the browser - // supports it & if we're not changing the context. 
- if ( newContext !== context || !support.scope ) { - - // Capture the context ID, setting it first if necessary - if ( ( nid = context.getAttribute( "id" ) ) ) { - nid = nid.replace( rcssescape, fcssescape ); - } else { - context.setAttribute( "id", ( nid = expando ) ); - } - } - - // Prefix every selector in the list - groups = tokenize( selector ); - i = groups.length; - while ( i-- ) { - groups[ i ] = ( nid ? "#" + nid : ":scope" ) + " " + - toSelector( groups[ i ] ); - } - newSelector = groups.join( "," ); - } - - try { - push.apply( results, - newContext.querySelectorAll( newSelector ) - ); - return results; - } catch ( qsaError ) { - nonnativeSelectorCache( selector, true ); - } finally { - if ( nid === expando ) { - context.removeAttribute( "id" ); - } - } - } - } - } - - // All others - return select( selector.replace( rtrim, "$1" ), context, results, seed ); -} - -/** - * Create key-value caches of limited size - * @returns {function(string, object)} Returns the Object data after storing it on itself with - * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) - * deleting the oldest entry - */ -function createCache() { - var keys = []; - - function cache( key, value ) { - - // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) - if ( keys.push( key + " " ) > Expr.cacheLength ) { - - // Only keep the most recent entries - delete cache[ keys.shift() ]; - } - return ( cache[ key + " " ] = value ); - } - return cache; -} - -/** - * Mark a function for special use by Sizzle - * @param {Function} fn The function to mark - */ -function markFunction( fn ) { - fn[ expando ] = true; - return fn; -} - -/** - * Support testing using an element - * @param {Function} fn Passed the created element and returns a boolean result - */ -function assert( fn ) { - var el = document.createElement( "fieldset" ); - - try { - return !!fn( el ); - } catch ( e ) { - return false; - } finally { - - // Remove 
from its parent by default - if ( el.parentNode ) { - el.parentNode.removeChild( el ); - } - - // release memory in IE - el = null; - } -} - -/** - * Adds the same handler for all of the specified attrs - * @param {String} attrs Pipe-separated list of attributes - * @param {Function} handler The method that will be applied - */ -function addHandle( attrs, handler ) { - var arr = attrs.split( "|" ), - i = arr.length; - - while ( i-- ) { - Expr.attrHandle[ arr[ i ] ] = handler; - } -} - -/** - * Checks document order of two siblings - * @param {Element} a - * @param {Element} b - * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b - */ -function siblingCheck( a, b ) { - var cur = b && a, - diff = cur && a.nodeType === 1 && b.nodeType === 1 && - a.sourceIndex - b.sourceIndex; - - // Use IE sourceIndex if available on both nodes - if ( diff ) { - return diff; - } - - // Check if b follows a - if ( cur ) { - while ( ( cur = cur.nextSibling ) ) { - if ( cur === b ) { - return -1; - } - } - } - - return a ? 
1 : -1; -} - -/** - * Returns a function to use in pseudos for input types - * @param {String} type - */ -function createInputPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for buttons - * @param {String} type - */ -function createButtonPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return ( name === "input" || name === "button" ) && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for :enabled/:disabled - * @param {Boolean} disabled true for :disabled; false for :enabled - */ -function createDisabledPseudo( disabled ) { - - // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable - return function( elem ) { - - // Only certain elements can match :enabled or :disabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled - if ( "form" in elem ) { - - // Check for inherited disabledness on relevant non-disabled elements: - // * listed form-associated elements in a disabled fieldset - // https://html.spec.whatwg.org/multipage/forms.html#category-listed - // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled - // * option elements in a disabled optgroup - // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled - // All such elements have a "form" property. 
- if ( elem.parentNode && elem.disabled === false ) { - - // Option elements defer to a parent optgroup if present - if ( "label" in elem ) { - if ( "label" in elem.parentNode ) { - return elem.parentNode.disabled === disabled; - } else { - return elem.disabled === disabled; - } - } - - // Support: IE 6 - 11 - // Use the isDisabled shortcut property to check for disabled fieldset ancestors - return elem.isDisabled === disabled || - - // Where there is no isDisabled, check manually - /* jshint -W018 */ - elem.isDisabled !== !disabled && - inDisabledFieldset( elem ) === disabled; - } - - return elem.disabled === disabled; - - // Try to winnow out elements that can't be disabled before trusting the disabled property. - // Some victims get caught in our net (label, legend, menu, track), but it shouldn't - // even exist on them, let alone have a boolean value. - } else if ( "label" in elem ) { - return elem.disabled === disabled; - } - - // Remaining elements are neither :enabled nor :disabled - return false; - }; -} - -/** - * Returns a function to use in pseudos for positionals - * @param {Function} fn - */ -function createPositionalPseudo( fn ) { - return markFunction( function( argument ) { - argument = +argument; - return markFunction( function( seed, matches ) { - var j, - matchIndexes = fn( [], seed.length, argument ), - i = matchIndexes.length; - - // Match elements found at the specified indexes - while ( i-- ) { - if ( seed[ ( j = matchIndexes[ i ] ) ] ) { - seed[ j ] = !( matches[ j ] = seed[ j ] ); - } - } - } ); - } ); -} - -/** - * Checks a node for validity as a Sizzle context - * @param {Element|Object=} context - * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value - */ -function testContext( context ) { - return context && typeof context.getElementsByTagName !== "undefined" && context; -} - -// Expose support vars for convenience -support = Sizzle.support = {}; - -/** - * Detects XML nodes - * @param 
{Element|Object} elem An element or a document - * @returns {Boolean} True iff elem is a non-HTML XML node - */ -isXML = Sizzle.isXML = function( elem ) { - var namespace = elem && elem.namespaceURI, - docElem = elem && ( elem.ownerDocument || elem ).documentElement; - - // Support: IE <=8 - // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes - // https://bugs.jquery.com/ticket/4833 - return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); -}; - -/** - * Sets document-related variables once based on the current document - * @param {Element|Object} [doc] An element or document object to use to set the document - * @returns {Object} Returns the current document - */ -setDocument = Sizzle.setDocument = function( node ) { - var hasCompare, subWindow, - doc = node ? node.ownerDocument || node : preferredDoc; - - // Return early if doc is invalid or already selected - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { - return document; - } - - // Update global variables - document = doc; - docElem = document.documentElement; - documentIsHTML = !isXML( document ); - - // Support: IE 9 - 11+, Edge 12 - 18+ - // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( preferredDoc != document && - ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { - - // Support: IE 11, Edge - if ( subWindow.addEventListener ) { - subWindow.addEventListener( "unload", unloadHandler, false ); - - // Support: IE 9 - 10 only - } else if ( subWindow.attachEvent ) { - subWindow.attachEvent( "onunload", unloadHandler ); - } - } - - // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, - // Safari 4 - 5 only, Opera <=11.6 - 12.x only - // IE/Edge & older browsers don't support the :scope pseudo-class. - // Support: Safari 6.0 only - // Safari 6.0 supports :scope but it's an alias of :root there. - support.scope = assert( function( el ) { - docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); - return typeof el.querySelectorAll !== "undefined" && - !el.querySelectorAll( ":scope fieldset div" ).length; - } ); - - /* Attributes - ---------------------------------------------------------------------- */ - - // Support: IE<8 - // Verify that getAttribute really returns attributes and not properties - // (excepting IE8 booleans) - support.attributes = assert( function( el ) { - el.className = "i"; - return !el.getAttribute( "className" ); - } ); - - /* getElement(s)By* - ---------------------------------------------------------------------- */ - - // Check if getElementsByTagName("*") returns only elements - support.getElementsByTagName = assert( function( el ) { - el.appendChild( document.createComment( "" ) ); - return !el.getElementsByTagName( "*" ).length; - } ); - - // Support: IE<9 - support.getElementsByClassName = rnative.test( document.getElementsByClassName ); - - // Support: IE<10 - // Check if getElementById returns elements by name - // The broken getElementById methods don't pick up programmatically-set names, - // so use a roundabout getElementsByName test - support.getById = assert( function( el ) { - docElem.appendChild( el 
).id = expando; - return !document.getElementsByName || !document.getElementsByName( expando ).length; - } ); - - // ID filter and find - if ( support.getById ) { - Expr.filter[ "ID" ] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - return elem.getAttribute( "id" ) === attrId; - }; - }; - Expr.find[ "ID" ] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var elem = context.getElementById( id ); - return elem ? [ elem ] : []; - } - }; - } else { - Expr.filter[ "ID" ] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - var node = typeof elem.getAttributeNode !== "undefined" && - elem.getAttributeNode( "id" ); - return node && node.value === attrId; - }; - }; - - // Support: IE 6 - 7 only - // getElementById is not reliable as a find shortcut - Expr.find[ "ID" ] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var node, i, elems, - elem = context.getElementById( id ); - - if ( elem ) { - - // Verify the id attribute - node = elem.getAttributeNode( "id" ); - if ( node && node.value === id ) { - return [ elem ]; - } - - // Fall back on getElementsByName - elems = context.getElementsByName( id ); - i = 0; - while ( ( elem = elems[ i++ ] ) ) { - node = elem.getAttributeNode( "id" ); - if ( node && node.value === id ) { - return [ elem ]; - } - } - } - - return []; - } - }; - } - - // Tag - Expr.find[ "TAG" ] = support.getElementsByTagName ? 
- function( tag, context ) { - if ( typeof context.getElementsByTagName !== "undefined" ) { - return context.getElementsByTagName( tag ); - - // DocumentFragment nodes don't have gEBTN - } else if ( support.qsa ) { - return context.querySelectorAll( tag ); - } - } : - - function( tag, context ) { - var elem, - tmp = [], - i = 0, - - // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too - results = context.getElementsByTagName( tag ); - - // Filter out possible comments - if ( tag === "*" ) { - while ( ( elem = results[ i++ ] ) ) { - if ( elem.nodeType === 1 ) { - tmp.push( elem ); - } - } - - return tmp; - } - return results; - }; - - // Class - Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { - if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { - return context.getElementsByClassName( className ); - } - }; - - /* QSA/matchesSelector - ---------------------------------------------------------------------- */ - - // QSA and matchesSelector support - - // matchesSelector(:active) reports false when true (IE9/Opera 11.5) - rbuggyMatches = []; - - // qSa(:focus) reports false when true (Chrome 21) - // We allow this because of a bug in IE8/9 that throws an error - // whenever `document.activeElement` is accessed on an iframe - // So, we allow :focus to pass through QSA all the time to avoid the IE error - // See https://bugs.jquery.com/ticket/13378 - rbuggyQSA = []; - - if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { - - // Build QSA regex - // Regex strategy adopted from Diego Perini - assert( function( el ) { - - var input; - - // Select is set to empty string on purpose - // This is to test IE's treatment of not explicitly - // setting a boolean content attribute, - // since its presence should be enough - // https://bugs.jquery.com/ticket/12359 - docElem.appendChild( el ).innerHTML = "" + - ""; - - // Support: IE8, Opera 11-12.16 - // Nothing should 
be selected when empty strings follow ^= or $= or *= - // The test attribute must be unknown in Opera but "safe" for WinRT - // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section - if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { - rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); - } - - // Support: IE8 - // Boolean attributes and "value" are not treated correctly - if ( !el.querySelectorAll( "[selected]" ).length ) { - rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); - } - - // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ - if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { - rbuggyQSA.push( "~=" ); - } - - // Support: IE 11+, Edge 15 - 18+ - // IE 11/Edge don't find elements on a `[name='']` query in some cases. - // Adding a temporary attribute to the document before the selection works - // around the issue. - // Interestingly, IE 10 & older don't seem to have the issue. - input = document.createElement( "input" ); - input.setAttribute( "name", "" ); - el.appendChild( input ); - if ( !el.querySelectorAll( "[name='']" ).length ) { - rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + - whitespace + "*(?:''|\"\")" ); - } - - // Webkit/Opera - :checked should return selected option elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - // IE8 throws error here and will not see later tests - if ( !el.querySelectorAll( ":checked" ).length ) { - rbuggyQSA.push( ":checked" ); - } - - // Support: Safari 8+, iOS 8+ - // https://bugs.webkit.org/show_bug.cgi?id=136851 - // In-page `selector#id sibling-combinator selector` fails - if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { - rbuggyQSA.push( ".#.+[+~]" ); - } - - // Support: Firefox <=3.6 - 5 only - // Old Firefox doesn't throw on a badly-escaped identifier. 
- el.querySelectorAll( "\\\f" ); - rbuggyQSA.push( "[\\r\\n\\f]" ); - } ); - - assert( function( el ) { - el.innerHTML = "" + - ""; - - // Support: Windows 8 Native Apps - // The type and name attributes are restricted during .innerHTML assignment - var input = document.createElement( "input" ); - input.setAttribute( "type", "hidden" ); - el.appendChild( input ).setAttribute( "name", "D" ); - - // Support: IE8 - // Enforce case-sensitivity of name attribute - if ( el.querySelectorAll( "[name=d]" ).length ) { - rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); - } - - // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) - // IE8 throws error here and will not see later tests - if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Support: IE9-11+ - // IE's :disabled selector does not pick up the children of disabled fieldsets - docElem.appendChild( el ).disabled = true; - if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Support: Opera 10 - 11 only - // Opera 10-11 does not throw on post-comma invalid pseudos - el.querySelectorAll( "*,:x" ); - rbuggyQSA.push( ",.*:" ); - } ); - } - - if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || - docElem.webkitMatchesSelector || - docElem.mozMatchesSelector || - docElem.oMatchesSelector || - docElem.msMatchesSelector ) ) ) ) { - - assert( function( el ) { - - // Check to see if it's possible to do matchesSelector - // on a disconnected node (IE 9) - support.disconnectedMatch = matches.call( el, "*" ); - - // This should fail with an exception - // Gecko does not error, returns false instead - matches.call( el, "[s!='']:x" ); - rbuggyMatches.push( "!=", pseudos ); - } ); - } - - rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); - rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); - - /* Contains 
- ---------------------------------------------------------------------- */ - hasCompare = rnative.test( docElem.compareDocumentPosition ); - - // Element contains another - // Purposefully self-exclusive - // As in, an element does not contain itself - contains = hasCompare || rnative.test( docElem.contains ) ? - function( a, b ) { - var adown = a.nodeType === 9 ? a.documentElement : a, - bup = b && b.parentNode; - return a === bup || !!( bup && bup.nodeType === 1 && ( - adown.contains ? - adown.contains( bup ) : - a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 - ) ); - } : - function( a, b ) { - if ( b ) { - while ( ( b = b.parentNode ) ) { - if ( b === a ) { - return true; - } - } - } - return false; - }; - - /* Sorting - ---------------------------------------------------------------------- */ - - // Document order sorting - sortOrder = hasCompare ? - function( a, b ) { - - // Flag for duplicate removal - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - // Sort on method existence if only one input has compareDocumentPosition - var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; - if ( compare ) { - return compare; - } - - // Calculate position if both inputs belong to the same document - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? - a.compareDocumentPosition( b ) : - - // Otherwise we know they are disconnected - 1; - - // Disconnected nodes - if ( compare & 1 || - ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { - - // Choose the first element that is related to our preferred document - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( a == document || a.ownerDocument == preferredDoc && - contains( preferredDoc, a ) ) { - return -1; - } - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( b == document || b.ownerDocument == preferredDoc && - contains( preferredDoc, b ) ) { - return 1; - } - - // Maintain original order - return sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - } - - return compare & 4 ? -1 : 1; - } : - function( a, b ) { - - // Exit early if the nodes are identical - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - var cur, - i = 0, - aup = a.parentNode, - bup = b.parentNode, - ap = [ a ], - bp = [ b ]; - - // Parentless nodes are either documents or disconnected - if ( !aup || !bup ) { - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - /* eslint-disable eqeqeq */ - return a == document ? -1 : - b == document ? 1 : - /* eslint-enable eqeqeq */ - aup ? -1 : - bup ? 1 : - sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - - // If the nodes are siblings, we can do a quick check - } else if ( aup === bup ) { - return siblingCheck( a, b ); - } - - // Otherwise we need full lists of their ancestors for comparison - cur = a; - while ( ( cur = cur.parentNode ) ) { - ap.unshift( cur ); - } - cur = b; - while ( ( cur = cur.parentNode ) ) { - bp.unshift( cur ); - } - - // Walk down the tree looking for a discrepancy - while ( ap[ i ] === bp[ i ] ) { - i++; - } - - return i ? 
- - // Do a sibling check if the nodes have a common ancestor - siblingCheck( ap[ i ], bp[ i ] ) : - - // Otherwise nodes in our document sort first - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - /* eslint-disable eqeqeq */ - ap[ i ] == preferredDoc ? -1 : - bp[ i ] == preferredDoc ? 1 : - /* eslint-enable eqeqeq */ - 0; - }; - - return document; -}; - -Sizzle.matches = function( expr, elements ) { - return Sizzle( expr, null, null, elements ); -}; - -Sizzle.matchesSelector = function( elem, expr ) { - setDocument( elem ); - - if ( support.matchesSelector && documentIsHTML && - !nonnativeSelectorCache[ expr + " " ] && - ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && - ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { - - try { - var ret = matches.call( elem, expr ); - - // IE 9's matchesSelector returns false on disconnected nodes - if ( ret || support.disconnectedMatch || - - // As well, disconnected nodes are said to be in a document - // fragment in IE 9 - elem.document && elem.document.nodeType !== 11 ) { - return ret; - } - } catch ( e ) { - nonnativeSelectorCache( expr, true ); - } - } - - return Sizzle( expr, document, null, [ elem ] ).length > 0; -}; - -Sizzle.contains = function( context, elem ) { - - // Set document vars if needed - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( ( context.ownerDocument || context ) != document ) { - setDocument( context ); - } - return contains( context, elem ); -}; - -Sizzle.attr = function( elem, name ) { - - // Set document vars if needed - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( ( elem.ownerDocument || elem ) != document ) { - setDocument( elem ); - } - - var fn = Expr.attrHandle[ name.toLowerCase() ], - - // Don't get fooled by Object.prototype properties (jQuery #13807) - val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? - fn( elem, name, !documentIsHTML ) : - undefined; - - return val !== undefined ? - val : - support.attributes || !documentIsHTML ? - elem.getAttribute( name ) : - ( val = elem.getAttributeNode( name ) ) && val.specified ? - val.value : - null; -}; - -Sizzle.escape = function( sel ) { - return ( sel + "" ).replace( rcssescape, fcssescape ); -}; - -Sizzle.error = function( msg ) { - throw new Error( "Syntax error, unrecognized expression: " + msg ); -}; - -/** - * Document sorting and removing duplicates - * @param {ArrayLike} results - */ -Sizzle.uniqueSort = function( results ) { - var elem, - duplicates = [], - j = 0, - i = 0; - - // Unless we *know* we can detect duplicates, assume their presence - hasDuplicate = !support.detectDuplicates; - sortInput = !support.sortStable && results.slice( 0 ); - results.sort( sortOrder ); - - if ( hasDuplicate ) { - while ( ( elem = results[ i++ ] ) ) { - if ( elem === results[ i ] ) { - j = duplicates.push( i ); - } - } - while ( j-- ) { - results.splice( duplicates[ j ], 1 ); - } - } - - // Clear input after sorting to release objects - // See https://github.com/jquery/sizzle/pull/225 - sortInput = null; - - return results; -}; - -/** - * Utility function for retrieving the text value of an array of DOM nodes - * @param {Array|Element} elem - */ -getText = Sizzle.getText = function( elem ) { - var node, - ret = "", - i = 0, - nodeType = elem.nodeType; - - if ( !nodeType ) { - - // If no nodeType, this is expected to be an array - while ( ( node = elem[ i++ ] ) ) { - - // Do not traverse comment nodes - ret += getText( node ); - } - } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { - - // Use textContent 
for elements - // innerText usage removed for consistency of new lines (jQuery #11153) - if ( typeof elem.textContent === "string" ) { - return elem.textContent; - } else { - - // Traverse its children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - ret += getText( elem ); - } - } - } else if ( nodeType === 3 || nodeType === 4 ) { - return elem.nodeValue; - } - - // Do not include comment or processing instruction nodes - - return ret; -}; - -Expr = Sizzle.selectors = { - - // Can be adjusted by the user - cacheLength: 50, - - createPseudo: markFunction, - - match: matchExpr, - - attrHandle: {}, - - find: {}, - - relative: { - ">": { dir: "parentNode", first: true }, - " ": { dir: "parentNode" }, - "+": { dir: "previousSibling", first: true }, - "~": { dir: "previousSibling" } - }, - - preFilter: { - "ATTR": function( match ) { - match[ 1 ] = match[ 1 ].replace( runescape, funescape ); - - // Move the given value to match[3] whether quoted or unquoted - match[ 3 ] = ( match[ 3 ] || match[ 4 ] || - match[ 5 ] || "" ).replace( runescape, funescape ); - - if ( match[ 2 ] === "~=" ) { - match[ 3 ] = " " + match[ 3 ] + " "; - } - - return match.slice( 0, 4 ); - }, - - "CHILD": function( match ) { - - /* matches from matchExpr["CHILD"] - 1 type (only|nth|...) - 2 what (child|of-type) - 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) - 4 xn-component of xn+y argument ([+-]?\d*n|) - 5 sign of xn-component - 6 x of xn-component - 7 sign of y-component - 8 y of y-component - */ - match[ 1 ] = match[ 1 ].toLowerCase(); - - if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { - - // nth-* requires argument - if ( !match[ 3 ] ) { - Sizzle.error( match[ 0 ] ); - } - - // numeric x and y parameters for Expr.filter.CHILD - // remember that false/true cast respectively to 0/1 - match[ 4 ] = +( match[ 4 ] ? 
- match[ 5 ] + ( match[ 6 ] || 1 ) : - 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); - match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); - - // other types prohibit arguments - } else if ( match[ 3 ] ) { - Sizzle.error( match[ 0 ] ); - } - - return match; - }, - - "PSEUDO": function( match ) { - var excess, - unquoted = !match[ 6 ] && match[ 2 ]; - - if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { - return null; - } - - // Accept quoted arguments as-is - if ( match[ 3 ] ) { - match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; - - // Strip excess characters from unquoted arguments - } else if ( unquoted && rpseudo.test( unquoted ) && - - // Get excess from tokenize (recursively) - ( excess = tokenize( unquoted, true ) ) && - - // advance to the next closing parenthesis - ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { - - // excess is a negative index - match[ 0 ] = match[ 0 ].slice( 0, excess ); - match[ 2 ] = unquoted.slice( 0, excess ); - } - - // Return only captures needed by the pseudo filter method (type and argument) - return match.slice( 0, 3 ); - } - }, - - filter: { - - "TAG": function( nodeNameSelector ) { - var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); - return nodeNameSelector === "*" ? 
- function() { - return true; - } : - function( elem ) { - return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; - }; - }, - - "CLASS": function( className ) { - var pattern = classCache[ className + " " ]; - - return pattern || - ( pattern = new RegExp( "(^|" + whitespace + - ")" + className + "(" + whitespace + "|$)" ) ) && classCache( - className, function( elem ) { - return pattern.test( - typeof elem.className === "string" && elem.className || - typeof elem.getAttribute !== "undefined" && - elem.getAttribute( "class" ) || - "" - ); - } ); - }, - - "ATTR": function( name, operator, check ) { - return function( elem ) { - var result = Sizzle.attr( elem, name ); - - if ( result == null ) { - return operator === "!="; - } - if ( !operator ) { - return true; - } - - result += ""; - - /* eslint-disable max-len */ - - return operator === "=" ? result === check : - operator === "!=" ? result !== check : - operator === "^=" ? check && result.indexOf( check ) === 0 : - operator === "*=" ? check && result.indexOf( check ) > -1 : - operator === "$=" ? check && result.slice( -check.length ) === check : - operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : - operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : - false; - /* eslint-enable max-len */ - - }; - }, - - "CHILD": function( type, what, _argument, first, last ) { - var simple = type.slice( 0, 3 ) !== "nth", - forward = type.slice( -4 ) !== "last", - ofType = what === "of-type"; - - return first === 1 && last === 0 ? - - // Shortcut for :nth-*(n) - function( elem ) { - return !!elem.parentNode; - } : - - function( elem, _context, xml ) { - var cache, uniqueCache, outerCache, node, nodeIndex, start, - dir = simple !== forward ? 
"nextSibling" : "previousSibling", - parent = elem.parentNode, - name = ofType && elem.nodeName.toLowerCase(), - useCache = !xml && !ofType, - diff = false; - - if ( parent ) { - - // :(first|last|only)-(child|of-type) - if ( simple ) { - while ( dir ) { - node = elem; - while ( ( node = node[ dir ] ) ) { - if ( ofType ? - node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) { - - return false; - } - } - - // Reverse direction for :only-* (if we haven't yet done so) - start = dir = type === "only" && !start && "nextSibling"; - } - return true; - } - - start = [ forward ? parent.firstChild : parent.lastChild ]; - - // non-xml :nth-child(...) stores cache data on `parent` - if ( forward && useCache ) { - - // Seek `elem` from a previously-cached index - - // ...in a gzip-friendly way - node = parent; - outerCache = node[ expando ] || ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; - diff = nodeIndex && cache[ 2 ]; - node = nodeIndex && parent.childNodes[ nodeIndex ]; - - while ( ( node = ++nodeIndex && node && node[ dir ] || - - // Fallback to seeking `elem` from the start - ( diff = nodeIndex = 0 ) || start.pop() ) ) { - - // When found, cache indexes on `parent` and break - if ( node.nodeType === 1 && ++diff && node === elem ) { - uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; - break; - } - } - - } else { - - // Use previously-cached element index if available - if ( useCache ) { - - // ...in a gzip-friendly way - node = elem; - outerCache = node[ expando ] || ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 
] === dirruns && cache[ 1 ]; - diff = nodeIndex; - } - - // xml :nth-child(...) - // or :nth-last-child(...) or :nth(-last)?-of-type(...) - if ( diff === false ) { - - // Use the same loop as above to seek `elem` from the start - while ( ( node = ++nodeIndex && node && node[ dir ] || - ( diff = nodeIndex = 0 ) || start.pop() ) ) { - - if ( ( ofType ? - node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) && - ++diff ) { - - // Cache the index of each encountered element - if ( useCache ) { - outerCache = node[ expando ] || - ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - uniqueCache[ type ] = [ dirruns, diff ]; - } - - if ( node === elem ) { - break; - } - } - } - } - } - - // Incorporate the offset, then check against cycle size - diff -= last; - return diff === first || ( diff % first === 0 && diff / first >= 0 ); - } - }; - }, - - "PSEUDO": function( pseudo, argument ) { - - // pseudo-class names are case-insensitive - // http://www.w3.org/TR/selectors/#pseudo-classes - // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters - // Remember that setFilters inherits from pseudos - var args, - fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || - Sizzle.error( "unsupported pseudo: " + pseudo ); - - // The user may use createPseudo to indicate that - // arguments are needed to create the filter function - // just as Sizzle does - if ( fn[ expando ] ) { - return fn( argument ); - } - - // But maintain support for old signatures - if ( fn.length > 1 ) { - args = [ pseudo, pseudo, "", argument ]; - return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
- markFunction( function( seed, matches ) { - var idx, - matched = fn( seed, argument ), - i = matched.length; - while ( i-- ) { - idx = indexOf( seed, matched[ i ] ); - seed[ idx ] = !( matches[ idx ] = matched[ i ] ); - } - } ) : - function( elem ) { - return fn( elem, 0, args ); - }; - } - - return fn; - } - }, - - pseudos: { - - // Potentially complex pseudos - "not": markFunction( function( selector ) { - - // Trim the selector passed to compile - // to avoid treating leading and trailing - // spaces as combinators - var input = [], - results = [], - matcher = compile( selector.replace( rtrim, "$1" ) ); - - return matcher[ expando ] ? - markFunction( function( seed, matches, _context, xml ) { - var elem, - unmatched = matcher( seed, null, xml, [] ), - i = seed.length; - - // Match elements unmatched by `matcher` - while ( i-- ) { - if ( ( elem = unmatched[ i ] ) ) { - seed[ i ] = !( matches[ i ] = elem ); - } - } - } ) : - function( elem, _context, xml ) { - input[ 0 ] = elem; - matcher( input, null, xml, results ); - - // Don't keep the element (issue #299) - input[ 0 ] = null; - return !results.pop(); - }; - } ), - - "has": markFunction( function( selector ) { - return function( elem ) { - return Sizzle( selector, elem ).length > 0; - }; - } ), - - "contains": markFunction( function( text ) { - text = text.replace( runescape, funescape ); - return function( elem ) { - return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; - }; - } ), - - // "Whether an element is represented by a :lang() selector - // is based solely on the element's language value - // being equal to the identifier C, - // or beginning with the identifier C immediately followed by "-". - // The matching of C against the element's language value is performed case-insensitively. - // The identifier C does not have to be a valid language name." 
- // http://www.w3.org/TR/selectors/#lang-pseudo - "lang": markFunction( function( lang ) { - - // lang value must be a valid identifier - if ( !ridentifier.test( lang || "" ) ) { - Sizzle.error( "unsupported lang: " + lang ); - } - lang = lang.replace( runescape, funescape ).toLowerCase(); - return function( elem ) { - var elemLang; - do { - if ( ( elemLang = documentIsHTML ? - elem.lang : - elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { - - elemLang = elemLang.toLowerCase(); - return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; - } - } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); - return false; - }; - } ), - - // Miscellaneous - "target": function( elem ) { - var hash = window.location && window.location.hash; - return hash && hash.slice( 1 ) === elem.id; - }, - - "root": function( elem ) { - return elem === docElem; - }, - - "focus": function( elem ) { - return elem === document.activeElement && - ( !document.hasFocus || document.hasFocus() ) && - !!( elem.type || elem.href || ~elem.tabIndex ); - }, - - // Boolean properties - "enabled": createDisabledPseudo( false ), - "disabled": createDisabledPseudo( true ), - - "checked": function( elem ) { - - // In CSS3, :checked should return both checked and selected elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - var nodeName = elem.nodeName.toLowerCase(); - return ( nodeName === "input" && !!elem.checked ) || - ( nodeName === "option" && !!elem.selected ); - }, - - "selected": function( elem ) { - - // Accessing this property makes selected-by-default - // options in Safari work properly - if ( elem.parentNode ) { - // eslint-disable-next-line no-unused-expressions - elem.parentNode.selectedIndex; - } - - return elem.selected === true; - }, - - // Contents - "empty": function( elem ) { - - // http://www.w3.org/TR/selectors/#empty-pseudo - // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), - // but 
not by others (comment: 8; processing instruction: 7; etc.) - // nodeType < 6 works because attributes (2) do not appear as children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - if ( elem.nodeType < 6 ) { - return false; - } - } - return true; - }, - - "parent": function( elem ) { - return !Expr.pseudos[ "empty" ]( elem ); - }, - - // Element/input types - "header": function( elem ) { - return rheader.test( elem.nodeName ); - }, - - "input": function( elem ) { - return rinputs.test( elem.nodeName ); - }, - - "button": function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === "button" || name === "button"; - }, - - "text": function( elem ) { - var attr; - return elem.nodeName.toLowerCase() === "input" && - elem.type === "text" && - - // Support: IE<8 - // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" - ( ( attr = elem.getAttribute( "type" ) ) == null || - attr.toLowerCase() === "text" ); - }, - - // Position-in-collection - "first": createPositionalPseudo( function() { - return [ 0 ]; - } ), - - "last": createPositionalPseudo( function( _matchIndexes, length ) { - return [ length - 1 ]; - } ), - - "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { - return [ argument < 0 ? argument + length : argument ]; - } ), - - "even": createPositionalPseudo( function( matchIndexes, length ) { - var i = 0; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "odd": createPositionalPseudo( function( matchIndexes, length ) { - var i = 1; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { - var i = argument < 0 ? - argument + length : - argument > length ? 
- length : - argument; - for ( ; --i >= 0; ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { - var i = argument < 0 ? argument + length : argument; - for ( ; ++i < length; ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ) - } -}; - -Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; - -// Add button/input type pseudos -for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { - Expr.pseudos[ i ] = createInputPseudo( i ); -} -for ( i in { submit: true, reset: true } ) { - Expr.pseudos[ i ] = createButtonPseudo( i ); -} - -// Easy API for creating new setFilters -function setFilters() {} -setFilters.prototype = Expr.filters = Expr.pseudos; -Expr.setFilters = new setFilters(); - -tokenize = Sizzle.tokenize = function( selector, parseOnly ) { - var matched, match, tokens, type, - soFar, groups, preFilters, - cached = tokenCache[ selector + " " ]; - - if ( cached ) { - return parseOnly ? 
0 : cached.slice( 0 ); - } - - soFar = selector; - groups = []; - preFilters = Expr.preFilter; - - while ( soFar ) { - - // Comma and first run - if ( !matched || ( match = rcomma.exec( soFar ) ) ) { - if ( match ) { - - // Don't consume trailing commas as valid - soFar = soFar.slice( match[ 0 ].length ) || soFar; - } - groups.push( ( tokens = [] ) ); - } - - matched = false; - - // Combinators - if ( ( match = rcombinators.exec( soFar ) ) ) { - matched = match.shift(); - tokens.push( { - value: matched, - - // Cast descendant combinators to space - type: match[ 0 ].replace( rtrim, " " ) - } ); - soFar = soFar.slice( matched.length ); - } - - // Filters - for ( type in Expr.filter ) { - if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || - ( match = preFilters[ type ]( match ) ) ) ) { - matched = match.shift(); - tokens.push( { - value: matched, - type: type, - matches: match - } ); - soFar = soFar.slice( matched.length ); - } - } - - if ( !matched ) { - break; - } - } - - // Return the length of the invalid excess - // if we're just parsing - // Otherwise, throw an error or return tokens - return parseOnly ? - soFar.length : - soFar ? - Sizzle.error( selector ) : - - // Cache the tokens - tokenCache( selector, groups ).slice( 0 ); -}; - -function toSelector( tokens ) { - var i = 0, - len = tokens.length, - selector = ""; - for ( ; i < len; i++ ) { - selector += tokens[ i ].value; - } - return selector; -} - -function addCombinator( matcher, combinator, base ) { - var dir = combinator.dir, - skip = combinator.next, - key = skip || dir, - checkNonElements = base && key === "parentNode", - doneName = done++; - - return combinator.first ? 
- - // Check against closest ancestor/preceding element - function( elem, context, xml ) { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - return matcher( elem, context, xml ); - } - } - return false; - } : - - // Check against all ancestor/preceding elements - function( elem, context, xml ) { - var oldCache, uniqueCache, outerCache, - newCache = [ dirruns, doneName ]; - - // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching - if ( xml ) { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - if ( matcher( elem, context, xml ) ) { - return true; - } - } - } - } else { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - outerCache = elem[ expando ] || ( elem[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ elem.uniqueID ] || - ( outerCache[ elem.uniqueID ] = {} ); - - if ( skip && skip === elem.nodeName.toLowerCase() ) { - elem = elem[ dir ] || elem; - } else if ( ( oldCache = uniqueCache[ key ] ) && - oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { - - // Assign to newCache so results back-propagate to previous elements - return ( newCache[ 2 ] = oldCache[ 2 ] ); - } else { - - // Reuse newcache so results back-propagate to previous elements - uniqueCache[ key ] = newCache; - - // A match means we're done; a fail means we have to keep checking - if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { - return true; - } - } - } - } - } - return false; - }; -} - -function elementMatcher( matchers ) { - return matchers.length > 1 ? 
- function( elem, context, xml ) { - var i = matchers.length; - while ( i-- ) { - if ( !matchers[ i ]( elem, context, xml ) ) { - return false; - } - } - return true; - } : - matchers[ 0 ]; -} - -function multipleContexts( selector, contexts, results ) { - var i = 0, - len = contexts.length; - for ( ; i < len; i++ ) { - Sizzle( selector, contexts[ i ], results ); - } - return results; -} - -function condense( unmatched, map, filter, context, xml ) { - var elem, - newUnmatched = [], - i = 0, - len = unmatched.length, - mapped = map != null; - - for ( ; i < len; i++ ) { - if ( ( elem = unmatched[ i ] ) ) { - if ( !filter || filter( elem, context, xml ) ) { - newUnmatched.push( elem ); - if ( mapped ) { - map.push( i ); - } - } - } - } - - return newUnmatched; -} - -function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { - if ( postFilter && !postFilter[ expando ] ) { - postFilter = setMatcher( postFilter ); - } - if ( postFinder && !postFinder[ expando ] ) { - postFinder = setMatcher( postFinder, postSelector ); - } - return markFunction( function( seed, results, context, xml ) { - var temp, i, elem, - preMap = [], - postMap = [], - preexisting = results.length, - - // Get initial elements from seed or context - elems = seed || multipleContexts( - selector || "*", - context.nodeType ? [ context ] : context, - [] - ), - - // Prefilter to get matcher input, preserving a map for seed-results synchronization - matcherIn = preFilter && ( seed || !selector ) ? - condense( elems, preMap, preFilter, context, xml ) : - elems, - - matcherOut = matcher ? - - // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, - postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
- - // ...intermediate processing is necessary - [] : - - // ...otherwise use results directly - results : - matcherIn; - - // Find primary matches - if ( matcher ) { - matcher( matcherIn, matcherOut, context, xml ); - } - - // Apply postFilter - if ( postFilter ) { - temp = condense( matcherOut, postMap ); - postFilter( temp, [], context, xml ); - - // Un-match failing elements by moving them back to matcherIn - i = temp.length; - while ( i-- ) { - if ( ( elem = temp[ i ] ) ) { - matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); - } - } - } - - if ( seed ) { - if ( postFinder || preFilter ) { - if ( postFinder ) { - - // Get the final matcherOut by condensing this intermediate into postFinder contexts - temp = []; - i = matcherOut.length; - while ( i-- ) { - if ( ( elem = matcherOut[ i ] ) ) { - - // Restore matcherIn since elem is not yet a final match - temp.push( ( matcherIn[ i ] = elem ) ); - } - } - postFinder( null, ( matcherOut = [] ), temp, xml ); - } - - // Move matched elements from seed to results to keep them synchronized - i = matcherOut.length; - while ( i-- ) { - if ( ( elem = matcherOut[ i ] ) && - ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { - - seed[ temp ] = !( results[ temp ] = elem ); - } - } - } - - // Add elements to results, through postFinder if defined - } else { - matcherOut = condense( - matcherOut === results ? - matcherOut.splice( preexisting, matcherOut.length ) : - matcherOut - ); - if ( postFinder ) { - postFinder( null, results, matcherOut, xml ); - } else { - push.apply( results, matcherOut ); - } - } - } ); -} - -function matcherFromTokens( tokens ) { - var checkContext, matcher, j, - len = tokens.length, - leadingRelative = Expr.relative[ tokens[ 0 ].type ], - implicitRelative = leadingRelative || Expr.relative[ " " ], - i = leadingRelative ? 
1 : 0, - - // The foundational matcher ensures that elements are reachable from top-level context(s) - matchContext = addCombinator( function( elem ) { - return elem === checkContext; - }, implicitRelative, true ), - matchAnyContext = addCombinator( function( elem ) { - return indexOf( checkContext, elem ) > -1; - }, implicitRelative, true ), - matchers = [ function( elem, context, xml ) { - var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( - ( checkContext = context ).nodeType ? - matchContext( elem, context, xml ) : - matchAnyContext( elem, context, xml ) ); - - // Avoid hanging onto element (issue #299) - checkContext = null; - return ret; - } ]; - - for ( ; i < len; i++ ) { - if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { - matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; - } else { - matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); - - // Return special upon seeing a positional matcher - if ( matcher[ expando ] ) { - - // Find the next relative operator (if any) for proper handling - j = ++i; - for ( ; j < len; j++ ) { - if ( Expr.relative[ tokens[ j ].type ] ) { - break; - } - } - return setMatcher( - i > 1 && elementMatcher( matchers ), - i > 1 && toSelector( - - // If the preceding token was a descendant combinator, insert an implicit any-element `*` - tokens - .slice( 0, i - 1 ) - .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) - ).replace( rtrim, "$1" ), - matcher, - i < j && matcherFromTokens( tokens.slice( i, j ) ), - j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), - j < len && toSelector( tokens ) - ); - } - matchers.push( matcher ); - } - } - - return elementMatcher( matchers ); -} - -function matcherFromGroupMatchers( elementMatchers, setMatchers ) { - var bySet = setMatchers.length > 0, - byElement = elementMatchers.length > 0, - superMatcher = function( seed, context, xml, results, outermost ) { - var elem, j, matcher, - matchedCount = 0, - i = "0", - unmatched = seed && [], - setMatched = [], - contextBackup = outermostContext, - - // We must always have either seed elements or outermost context - elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), - - // Use integer dirruns iff this is the outermost matcher - dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), - len = elems.length; - - if ( outermost ) { - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - outermostContext = context == document || context || outermost; - } - - // Add elements passing elementMatchers directly to results - // Support: IE<9, Safari - // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id - for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { - if ( byElement && elem ) { - j = 0; - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( !context && elem.ownerDocument != document ) { - setDocument( elem ); - xml = !documentIsHTML; - } - while ( ( matcher = elementMatchers[ j++ ] ) ) { - if ( matcher( elem, context || document, xml ) ) { - results.push( elem ); - break; - } - } - if ( outermost ) { - dirruns = dirrunsUnique; - } - } - - // Track unmatched elements for set filters - if ( bySet ) { - - // They will have gone through all possible matchers - if ( ( elem = !matcher && elem ) ) { - matchedCount--; - } - - // Lengthen the array for every element, matched or not - if ( seed ) { - unmatched.push( elem ); - } - } - } - - // `i` is now the count of elements visited above, and adding it to `matchedCount` - // makes the latter nonnegative. - matchedCount += i; - - // Apply set filters to unmatched elements - // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` - // equals `i`), unless we didn't visit _any_ elements in the above loop because we have - // no element matchers and no seed. - // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that - // case, which will result in a "00" `matchedCount` that differs from `i` but is also - // numerically zero. 
- if ( bySet && i !== matchedCount ) { - j = 0; - while ( ( matcher = setMatchers[ j++ ] ) ) { - matcher( unmatched, setMatched, context, xml ); - } - - if ( seed ) { - - // Reintegrate element matches to eliminate the need for sorting - if ( matchedCount > 0 ) { - while ( i-- ) { - if ( !( unmatched[ i ] || setMatched[ i ] ) ) { - setMatched[ i ] = pop.call( results ); - } - } - } - - // Discard index placeholder values to get only actual matches - setMatched = condense( setMatched ); - } - - // Add matches to results - push.apply( results, setMatched ); - - // Seedless set matches succeeding multiple successful matchers stipulate sorting - if ( outermost && !seed && setMatched.length > 0 && - ( matchedCount + setMatchers.length ) > 1 ) { - - Sizzle.uniqueSort( results ); - } - } - - // Override manipulation of globals by nested matchers - if ( outermost ) { - dirruns = dirrunsUnique; - outermostContext = contextBackup; - } - - return unmatched; - }; - - return bySet ? - markFunction( superMatcher ) : - superMatcher; -} - -compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { - var i, - setMatchers = [], - elementMatchers = [], - cached = compilerCache[ selector + " " ]; - - if ( !cached ) { - - // Generate a function of recursive functions that can be used to check each element - if ( !match ) { - match = tokenize( selector ); - } - i = match.length; - while ( i-- ) { - cached = matcherFromTokens( match[ i ] ); - if ( cached[ expando ] ) { - setMatchers.push( cached ); - } else { - elementMatchers.push( cached ); - } - } - - // Cache the compiled function - cached = compilerCache( - selector, - matcherFromGroupMatchers( elementMatchers, setMatchers ) - ); - - // Save selector and tokenization - cached.selector = selector; - } - return cached; -}; - -/** - * A low-level selection function that works with Sizzle's compiled - * selector functions - * @param {String|Function} selector A selector or a pre-compiled - * selector function built 
with Sizzle.compile - * @param {Element} context - * @param {Array} [results] - * @param {Array} [seed] A set of elements to match against - */ -select = Sizzle.select = function( selector, context, results, seed ) { - var i, tokens, token, type, find, - compiled = typeof selector === "function" && selector, - match = !seed && tokenize( ( selector = compiled.selector || selector ) ); - - results = results || []; - - // Try to minimize operations if there is only one selector in the list and no seed - // (the latter of which guarantees us context) - if ( match.length === 1 ) { - - // Reduce context if the leading compound selector is an ID - tokens = match[ 0 ] = match[ 0 ].slice( 0 ); - if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && - context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { - - context = ( Expr.find[ "ID" ]( token.matches[ 0 ] - .replace( runescape, funescape ), context ) || [] )[ 0 ]; - if ( !context ) { - return results; - - // Precompiled matchers will still verify ancestry, so step up a level - } else if ( compiled ) { - context = context.parentNode; - } - - selector = selector.slice( tokens.shift().value.length ); - } - - // Fetch a seed set for right-to-left matching - i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; - while ( i-- ) { - token = tokens[ i ]; - - // Abort if we hit a combinator - if ( Expr.relative[ ( type = token.type ) ] ) { - break; - } - if ( ( find = Expr.find[ type ] ) ) { - - // Search, expanding context for leading sibling combinators - if ( ( seed = find( - token.matches[ 0 ].replace( runescape, funescape ), - rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || - context - ) ) ) { - - // If seed is empty or no tokens remain, we can return early - tokens.splice( i, 1 ); - selector = seed.length && toSelector( tokens ); - if ( !selector ) { - push.apply( results, seed ); - return results; - } - - break; - } - } - } - } - - // Compile and execute a filtering function if one is not provided - // Provide `match` to avoid retokenization if we modified the selector above - ( compiled || compile( selector, match ) )( - seed, - context, - !documentIsHTML, - results, - !context || rsibling.test( selector ) && testContext( context.parentNode ) || context - ); - return results; -}; - -// One-time assignments - -// Sort stability -support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; - -// Support: Chrome 14-35+ -// Always assume duplicates if they aren't passed to the comparison function -support.detectDuplicates = !!hasDuplicate; - -// Initialize against the default document -setDocument(); - -// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) -// Detached nodes confoundingly follow *each other* -support.sortDetached = assert( function( el ) { - - // Should return 1, but returns 4 (following) - return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; -} ); - -// Support: IE<8 -// Prevent attribute/property "interpolation" -// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx -if ( !assert( function( el ) { - el.innerHTML = ""; - return el.firstChild.getAttribute( "href" ) === "#"; -} ) ) { - addHandle( "type|href|height|width", function( 
elem, name, isXML ) { - if ( !isXML ) { - return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); - } - } ); -} - -// Support: IE<9 -// Use defaultValue in place of getAttribute("value") -if ( !support.attributes || !assert( function( el ) { - el.innerHTML = ""; - el.firstChild.setAttribute( "value", "" ); - return el.firstChild.getAttribute( "value" ) === ""; -} ) ) { - addHandle( "value", function( elem, _name, isXML ) { - if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { - return elem.defaultValue; - } - } ); -} - -// Support: IE<9 -// Use getAttributeNode to fetch booleans when getAttribute lies -if ( !assert( function( el ) { - return el.getAttribute( "disabled" ) == null; -} ) ) { - addHandle( booleans, function( elem, name, isXML ) { - var val; - if ( !isXML ) { - return elem[ name ] === true ? name.toLowerCase() : - ( val = elem.getAttributeNode( name ) ) && val.specified ? - val.value : - null; - } - } ); -} - -return Sizzle; - -} )( window ); - - - -jQuery.find = Sizzle; -jQuery.expr = Sizzle.selectors; - -// Deprecated -jQuery.expr[ ":" ] = jQuery.expr.pseudos; -jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; -jQuery.text = Sizzle.getText; -jQuery.isXMLDoc = Sizzle.isXML; -jQuery.contains = Sizzle.contains; -jQuery.escapeSelector = Sizzle.escape; - - - - -var dir = function( elem, dir, until ) { - var matched = [], - truncate = until !== undefined; - - while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { - if ( elem.nodeType === 1 ) { - if ( truncate && jQuery( elem ).is( until ) ) { - break; - } - matched.push( elem ); - } - } - return matched; -}; - - -var siblings = function( n, elem ) { - var matched = []; - - for ( ; n; n = n.nextSibling ) { - if ( n.nodeType === 1 && n !== elem ) { - matched.push( n ); - } - } - - return matched; -}; - - -var rneedsContext = jQuery.expr.match.needsContext; - - - -function nodeName( elem, name ) { - - return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); - 
-} -var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); - - - -// Implement the identical functionality for filter and not -function winnow( elements, qualifier, not ) { - if ( isFunction( qualifier ) ) { - return jQuery.grep( elements, function( elem, i ) { - return !!qualifier.call( elem, i, elem ) !== not; - } ); - } - - // Single element - if ( qualifier.nodeType ) { - return jQuery.grep( elements, function( elem ) { - return ( elem === qualifier ) !== not; - } ); - } - - // Arraylike of elements (jQuery, arguments, Array) - if ( typeof qualifier !== "string" ) { - return jQuery.grep( elements, function( elem ) { - return ( indexOf.call( qualifier, elem ) > -1 ) !== not; - } ); - } - - // Filtered directly for both simple and complex selectors - return jQuery.filter( qualifier, elements, not ); -} - -jQuery.filter = function( expr, elems, not ) { - var elem = elems[ 0 ]; - - if ( not ) { - expr = ":not(" + expr + ")"; - } - - if ( elems.length === 1 && elem.nodeType === 1 ) { - return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; - } - - return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { - return elem.nodeType === 1; - } ) ); -}; - -jQuery.fn.extend( { - find: function( selector ) { - var i, ret, - len = this.length, - self = this; - - if ( typeof selector !== "string" ) { - return this.pushStack( jQuery( selector ).filter( function() { - for ( i = 0; i < len; i++ ) { - if ( jQuery.contains( self[ i ], this ) ) { - return true; - } - } - } ) ); - } - - ret = this.pushStack( [] ); - - for ( i = 0; i < len; i++ ) { - jQuery.find( selector, self[ i ], ret ); - } - - return len > 1 ? 
jQuery.uniqueSort( ret ) : ret; - }, - filter: function( selector ) { - return this.pushStack( winnow( this, selector || [], false ) ); - }, - not: function( selector ) { - return this.pushStack( winnow( this, selector || [], true ) ); - }, - is: function( selector ) { - return !!winnow( - this, - - // If this is a positional/relative selector, check membership in the returned set - // so $("p:first").is("p:last") won't return true for a doc with two "p". - typeof selector === "string" && rneedsContext.test( selector ) ? - jQuery( selector ) : - selector || [], - false - ).length; - } -} ); - - -// Initialize a jQuery object - - -// A central reference to the root jQuery(document) -var rootjQuery, - - // A simple way to check for HTML strings - // Prioritize #id over to avoid XSS via location.hash (#9521) - // Strict HTML recognition (#11290: must start with <) - // Shortcut simple #id case for speed - rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, - - init = jQuery.fn.init = function( selector, context, root ) { - var match, elem; - - // HANDLE: $(""), $(null), $(undefined), $(false) - if ( !selector ) { - return this; - } - - // Method init() accepts an alternate rootjQuery - // so migrate can support jQuery.sub (gh-2101) - root = root || rootjQuery; - - // Handle HTML strings - if ( typeof selector === "string" ) { - if ( selector[ 0 ] === "<" && - selector[ selector.length - 1 ] === ">" && - selector.length >= 3 ) { - - // Assume that strings that start and end with <> are HTML and skip the regex check - match = [ null, selector, null ]; - - } else { - match = rquickExpr.exec( selector ); - } - - // Match html or make sure no context is specified for #id - if ( match && ( match[ 1 ] || !context ) ) { - - // HANDLE: $(html) -> $(array) - if ( match[ 1 ] ) { - context = context instanceof jQuery ? 
context[ 0 ] : context; - - // Option to run scripts is true for back-compat - // Intentionally let the error be thrown if parseHTML is not present - jQuery.merge( this, jQuery.parseHTML( - match[ 1 ], - context && context.nodeType ? context.ownerDocument || context : document, - true - ) ); - - // HANDLE: $(html, props) - if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { - for ( match in context ) { - - // Properties of context are called as methods if possible - if ( isFunction( this[ match ] ) ) { - this[ match ]( context[ match ] ); - - // ...and otherwise set as attributes - } else { - this.attr( match, context[ match ] ); - } - } - } - - return this; - - // HANDLE: $(#id) - } else { - elem = document.getElementById( match[ 2 ] ); - - if ( elem ) { - - // Inject the element directly into the jQuery object - this[ 0 ] = elem; - this.length = 1; - } - return this; - } - - // HANDLE: $(expr, $(...)) - } else if ( !context || context.jquery ) { - return ( context || root ).find( selector ); - - // HANDLE: $(expr, context) - // (which is just equivalent to: $(context).find(expr) - } else { - return this.constructor( context ).find( selector ); - } - - // HANDLE: $(DOMElement) - } else if ( selector.nodeType ) { - this[ 0 ] = selector; - this.length = 1; - return this; - - // HANDLE: $(function) - // Shortcut for document ready - } else if ( isFunction( selector ) ) { - return root.ready !== undefined ? 
- root.ready( selector ) : - - // Execute immediately if ready is not present - selector( jQuery ); - } - - return jQuery.makeArray( selector, this ); - }; - -// Give the init function the jQuery prototype for later instantiation -init.prototype = jQuery.fn; - -// Initialize central reference -rootjQuery = jQuery( document ); - - -var rparentsprev = /^(?:parents|prev(?:Until|All))/, - - // Methods guaranteed to produce a unique set when starting from a unique set - guaranteedUnique = { - children: true, - contents: true, - next: true, - prev: true - }; - -jQuery.fn.extend( { - has: function( target ) { - var targets = jQuery( target, this ), - l = targets.length; - - return this.filter( function() { - var i = 0; - for ( ; i < l; i++ ) { - if ( jQuery.contains( this, targets[ i ] ) ) { - return true; - } - } - } ); - }, - - closest: function( selectors, context ) { - var cur, - i = 0, - l = this.length, - matched = [], - targets = typeof selectors !== "string" && jQuery( selectors ); - - // Positional selectors never match, since there's no _selection_ context - if ( !rneedsContext.test( selectors ) ) { - for ( ; i < l; i++ ) { - for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { - - // Always skip document fragments - if ( cur.nodeType < 11 && ( targets ? - targets.index( cur ) > -1 : - - // Don't pass non-elements to Sizzle - cur.nodeType === 1 && - jQuery.find.matchesSelector( cur, selectors ) ) ) { - - matched.push( cur ); - break; - } - } - } - } - - return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); - }, - - // Determine the position of an element within the set - index: function( elem ) { - - // No argument, return index in parent - if ( !elem ) { - return ( this[ 0 ] && this[ 0 ].parentNode ) ? 
this.first().prevAll().length : -1; - } - - // Index in selector - if ( typeof elem === "string" ) { - return indexOf.call( jQuery( elem ), this[ 0 ] ); - } - - // Locate the position of the desired element - return indexOf.call( this, - - // If it receives a jQuery object, the first element is used - elem.jquery ? elem[ 0 ] : elem - ); - }, - - add: function( selector, context ) { - return this.pushStack( - jQuery.uniqueSort( - jQuery.merge( this.get(), jQuery( selector, context ) ) - ) - ); - }, - - addBack: function( selector ) { - return this.add( selector == null ? - this.prevObject : this.prevObject.filter( selector ) - ); - } -} ); - -function sibling( cur, dir ) { - while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} - return cur; -} - -jQuery.each( { - parent: function( elem ) { - var parent = elem.parentNode; - return parent && parent.nodeType !== 11 ? parent : null; - }, - parents: function( elem ) { - return dir( elem, "parentNode" ); - }, - parentsUntil: function( elem, _i, until ) { - return dir( elem, "parentNode", until ); - }, - next: function( elem ) { - return sibling( elem, "nextSibling" ); - }, - prev: function( elem ) { - return sibling( elem, "previousSibling" ); - }, - nextAll: function( elem ) { - return dir( elem, "nextSibling" ); - }, - prevAll: function( elem ) { - return dir( elem, "previousSibling" ); - }, - nextUntil: function( elem, _i, until ) { - return dir( elem, "nextSibling", until ); - }, - prevUntil: function( elem, _i, until ) { - return dir( elem, "previousSibling", until ); - }, - siblings: function( elem ) { - return siblings( ( elem.parentNode || {} ).firstChild, elem ); - }, - children: function( elem ) { - return siblings( elem.firstChild ); - }, - contents: function( elem ) { - if ( elem.contentDocument != null && - - // Support: IE 11+ - // elements with no `data` attribute has an object - // `contentDocument` with a `null` prototype. 
- getProto( elem.contentDocument ) ) { - - return elem.contentDocument; - } - - // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only - // Treat the template element as a regular one in browsers that - // don't support it. - if ( nodeName( elem, "template" ) ) { - elem = elem.content || elem; - } - - return jQuery.merge( [], elem.childNodes ); - } -}, function( name, fn ) { - jQuery.fn[ name ] = function( until, selector ) { - var matched = jQuery.map( this, fn, until ); - - if ( name.slice( -5 ) !== "Until" ) { - selector = until; - } - - if ( selector && typeof selector === "string" ) { - matched = jQuery.filter( selector, matched ); - } - - if ( this.length > 1 ) { - - // Remove duplicates - if ( !guaranteedUnique[ name ] ) { - jQuery.uniqueSort( matched ); - } - - // Reverse order for parents* and prev-derivatives - if ( rparentsprev.test( name ) ) { - matched.reverse(); - } - } - - return this.pushStack( matched ); - }; -} ); -var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); - - - -// Convert String-formatted options into Object-formatted ones -function createOptions( options ) { - var object = {}; - jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { - object[ flag ] = true; - } ); - return object; -} - -/* - * Create a callback list using the following parameters: - * - * options: an optional list of space-separated options that will change how - * the callback list behaves or a more traditional option object - * - * By default a callback list will act like an event callback list and can be - * "fired" multiple times. 
- * - * Possible options: - * - * once: will ensure the callback list can only be fired once (like a Deferred) - * - * memory: will keep track of previous values and will call any callback added - * after the list has been fired right away with the latest "memorized" - * values (like a Deferred) - * - * unique: will ensure a callback can only be added once (no duplicate in the list) - * - * stopOnFalse: interrupt callings when a callback returns false - * - */ -jQuery.Callbacks = function( options ) { - - // Convert options from String-formatted to Object-formatted if needed - // (we check in cache first) - options = typeof options === "string" ? - createOptions( options ) : - jQuery.extend( {}, options ); - - var // Flag to know if list is currently firing - firing, - - // Last fire value for non-forgettable lists - memory, - - // Flag to know if list was already fired - fired, - - // Flag to prevent firing - locked, - - // Actual callback list - list = [], - - // Queue of execution data for repeatable lists - queue = [], - - // Index of currently firing callback (modified by add/remove as needed) - firingIndex = -1, - - // Fire callbacks - fire = function() { - - // Enforce single-firing - locked = locked || options.once; - - // Execute callbacks for all pending executions, - // respecting firingIndex overrides and runtime changes - fired = firing = true; - for ( ; queue.length; firingIndex = -1 ) { - memory = queue.shift(); - while ( ++firingIndex < list.length ) { - - // Run callback and check for early termination - if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && - options.stopOnFalse ) { - - // Jump to end and forget the data so .add doesn't re-fire - firingIndex = list.length; - memory = false; - } - } - } - - // Forget the data if we're done with it - if ( !options.memory ) { - memory = false; - } - - firing = false; - - // Clean up if we're done firing for good - if ( locked ) { - - // Keep an empty list if we have data for future 
add calls - if ( memory ) { - list = []; - - // Otherwise, this object is spent - } else { - list = ""; - } - } - }, - - // Actual Callbacks object - self = { - - // Add a callback or a collection of callbacks to the list - add: function() { - if ( list ) { - - // If we have memory from a past run, we should fire after adding - if ( memory && !firing ) { - firingIndex = list.length - 1; - queue.push( memory ); - } - - ( function add( args ) { - jQuery.each( args, function( _, arg ) { - if ( isFunction( arg ) ) { - if ( !options.unique || !self.has( arg ) ) { - list.push( arg ); - } - } else if ( arg && arg.length && toType( arg ) !== "string" ) { - - // Inspect recursively - add( arg ); - } - } ); - } )( arguments ); - - if ( memory && !firing ) { - fire(); - } - } - return this; - }, - - // Remove a callback from the list - remove: function() { - jQuery.each( arguments, function( _, arg ) { - var index; - while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { - list.splice( index, 1 ); - - // Handle firing indexes - if ( index <= firingIndex ) { - firingIndex--; - } - } - } ); - return this; - }, - - // Check if a given callback is in the list. - // If no argument is given, return whether or not list has callbacks attached. - has: function( fn ) { - return fn ? 
- jQuery.inArray( fn, list ) > -1 : - list.length > 0; - }, - - // Remove all callbacks from the list - empty: function() { - if ( list ) { - list = []; - } - return this; - }, - - // Disable .fire and .add - // Abort any current/pending executions - // Clear all callbacks and values - disable: function() { - locked = queue = []; - list = memory = ""; - return this; - }, - disabled: function() { - return !list; - }, - - // Disable .fire - // Also disable .add unless we have memory (since it would have no effect) - // Abort any pending executions - lock: function() { - locked = queue = []; - if ( !memory && !firing ) { - list = memory = ""; - } - return this; - }, - locked: function() { - return !!locked; - }, - - // Call all callbacks with the given context and arguments - fireWith: function( context, args ) { - if ( !locked ) { - args = args || []; - args = [ context, args.slice ? args.slice() : args ]; - queue.push( args ); - if ( !firing ) { - fire(); - } - } - return this; - }, - - // Call all the callbacks with the given arguments - fire: function() { - self.fireWith( this, arguments ); - return this; - }, - - // To know if the callbacks have already been called at least once - fired: function() { - return !!fired; - } - }; - - return self; -}; - - -function Identity( v ) { - return v; -} -function Thrower( ex ) { - throw ex; -} - -function adoptValue( value, resolve, reject, noValue ) { - var method; - - try { - - // Check for promise aspect first to privilege synchronous behavior - if ( value && isFunction( ( method = value.promise ) ) ) { - method.call( value ).done( resolve ).fail( reject ); - - // Other thenables - } else if ( value && isFunction( ( method = value.then ) ) ) { - method.call( value, resolve, reject ); - - // Other non-thenables - } else { - - // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: - // * false: [ value ].slice( 0 ) => resolve( value ) - // * true: [ value ].slice( 1 ) => resolve() - 
resolve.apply( undefined, [ value ].slice( noValue ) ); - } - - // For Promises/A+, convert exceptions into rejections - // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in - // Deferred#then to conditionally suppress rejection. - } catch ( value ) { - - // Support: Android 4.0 only - // Strict mode functions invoked without .call/.apply get global-object context - reject.apply( undefined, [ value ] ); - } -} - -jQuery.extend( { - - Deferred: function( func ) { - var tuples = [ - - // action, add listener, callbacks, - // ... .then handlers, argument index, [final state] - [ "notify", "progress", jQuery.Callbacks( "memory" ), - jQuery.Callbacks( "memory" ), 2 ], - [ "resolve", "done", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 0, "resolved" ], - [ "reject", "fail", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 1, "rejected" ] - ], - state = "pending", - promise = { - state: function() { - return state; - }, - always: function() { - deferred.done( arguments ).fail( arguments ); - return this; - }, - "catch": function( fn ) { - return promise.then( null, fn ); - }, - - // Keep pipe for back-compat - pipe: function( /* fnDone, fnFail, fnProgress */ ) { - var fns = arguments; - - return jQuery.Deferred( function( newDefer ) { - jQuery.each( tuples, function( _i, tuple ) { - - // Map tuples (progress, done, fail) to arguments (done, fail, progress) - var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; - - // deferred.progress(function() { bind to newDefer or newDefer.notify }) - // deferred.done(function() { bind to newDefer or newDefer.resolve }) - // deferred.fail(function() { bind to newDefer or newDefer.reject }) - deferred[ tuple[ 1 ] ]( function() { - var returned = fn && fn.apply( this, arguments ); - if ( returned && isFunction( returned.promise ) ) { - returned.promise() - .progress( newDefer.notify ) - .done( newDefer.resolve ) - .fail( newDefer.reject ); - } 
else { - newDefer[ tuple[ 0 ] + "With" ]( - this, - fn ? [ returned ] : arguments - ); - } - } ); - } ); - fns = null; - } ).promise(); - }, - then: function( onFulfilled, onRejected, onProgress ) { - var maxDepth = 0; - function resolve( depth, deferred, handler, special ) { - return function() { - var that = this, - args = arguments, - mightThrow = function() { - var returned, then; - - // Support: Promises/A+ section 2.3.3.3.3 - // https://promisesaplus.com/#point-59 - // Ignore double-resolution attempts - if ( depth < maxDepth ) { - return; - } - - returned = handler.apply( that, args ); - - // Support: Promises/A+ section 2.3.1 - // https://promisesaplus.com/#point-48 - if ( returned === deferred.promise() ) { - throw new TypeError( "Thenable self-resolution" ); - } - - // Support: Promises/A+ sections 2.3.3.1, 3.5 - // https://promisesaplus.com/#point-54 - // https://promisesaplus.com/#point-75 - // Retrieve `then` only once - then = returned && - - // Support: Promises/A+ section 2.3.4 - // https://promisesaplus.com/#point-64 - // Only check objects and functions for thenability - ( typeof returned === "object" || - typeof returned === "function" ) && - returned.then; - - // Handle a returned thenable - if ( isFunction( then ) ) { - - // Special processors (notify) just wait for resolution - if ( special ) { - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ) - ); - - // Normal processors (resolve) also hook into progress - } else { - - // ...and disregard older resolution values - maxDepth++; - - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ), - resolve( maxDepth, deferred, Identity, - deferred.notifyWith ) - ); - } - - // Handle all other returned values - } else { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Identity ) { - that = 
undefined; - args = [ returned ]; - } - - // Process the value(s) - // Default process is resolve - ( special || deferred.resolveWith )( that, args ); - } - }, - - // Only normal processors (resolve) catch and reject exceptions - process = special ? - mightThrow : - function() { - try { - mightThrow(); - } catch ( e ) { - - if ( jQuery.Deferred.exceptionHook ) { - jQuery.Deferred.exceptionHook( e, - process.stackTrace ); - } - - // Support: Promises/A+ section 2.3.3.3.4.1 - // https://promisesaplus.com/#point-61 - // Ignore post-resolution exceptions - if ( depth + 1 >= maxDepth ) { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Thrower ) { - that = undefined; - args = [ e ]; - } - - deferred.rejectWith( that, args ); - } - } - }; - - // Support: Promises/A+ section 2.3.3.3.1 - // https://promisesaplus.com/#point-57 - // Re-resolve promises immediately to dodge false rejection from - // subsequent errors - if ( depth ) { - process(); - } else { - - // Call an optional hook to record the stack, in case of exception - // since it's otherwise lost when execution goes async - if ( jQuery.Deferred.getStackHook ) { - process.stackTrace = jQuery.Deferred.getStackHook(); - } - window.setTimeout( process ); - } - }; - } - - return jQuery.Deferred( function( newDefer ) { - - // progress_handlers.add( ... ) - tuples[ 0 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onProgress ) ? - onProgress : - Identity, - newDefer.notifyWith - ) - ); - - // fulfilled_handlers.add( ... ) - tuples[ 1 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onFulfilled ) ? - onFulfilled : - Identity - ) - ); - - // rejected_handlers.add( ... ) - tuples[ 2 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onRejected ) ? 
- onRejected : - Thrower - ) - ); - } ).promise(); - }, - - // Get a promise for this deferred - // If obj is provided, the promise aspect is added to the object - promise: function( obj ) { - return obj != null ? jQuery.extend( obj, promise ) : promise; - } - }, - deferred = {}; - - // Add list-specific methods - jQuery.each( tuples, function( i, tuple ) { - var list = tuple[ 2 ], - stateString = tuple[ 5 ]; - - // promise.progress = list.add - // promise.done = list.add - // promise.fail = list.add - promise[ tuple[ 1 ] ] = list.add; - - // Handle state - if ( stateString ) { - list.add( - function() { - - // state = "resolved" (i.e., fulfilled) - // state = "rejected" - state = stateString; - }, - - // rejected_callbacks.disable - // fulfilled_callbacks.disable - tuples[ 3 - i ][ 2 ].disable, - - // rejected_handlers.disable - // fulfilled_handlers.disable - tuples[ 3 - i ][ 3 ].disable, - - // progress_callbacks.lock - tuples[ 0 ][ 2 ].lock, - - // progress_handlers.lock - tuples[ 0 ][ 3 ].lock - ); - } - - // progress_handlers.fire - // fulfilled_handlers.fire - // rejected_handlers.fire - list.add( tuple[ 3 ].fire ); - - // deferred.notify = function() { deferred.notifyWith(...) } - // deferred.resolve = function() { deferred.resolveWith(...) } - // deferred.reject = function() { deferred.rejectWith(...) } - deferred[ tuple[ 0 ] ] = function() { - deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); - return this; - }; - - // deferred.notifyWith = list.fireWith - // deferred.resolveWith = list.fireWith - // deferred.rejectWith = list.fireWith - deferred[ tuple[ 0 ] + "With" ] = list.fireWith; - } ); - - // Make the deferred a promise - promise.promise( deferred ); - - // Call given func if any - if ( func ) { - func.call( deferred, deferred ); - } - - // All done! 
- return deferred; - }, - - // Deferred helper - when: function( singleValue ) { - var - - // count of uncompleted subordinates - remaining = arguments.length, - - // count of unprocessed arguments - i = remaining, - - // subordinate fulfillment data - resolveContexts = Array( i ), - resolveValues = slice.call( arguments ), - - // the primary Deferred - primary = jQuery.Deferred(), - - // subordinate callback factory - updateFunc = function( i ) { - return function( value ) { - resolveContexts[ i ] = this; - resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; - if ( !( --remaining ) ) { - primary.resolveWith( resolveContexts, resolveValues ); - } - }; - }; - - // Single- and empty arguments are adopted like Promise.resolve - if ( remaining <= 1 ) { - adoptValue( singleValue, primary.done( updateFunc( i ) ).resolve, primary.reject, - !remaining ); - - // Use .then() to unwrap secondary thenables (cf. gh-3000) - if ( primary.state() === "pending" || - isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { - - return primary.then(); - } - } - - // Multiple arguments are aggregated like Promise.all array elements - while ( i-- ) { - adoptValue( resolveValues[ i ], updateFunc( i ), primary.reject ); - } - - return primary.promise(); - } -} ); - - -// These usually indicate a programmer mistake during development, -// warn about them ASAP rather than swallowing them by default. 
-var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; - -jQuery.Deferred.exceptionHook = function( error, stack ) { - - // Support: IE 8 - 9 only - // Console exists when dev tools are open, which can happen at any time - if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { - window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); - } -}; - - - - -jQuery.readyException = function( error ) { - window.setTimeout( function() { - throw error; - } ); -}; - - - - -// The deferred used on DOM ready -var readyList = jQuery.Deferred(); - -jQuery.fn.ready = function( fn ) { - - readyList - .then( fn ) - - // Wrap jQuery.readyException in a function so that the lookup - // happens at the time of error handling instead of callback - // registration. - .catch( function( error ) { - jQuery.readyException( error ); - } ); - - return this; -}; - -jQuery.extend( { - - // Is the DOM ready to be used? Set to true once it occurs. - isReady: false, - - // A counter to track how many items to wait for before - // the ready event fires. See #6781 - readyWait: 1, - - // Handle when the DOM is ready - ready: function( wait ) { - - // Abort if there are pending holds or we're already ready - if ( wait === true ? 
--jQuery.readyWait : jQuery.isReady ) { - return; - } - - // Remember that the DOM is ready - jQuery.isReady = true; - - // If a normal DOM Ready event fired, decrement, and wait if need be - if ( wait !== true && --jQuery.readyWait > 0 ) { - return; - } - - // If there are functions bound, to execute - readyList.resolveWith( document, [ jQuery ] ); - } -} ); - -jQuery.ready.then = readyList.then; - -// The ready event handler and self cleanup method -function completed() { - document.removeEventListener( "DOMContentLoaded", completed ); - window.removeEventListener( "load", completed ); - jQuery.ready(); -} - -// Catch cases where $(document).ready() is called -// after the browser event has already occurred. -// Support: IE <=9 - 10 only -// Older IE sometimes signals "interactive" too soon -if ( document.readyState === "complete" || - ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { - - // Handle it asynchronously to allow scripts the opportunity to delay ready - window.setTimeout( jQuery.ready ); - -} else { - - // Use the handy event callback - document.addEventListener( "DOMContentLoaded", completed ); - - // A fallback to window.onload, that will always work - window.addEventListener( "load", completed ); -} - - - - -// Multifunctional method to get and set values of a collection -// The value/s can optionally be executed if it's a function -var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { - var i = 0, - len = elems.length, - bulk = key == null; - - // Sets many values - if ( toType( key ) === "object" ) { - chainable = true; - for ( i in key ) { - access( elems, fn, i, key[ i ], true, emptyGet, raw ); - } - - // Sets one value - } else if ( value !== undefined ) { - chainable = true; - - if ( !isFunction( value ) ) { - raw = true; - } - - if ( bulk ) { - - // Bulk operations run against the entire set - if ( raw ) { - fn.call( elems, value ); - fn = null; - - // ...except when executing function 
values - } else { - bulk = fn; - fn = function( elem, _key, value ) { - return bulk.call( jQuery( elem ), value ); - }; - } - } - - if ( fn ) { - for ( ; i < len; i++ ) { - fn( - elems[ i ], key, raw ? - value : - value.call( elems[ i ], i, fn( elems[ i ], key ) ) - ); - } - } - } - - if ( chainable ) { - return elems; - } - - // Gets - if ( bulk ) { - return fn.call( elems ); - } - - return len ? fn( elems[ 0 ], key ) : emptyGet; -}; - - -// Matches dashed string for camelizing -var rmsPrefix = /^-ms-/, - rdashAlpha = /-([a-z])/g; - -// Used by camelCase as callback to replace() -function fcamelCase( _all, letter ) { - return letter.toUpperCase(); -} - -// Convert dashed to camelCase; used by the css and data modules -// Support: IE <=9 - 11, Edge 12 - 15 -// Microsoft forgot to hump their vendor prefix (#9572) -function camelCase( string ) { - return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); -} -var acceptData = function( owner ) { - - // Accepts only: - // - Node - // - Node.ELEMENT_NODE - // - Node.DOCUMENT_NODE - // - Object - // - Any - return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); -}; - - - - -function Data() { - this.expando = jQuery.expando + Data.uid++; -} - -Data.uid = 1; - -Data.prototype = { - - cache: function( owner ) { - - // Check if the owner object already has a cache - var value = owner[ this.expando ]; - - // If not, create one - if ( !value ) { - value = {}; - - // We can accept data for non-element nodes in modern browsers, - // but we should not, see #8335. - // Always return an empty object. 
- if ( acceptData( owner ) ) { - - // If it is a node unlikely to be stringify-ed or looped over - // use plain assignment - if ( owner.nodeType ) { - owner[ this.expando ] = value; - - // Otherwise secure it in a non-enumerable property - // configurable must be true to allow the property to be - // deleted when data is removed - } else { - Object.defineProperty( owner, this.expando, { - value: value, - configurable: true - } ); - } - } - } - - return value; - }, - set: function( owner, data, value ) { - var prop, - cache = this.cache( owner ); - - // Handle: [ owner, key, value ] args - // Always use camelCase key (gh-2257) - if ( typeof data === "string" ) { - cache[ camelCase( data ) ] = value; - - // Handle: [ owner, { properties } ] args - } else { - - // Copy the properties one-by-one to the cache object - for ( prop in data ) { - cache[ camelCase( prop ) ] = data[ prop ]; - } - } - return cache; - }, - get: function( owner, key ) { - return key === undefined ? - this.cache( owner ) : - - // Always use camelCase key (gh-2257) - owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; - }, - access: function( owner, key, value ) { - - // In cases where either: - // - // 1. No key was specified - // 2. A string key was specified, but no value provided - // - // Take the "read" path and allow the get method to determine - // which value to return, respectively either: - // - // 1. The entire cache object - // 2. The data stored at the key - // - if ( key === undefined || - ( ( key && typeof key === "string" ) && value === undefined ) ) { - - return this.get( owner, key ); - } - - // When the key is not a string, or both a key and value - // are specified, set or extend (existing objects) with either: - // - // 1. An object of properties - // 2. 
A key and value - // - this.set( owner, key, value ); - - // Since the "set" path can have two possible entry points - // return the expected data based on which path was taken[*] - return value !== undefined ? value : key; - }, - remove: function( owner, key ) { - var i, - cache = owner[ this.expando ]; - - if ( cache === undefined ) { - return; - } - - if ( key !== undefined ) { - - // Support array or space separated string of keys - if ( Array.isArray( key ) ) { - - // If key is an array of keys... - // We always set camelCase keys, so remove that. - key = key.map( camelCase ); - } else { - key = camelCase( key ); - - // If a key with the spaces exists, use it. - // Otherwise, create an array by matching non-whitespace - key = key in cache ? - [ key ] : - ( key.match( rnothtmlwhite ) || [] ); - } - - i = key.length; - - while ( i-- ) { - delete cache[ key[ i ] ]; - } - } - - // Remove the expando if there's no more data - if ( key === undefined || jQuery.isEmptyObject( cache ) ) { - - // Support: Chrome <=35 - 45 - // Webkit & Blink performance suffers when deleting properties - // from DOM nodes, so set to undefined instead - // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) - if ( owner.nodeType ) { - owner[ this.expando ] = undefined; - } else { - delete owner[ this.expando ]; - } - } - }, - hasData: function( owner ) { - var cache = owner[ this.expando ]; - return cache !== undefined && !jQuery.isEmptyObject( cache ); - } -}; -var dataPriv = new Data(); - -var dataUser = new Data(); - - - -// Implementation Summary -// -// 1. Enforce API surface and semantic compatibility with 1.9.x branch -// 2. Improve the module's maintainability by reducing the storage -// paths to a single mechanism. -// 3. Use the same single mechanism to support "private" and "user" data. -// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) -// 5. Avoid exposing implementation details on user objects (eg. 
expando properties) -// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 - -var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, - rmultiDash = /[A-Z]/g; - -function getData( data ) { - if ( data === "true" ) { - return true; - } - - if ( data === "false" ) { - return false; - } - - if ( data === "null" ) { - return null; - } - - // Only convert to a number if it doesn't change the string - if ( data === +data + "" ) { - return +data; - } - - if ( rbrace.test( data ) ) { - return JSON.parse( data ); - } - - return data; -} - -function dataAttr( elem, key, data ) { - var name; - - // If nothing was found internally, try to fetch any - // data from the HTML5 data-* attribute - if ( data === undefined && elem.nodeType === 1 ) { - name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); - data = elem.getAttribute( name ); - - if ( typeof data === "string" ) { - try { - data = getData( data ); - } catch ( e ) {} - - // Make sure we set the data so it isn't changed later - dataUser.set( elem, key, data ); - } else { - data = undefined; - } - } - return data; -} - -jQuery.extend( { - hasData: function( elem ) { - return dataUser.hasData( elem ) || dataPriv.hasData( elem ); - }, - - data: function( elem, name, data ) { - return dataUser.access( elem, name, data ); - }, - - removeData: function( elem, name ) { - dataUser.remove( elem, name ); - }, - - // TODO: Now that all calls to _data and _removeData have been replaced - // with direct calls to dataPriv methods, these can be deprecated. 
- _data: function( elem, name, data ) { - return dataPriv.access( elem, name, data ); - }, - - _removeData: function( elem, name ) { - dataPriv.remove( elem, name ); - } -} ); - -jQuery.fn.extend( { - data: function( key, value ) { - var i, name, data, - elem = this[ 0 ], - attrs = elem && elem.attributes; - - // Gets all values - if ( key === undefined ) { - if ( this.length ) { - data = dataUser.get( elem ); - - if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { - i = attrs.length; - while ( i-- ) { - - // Support: IE 11 only - // The attrs elements can be null (#14894) - if ( attrs[ i ] ) { - name = attrs[ i ].name; - if ( name.indexOf( "data-" ) === 0 ) { - name = camelCase( name.slice( 5 ) ); - dataAttr( elem, name, data[ name ] ); - } - } - } - dataPriv.set( elem, "hasDataAttrs", true ); - } - } - - return data; - } - - // Sets multiple values - if ( typeof key === "object" ) { - return this.each( function() { - dataUser.set( this, key ); - } ); - } - - return access( this, function( value ) { - var data; - - // The calling jQuery object (element matches) is not empty - // (and therefore has an element appears at this[ 0 ]) and the - // `value` parameter was not undefined. An empty jQuery object - // will result in `undefined` for elem = this[ 0 ] which will - // throw an exception if an attempt to read a data cache is made. - if ( elem && value === undefined ) { - - // Attempt to get data from the cache - // The key will always be camelCased in Data - data = dataUser.get( elem, key ); - if ( data !== undefined ) { - return data; - } - - // Attempt to "discover" the data in - // HTML5 custom data-* attrs - data = dataAttr( elem, key ); - if ( data !== undefined ) { - return data; - } - - // We tried really hard, but the data doesn't exist. - return; - } - - // Set the data... 
- this.each( function() { - - // We always store the camelCased key - dataUser.set( this, key, value ); - } ); - }, null, value, arguments.length > 1, null, true ); - }, - - removeData: function( key ) { - return this.each( function() { - dataUser.remove( this, key ); - } ); - } -} ); - - -jQuery.extend( { - queue: function( elem, type, data ) { - var queue; - - if ( elem ) { - type = ( type || "fx" ) + "queue"; - queue = dataPriv.get( elem, type ); - - // Speed up dequeue by getting out quickly if this is just a lookup - if ( data ) { - if ( !queue || Array.isArray( data ) ) { - queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); - } else { - queue.push( data ); - } - } - return queue || []; - } - }, - - dequeue: function( elem, type ) { - type = type || "fx"; - - var queue = jQuery.queue( elem, type ), - startLength = queue.length, - fn = queue.shift(), - hooks = jQuery._queueHooks( elem, type ), - next = function() { - jQuery.dequeue( elem, type ); - }; - - // If the fx queue is dequeued, always remove the progress sentinel - if ( fn === "inprogress" ) { - fn = queue.shift(); - startLength--; - } - - if ( fn ) { - - // Add a progress sentinel to prevent the fx queue from being - // automatically dequeued - if ( type === "fx" ) { - queue.unshift( "inprogress" ); - } - - // Clear up the last queue stop function - delete hooks.stop; - fn.call( elem, next, hooks ); - } - - if ( !startLength && hooks ) { - hooks.empty.fire(); - } - }, - - // Not public - generate a queueHooks object, or return the current one - _queueHooks: function( elem, type ) { - var key = type + "queueHooks"; - return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { - empty: jQuery.Callbacks( "once memory" ).add( function() { - dataPriv.remove( elem, [ type + "queue", key ] ); - } ) - } ); - } -} ); - -jQuery.fn.extend( { - queue: function( type, data ) { - var setter = 2; - - if ( typeof type !== "string" ) { - data = type; - type = "fx"; - setter--; - } - - if ( 
arguments.length < setter ) { - return jQuery.queue( this[ 0 ], type ); - } - - return data === undefined ? - this : - this.each( function() { - var queue = jQuery.queue( this, type, data ); - - // Ensure a hooks for this queue - jQuery._queueHooks( this, type ); - - if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { - jQuery.dequeue( this, type ); - } - } ); - }, - dequeue: function( type ) { - return this.each( function() { - jQuery.dequeue( this, type ); - } ); - }, - clearQueue: function( type ) { - return this.queue( type || "fx", [] ); - }, - - // Get a promise resolved when queues of a certain type - // are emptied (fx is the type by default) - promise: function( type, obj ) { - var tmp, - count = 1, - defer = jQuery.Deferred(), - elements = this, - i = this.length, - resolve = function() { - if ( !( --count ) ) { - defer.resolveWith( elements, [ elements ] ); - } - }; - - if ( typeof type !== "string" ) { - obj = type; - type = undefined; - } - type = type || "fx"; - - while ( i-- ) { - tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); - if ( tmp && tmp.empty ) { - count++; - tmp.empty.add( resolve ); - } - } - resolve(); - return defer.promise( obj ); - } -} ); -var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; - -var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); - - -var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; - -var documentElement = document.documentElement; - - - - var isAttached = function( elem ) { - return jQuery.contains( elem.ownerDocument, elem ); - }, - composed = { composed: true }; - - // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only - // Check attachment across shadow DOM boundaries when possible (gh-3504) - // Support: iOS 10.0-10.2 only - // Early iOS 10 versions support `attachShadow` but not `getRootNode`, - // leading to errors. We need to check for `getRootNode`. 
- if ( documentElement.getRootNode ) { - isAttached = function( elem ) { - return jQuery.contains( elem.ownerDocument, elem ) || - elem.getRootNode( composed ) === elem.ownerDocument; - }; - } -var isHiddenWithinTree = function( elem, el ) { - - // isHiddenWithinTree might be called from jQuery#filter function; - // in that case, element will be second argument - elem = el || elem; - - // Inline style trumps all - return elem.style.display === "none" || - elem.style.display === "" && - - // Otherwise, check computed style - // Support: Firefox <=43 - 45 - // Disconnected elements can have computed display: none, so first confirm that elem is - // in the document. - isAttached( elem ) && - - jQuery.css( elem, "display" ) === "none"; - }; - - - -function adjustCSS( elem, prop, valueParts, tween ) { - var adjusted, scale, - maxIterations = 20, - currentValue = tween ? - function() { - return tween.cur(); - } : - function() { - return jQuery.css( elem, prop, "" ); - }, - initial = currentValue(), - unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), - - // Starting value computation is required for potential unit mismatches - initialInUnit = elem.nodeType && - ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && - rcssNum.exec( jQuery.css( elem, prop ) ); - - if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { - - // Support: Firefox <=54 - // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) - initial = initial / 2; - - // Trust units reported by jQuery.css - unit = unit || initialInUnit[ 3 ]; - - // Iteratively approximate from a nonzero starting point - initialInUnit = +initial || 1; - - while ( maxIterations-- ) { - - // Evaluate and update our best guess (doubling guesses that zero out). - // Finish if the scale equals or crosses 1 (making the old*new product non-positive). 
- jQuery.style( elem, prop, initialInUnit + unit ); - if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { - maxIterations = 0; - } - initialInUnit = initialInUnit / scale; - - } - - initialInUnit = initialInUnit * 2; - jQuery.style( elem, prop, initialInUnit + unit ); - - // Make sure we update the tween properties later on - valueParts = valueParts || []; - } - - if ( valueParts ) { - initialInUnit = +initialInUnit || +initial || 0; - - // Apply relative offset (+=/-=) if specified - adjusted = valueParts[ 1 ] ? - initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : - +valueParts[ 2 ]; - if ( tween ) { - tween.unit = unit; - tween.start = initialInUnit; - tween.end = adjusted; - } - } - return adjusted; -} - - -var defaultDisplayMap = {}; - -function getDefaultDisplay( elem ) { - var temp, - doc = elem.ownerDocument, - nodeName = elem.nodeName, - display = defaultDisplayMap[ nodeName ]; - - if ( display ) { - return display; - } - - temp = doc.body.appendChild( doc.createElement( nodeName ) ); - display = jQuery.css( temp, "display" ); - - temp.parentNode.removeChild( temp ); - - if ( display === "none" ) { - display = "block"; - } - defaultDisplayMap[ nodeName ] = display; - - return display; -} - -function showHide( elements, show ) { - var display, elem, - values = [], - index = 0, - length = elements.length; - - // Determine new display value for elements that need to change - for ( ; index < length; index++ ) { - elem = elements[ index ]; - if ( !elem.style ) { - continue; - } - - display = elem.style.display; - if ( show ) { - - // Since we force visibility upon cascade-hidden elements, an immediate (and slow) - // check is required in this first loop unless we have a nonempty display value (either - // inline or about-to-be-restored) - if ( display === "none" ) { - values[ index ] = dataPriv.get( elem, "display" ) || null; - if ( !values[ index ] ) { - elem.style.display = ""; - } - } - if ( elem.style.display === "" && 
isHiddenWithinTree( elem ) ) { - values[ index ] = getDefaultDisplay( elem ); - } - } else { - if ( display !== "none" ) { - values[ index ] = "none"; - - // Remember what we're overwriting - dataPriv.set( elem, "display", display ); - } - } - } - - // Set the display of the elements in a second loop to avoid constant reflow - for ( index = 0; index < length; index++ ) { - if ( values[ index ] != null ) { - elements[ index ].style.display = values[ index ]; - } - } - - return elements; -} - -jQuery.fn.extend( { - show: function() { - return showHide( this, true ); - }, - hide: function() { - return showHide( this ); - }, - toggle: function( state ) { - if ( typeof state === "boolean" ) { - return state ? this.show() : this.hide(); - } - - return this.each( function() { - if ( isHiddenWithinTree( this ) ) { - jQuery( this ).show(); - } else { - jQuery( this ).hide(); - } - } ); - } -} ); -var rcheckableType = ( /^(?:checkbox|radio)$/i ); - -var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); - -var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); - - - -( function() { - var fragment = document.createDocumentFragment(), - div = fragment.appendChild( document.createElement( "div" ) ), - input = document.createElement( "input" ); - - // Support: Android 4.0 - 4.3 only - // Check state lost if the name is set (#11217) - // Support: Windows Web Apps (WWA) - // `name` and `type` must use .setAttribute for WWA (#14901) - input.setAttribute( "type", "radio" ); - input.setAttribute( "checked", "checked" ); - input.setAttribute( "name", "t" ); - - div.appendChild( input ); - - // Support: Android <=4.1 only - // Older WebKit doesn't clone checked state correctly in fragments - support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; - - // Support: IE <=11 only - // Make sure textarea (and checkbox) defaultValue is properly cloned - div.innerHTML = ""; - support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; - - // Support: IE 
<=9 only - // IE <=9 replaces "; - support.option = !!div.lastChild; -} )(); - - -// We have to close these tags to support XHTML (#13200) -var wrapMap = { - - // XHTML parsers do not magically insert elements in the - // same way that tag soup parsers do. So we cannot shorten - // this by omitting or other required elements. - thead: [ 1, "", "
    " ], - col: [ 2, "", "
    " ], - tr: [ 2, "", "
    " ], - td: [ 3, "", "
    " ], - - _default: [ 0, "", "" ] -}; - -wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; -wrapMap.th = wrapMap.td; - -// Support: IE <=9 only -if ( !support.option ) { - wrapMap.optgroup = wrapMap.option = [ 1, "" ]; -} - - -function getAll( context, tag ) { - - // Support: IE <=9 - 11 only - // Use typeof to avoid zero-argument method invocation on host objects (#15151) - var ret; - - if ( typeof context.getElementsByTagName !== "undefined" ) { - ret = context.getElementsByTagName( tag || "*" ); - - } else if ( typeof context.querySelectorAll !== "undefined" ) { - ret = context.querySelectorAll( tag || "*" ); - - } else { - ret = []; - } - - if ( tag === undefined || tag && nodeName( context, tag ) ) { - return jQuery.merge( [ context ], ret ); - } - - return ret; -} - - -// Mark scripts as having already been evaluated -function setGlobalEval( elems, refElements ) { - var i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - dataPriv.set( - elems[ i ], - "globalEval", - !refElements || dataPriv.get( refElements[ i ], "globalEval" ) - ); - } -} - - -var rhtml = /<|&#?\w+;/; - -function buildFragment( elems, context, scripts, selection, ignored ) { - var elem, tmp, tag, wrap, attached, j, - fragment = context.createDocumentFragment(), - nodes = [], - i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - elem = elems[ i ]; - - if ( elem || elem === 0 ) { - - // Add nodes directly - if ( toType( elem ) === "object" ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, elem.nodeType ? 
[ elem ] : elem ); - - // Convert non-html into a text node - } else if ( !rhtml.test( elem ) ) { - nodes.push( context.createTextNode( elem ) ); - - // Convert html into DOM nodes - } else { - tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); - - // Deserialize a standard representation - tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); - wrap = wrapMap[ tag ] || wrapMap._default; - tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; - - // Descend through wrappers to the right content - j = wrap[ 0 ]; - while ( j-- ) { - tmp = tmp.lastChild; - } - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, tmp.childNodes ); - - // Remember the top-level container - tmp = fragment.firstChild; - - // Ensure the created nodes are orphaned (#12392) - tmp.textContent = ""; - } - } - } - - // Remove wrapper from fragment - fragment.textContent = ""; - - i = 0; - while ( ( elem = nodes[ i++ ] ) ) { - - // Skip elements already in the context collection (trac-4087) - if ( selection && jQuery.inArray( elem, selection ) > -1 ) { - if ( ignored ) { - ignored.push( elem ); - } - continue; - } - - attached = isAttached( elem ); - - // Append to fragment - tmp = getAll( fragment.appendChild( elem ), "script" ); - - // Preserve script evaluation history - if ( attached ) { - setGlobalEval( tmp ); - } - - // Capture executables - if ( scripts ) { - j = 0; - while ( ( elem = tmp[ j++ ] ) ) { - if ( rscriptType.test( elem.type || "" ) ) { - scripts.push( elem ); - } - } - } - } - - return fragment; -} - - -var rtypenamespace = /^([^.]*)(?:\.(.+)|)/; - -function returnTrue() { - return true; -} - -function returnFalse() { - return false; -} - -// Support: IE <=9 - 11+ -// focus() and blur() are asynchronous, except when they are no-op. 
-// So expect focus to be synchronous when the element is already active, -// and blur to be synchronous when the element is not already active. -// (focus and blur are always synchronous in other supported browsers, -// this just defines when we can count on it). -function expectSync( elem, type ) { - return ( elem === safeActiveElement() ) === ( type === "focus" ); -} - -// Support: IE <=9 only -// Accessing document.activeElement can throw unexpectedly -// https://bugs.jquery.com/ticket/13393 -function safeActiveElement() { - try { - return document.activeElement; - } catch ( err ) { } -} - -function on( elem, types, selector, data, fn, one ) { - var origFn, type; - - // Types can be a map of types/handlers - if ( typeof types === "object" ) { - - // ( types-Object, selector, data ) - if ( typeof selector !== "string" ) { - - // ( types-Object, data ) - data = data || selector; - selector = undefined; - } - for ( type in types ) { - on( elem, type, selector, data, types[ type ], one ); - } - return elem; - } - - if ( data == null && fn == null ) { - - // ( types, fn ) - fn = selector; - data = selector = undefined; - } else if ( fn == null ) { - if ( typeof selector === "string" ) { - - // ( types, selector, fn ) - fn = data; - data = undefined; - } else { - - // ( types, data, fn ) - fn = data; - data = selector; - selector = undefined; - } - } - if ( fn === false ) { - fn = returnFalse; - } else if ( !fn ) { - return elem; - } - - if ( one === 1 ) { - origFn = fn; - fn = function( event ) { - - // Can use an empty set, since event contains the info - jQuery().off( event ); - return origFn.apply( this, arguments ); - }; - - // Use same guid so caller can remove using origFn - fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); - } - return elem.each( function() { - jQuery.event.add( this, types, fn, data, selector ); - } ); -} - -/* - * Helper functions for managing events -- not part of the public interface. 
- * Props to Dean Edwards' addEvent library for many of the ideas. - */ -jQuery.event = { - - global: {}, - - add: function( elem, types, handler, data, selector ) { - - var handleObjIn, eventHandle, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.get( elem ); - - // Only attach events to objects that accept data - if ( !acceptData( elem ) ) { - return; - } - - // Caller can pass in an object of custom data in lieu of the handler - if ( handler.handler ) { - handleObjIn = handler; - handler = handleObjIn.handler; - selector = handleObjIn.selector; - } - - // Ensure that invalid selectors throw exceptions at attach time - // Evaluate against documentElement in case elem is a non-element node (e.g., document) - if ( selector ) { - jQuery.find.matchesSelector( documentElement, selector ); - } - - // Make sure that the handler has a unique ID, used to find/remove it later - if ( !handler.guid ) { - handler.guid = jQuery.guid++; - } - - // Init the element's event structure and main handler, if this is the first - if ( !( events = elemData.events ) ) { - events = elemData.events = Object.create( null ); - } - if ( !( eventHandle = elemData.handle ) ) { - eventHandle = elemData.handle = function( e ) { - - // Discard the second event of a jQuery.event.trigger() and - // when an event is called after a page has unloaded - return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? - jQuery.event.dispatch.apply( elem, arguments ) : undefined; - }; - } - - // Handle multiple events separated by a space - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." 
).sort(); - - // There *must* be a type, no attaching namespace-only handlers - if ( !type ) { - continue; - } - - // If event changes its type, use the special event handlers for the changed type - special = jQuery.event.special[ type ] || {}; - - // If selector defined, determine special event api type, otherwise given type - type = ( selector ? special.delegateType : special.bindType ) || type; - - // Update special based on newly reset type - special = jQuery.event.special[ type ] || {}; - - // handleObj is passed to all event handlers - handleObj = jQuery.extend( { - type: type, - origType: origType, - data: data, - handler: handler, - guid: handler.guid, - selector: selector, - needsContext: selector && jQuery.expr.match.needsContext.test( selector ), - namespace: namespaces.join( "." ) - }, handleObjIn ); - - // Init the event handler queue if we're the first - if ( !( handlers = events[ type ] ) ) { - handlers = events[ type ] = []; - handlers.delegateCount = 0; - - // Only use addEventListener if the special events handler returns false - if ( !special.setup || - special.setup.call( elem, data, namespaces, eventHandle ) === false ) { - - if ( elem.addEventListener ) { - elem.addEventListener( type, eventHandle ); - } - } - } - - if ( special.add ) { - special.add.call( elem, handleObj ); - - if ( !handleObj.handler.guid ) { - handleObj.handler.guid = handler.guid; - } - } - - // Add to the element's handler list, delegates in front - if ( selector ) { - handlers.splice( handlers.delegateCount++, 0, handleObj ); - } else { - handlers.push( handleObj ); - } - - // Keep track of which events have ever been used, for event optimization - jQuery.event.global[ type ] = true; - } - - }, - - // Detach an event or set of events from an element - remove: function( elem, types, handler, selector, mappedTypes ) { - - var j, origCount, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.hasData( elem ) && dataPriv.get( 
elem ); - - if ( !elemData || !( events = elemData.events ) ) { - return; - } - - // Once for each type.namespace in types; type may be omitted - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); - - // Unbind all events (on this namespace, if provided) for the element - if ( !type ) { - for ( type in events ) { - jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); - } - continue; - } - - special = jQuery.event.special[ type ] || {}; - type = ( selector ? special.delegateType : special.bindType ) || type; - handlers = events[ type ] || []; - tmp = tmp[ 2 ] && - new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); - - // Remove matching events - origCount = j = handlers.length; - while ( j-- ) { - handleObj = handlers[ j ]; - - if ( ( mappedTypes || origType === handleObj.origType ) && - ( !handler || handler.guid === handleObj.guid ) && - ( !tmp || tmp.test( handleObj.namespace ) ) && - ( !selector || selector === handleObj.selector || - selector === "**" && handleObj.selector ) ) { - handlers.splice( j, 1 ); - - if ( handleObj.selector ) { - handlers.delegateCount--; - } - if ( special.remove ) { - special.remove.call( elem, handleObj ); - } - } - } - - // Remove generic event handler if we removed something and no more handlers exist - // (avoids potential for endless recursion during removal of special event handlers) - if ( origCount && !handlers.length ) { - if ( !special.teardown || - special.teardown.call( elem, namespaces, elemData.handle ) === false ) { - - jQuery.removeEvent( elem, type, elemData.handle ); - } - - delete events[ type ]; - } - } - - // Remove data and the expando if it's no longer used - if ( jQuery.isEmptyObject( events ) ) { - dataPriv.remove( elem, "handle events" ); - } - }, - - dispatch: function( nativeEvent ) { - - 
var i, j, ret, matched, handleObj, handlerQueue, - args = new Array( arguments.length ), - - // Make a writable jQuery.Event from the native event object - event = jQuery.event.fix( nativeEvent ), - - handlers = ( - dataPriv.get( this, "events" ) || Object.create( null ) - )[ event.type ] || [], - special = jQuery.event.special[ event.type ] || {}; - - // Use the fix-ed jQuery.Event rather than the (read-only) native event - args[ 0 ] = event; - - for ( i = 1; i < arguments.length; i++ ) { - args[ i ] = arguments[ i ]; - } - - event.delegateTarget = this; - - // Call the preDispatch hook for the mapped type, and let it bail if desired - if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { - return; - } - - // Determine handlers - handlerQueue = jQuery.event.handlers.call( this, event, handlers ); - - // Run delegates first; they may want to stop propagation beneath us - i = 0; - while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { - event.currentTarget = matched.elem; - - j = 0; - while ( ( handleObj = matched.handlers[ j++ ] ) && - !event.isImmediatePropagationStopped() ) { - - // If the event is namespaced, then each handler is only invoked if it is - // specially universal or its namespaces are a superset of the event's. 
- if ( !event.rnamespace || handleObj.namespace === false || - event.rnamespace.test( handleObj.namespace ) ) { - - event.handleObj = handleObj; - event.data = handleObj.data; - - ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || - handleObj.handler ).apply( matched.elem, args ); - - if ( ret !== undefined ) { - if ( ( event.result = ret ) === false ) { - event.preventDefault(); - event.stopPropagation(); - } - } - } - } - } - - // Call the postDispatch hook for the mapped type - if ( special.postDispatch ) { - special.postDispatch.call( this, event ); - } - - return event.result; - }, - - handlers: function( event, handlers ) { - var i, handleObj, sel, matchedHandlers, matchedSelectors, - handlerQueue = [], - delegateCount = handlers.delegateCount, - cur = event.target; - - // Find delegate handlers - if ( delegateCount && - - // Support: IE <=9 - // Black-hole SVG instance trees (trac-13180) - cur.nodeType && - - // Support: Firefox <=42 - // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) - // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click - // Support: IE 11 only - // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) - !( event.type === "click" && event.button >= 1 ) ) { - - for ( ; cur !== this; cur = cur.parentNode || this ) { - - // Don't check non-elements (#13208) - // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) - if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { - matchedHandlers = []; - matchedSelectors = {}; - for ( i = 0; i < delegateCount; i++ ) { - handleObj = handlers[ i ]; - - // Don't conflict with Object.prototype properties (#13203) - sel = handleObj.selector + " "; - - if ( matchedSelectors[ sel ] === undefined ) { - matchedSelectors[ sel ] = handleObj.needsContext ? 
- jQuery( sel, this ).index( cur ) > -1 : - jQuery.find( sel, this, null, [ cur ] ).length; - } - if ( matchedSelectors[ sel ] ) { - matchedHandlers.push( handleObj ); - } - } - if ( matchedHandlers.length ) { - handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); - } - } - } - } - - // Add the remaining (directly-bound) handlers - cur = this; - if ( delegateCount < handlers.length ) { - handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); - } - - return handlerQueue; - }, - - addProp: function( name, hook ) { - Object.defineProperty( jQuery.Event.prototype, name, { - enumerable: true, - configurable: true, - - get: isFunction( hook ) ? - function() { - if ( this.originalEvent ) { - return hook( this.originalEvent ); - } - } : - function() { - if ( this.originalEvent ) { - return this.originalEvent[ name ]; - } - }, - - set: function( value ) { - Object.defineProperty( this, name, { - enumerable: true, - configurable: true, - writable: true, - value: value - } ); - } - } ); - }, - - fix: function( originalEvent ) { - return originalEvent[ jQuery.expando ] ? - originalEvent : - new jQuery.Event( originalEvent ); - }, - - special: { - load: { - - // Prevent triggered image.load events from bubbling to window.load - noBubble: true - }, - click: { - - // Utilize native event to ensure correct state for checkable inputs - setup: function( data ) { - - // For mutual compressibility with _default, replace `this` access with a local var. - // `|| data` is dead code meant only to preserve the variable through minification. - var el = this || data; - - // Claim the first handler - if ( rcheckableType.test( el.type ) && - el.click && nodeName( el, "input" ) ) { - - // dataPriv.set( el, "click", ... 
) - leverageNative( el, "click", returnTrue ); - } - - // Return false to allow normal processing in the caller - return false; - }, - trigger: function( data ) { - - // For mutual compressibility with _default, replace `this` access with a local var. - // `|| data` is dead code meant only to preserve the variable through minification. - var el = this || data; - - // Force setup before triggering a click - if ( rcheckableType.test( el.type ) && - el.click && nodeName( el, "input" ) ) { - - leverageNative( el, "click" ); - } - - // Return non-false to allow normal event-path propagation - return true; - }, - - // For cross-browser consistency, suppress native .click() on links - // Also prevent it if we're currently inside a leveraged native-event stack - _default: function( event ) { - var target = event.target; - return rcheckableType.test( target.type ) && - target.click && nodeName( target, "input" ) && - dataPriv.get( target, "click" ) || - nodeName( target, "a" ); - } - }, - - beforeunload: { - postDispatch: function( event ) { - - // Support: Firefox 20+ - // Firefox doesn't alert if the returnValue field is not set. - if ( event.result !== undefined && event.originalEvent ) { - event.originalEvent.returnValue = event.result; - } - } - } - } -}; - -// Ensure the presence of an event listener that handles manually-triggered -// synthetic events by interrupting progress until reinvoked in response to -// *native* events that it fires directly, ensuring that state changes have -// already occurred before other listeners are invoked. 
-function leverageNative( el, type, expectSync ) { - - // Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add - if ( !expectSync ) { - if ( dataPriv.get( el, type ) === undefined ) { - jQuery.event.add( el, type, returnTrue ); - } - return; - } - - // Register the controller as a special universal handler for all event namespaces - dataPriv.set( el, type, false ); - jQuery.event.add( el, type, { - namespace: false, - handler: function( event ) { - var notAsync, result, - saved = dataPriv.get( this, type ); - - if ( ( event.isTrigger & 1 ) && this[ type ] ) { - - // Interrupt processing of the outer synthetic .trigger()ed event - // Saved data should be false in such cases, but might be a leftover capture object - // from an async native handler (gh-4350) - if ( !saved.length ) { - - // Store arguments for use when handling the inner native event - // There will always be at least one argument (an event object), so this array - // will not be confused with a leftover capture object. - saved = slice.call( arguments ); - dataPriv.set( this, type, saved ); - - // Trigger the native event and capture its result - // Support: IE <=9 - 11+ - // focus() and blur() are asynchronous - notAsync = expectSync( this, type ); - this[ type ](); - result = dataPriv.get( this, type ); - if ( saved !== result || notAsync ) { - dataPriv.set( this, type, false ); - } else { - result = {}; - } - if ( saved !== result ) { - - // Cancel the outer synthetic event - event.stopImmediatePropagation(); - event.preventDefault(); - - // Support: Chrome 86+ - // In Chrome, if an element having a focusout handler is blurred by - // clicking outside of it, it invokes the handler synchronously. If - // that handler calls `.remove()` on the element, the data is cleared, - // leaving `result` undefined. We need to guard against this. 
- return result && result.value; - } - - // If this is an inner synthetic event for an event with a bubbling surrogate - // (focus or blur), assume that the surrogate already propagated from triggering the - // native event and prevent that from happening again here. - // This technically gets the ordering wrong w.r.t. to `.trigger()` (in which the - // bubbling surrogate propagates *after* the non-bubbling base), but that seems - // less bad than duplication. - } else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) { - event.stopPropagation(); - } - - // If this is a native event triggered above, everything is now in order - // Fire an inner synthetic event with the original arguments - } else if ( saved.length ) { - - // ...and capture the result - dataPriv.set( this, type, { - value: jQuery.event.trigger( - - // Support: IE <=9 - 11+ - // Extend with the prototype to reset the above stopImmediatePropagation() - jQuery.extend( saved[ 0 ], jQuery.Event.prototype ), - saved.slice( 1 ), - this - ) - } ); - - // Abort handling of the native event - event.stopImmediatePropagation(); - } - } - } ); -} - -jQuery.removeEvent = function( elem, type, handle ) { - - // This "if" is needed for plain objects - if ( elem.removeEventListener ) { - elem.removeEventListener( type, handle ); - } -}; - -jQuery.Event = function( src, props ) { - - // Allow instantiation without the 'new' keyword - if ( !( this instanceof jQuery.Event ) ) { - return new jQuery.Event( src, props ); - } - - // Event object - if ( src && src.type ) { - this.originalEvent = src; - this.type = src.type; - - // Events bubbling up the document may have been marked as prevented - // by a handler lower down the tree; reflect the correct value. - this.isDefaultPrevented = src.defaultPrevented || - src.defaultPrevented === undefined && - - // Support: Android <=2.3 only - src.returnValue === false ? 
- returnTrue : - returnFalse; - - // Create target properties - // Support: Safari <=6 - 7 only - // Target should not be a text node (#504, #13143) - this.target = ( src.target && src.target.nodeType === 3 ) ? - src.target.parentNode : - src.target; - - this.currentTarget = src.currentTarget; - this.relatedTarget = src.relatedTarget; - - // Event type - } else { - this.type = src; - } - - // Put explicitly provided properties onto the event object - if ( props ) { - jQuery.extend( this, props ); - } - - // Create a timestamp if incoming event doesn't have one - this.timeStamp = src && src.timeStamp || Date.now(); - - // Mark it as fixed - this[ jQuery.expando ] = true; -}; - -// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding -// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html -jQuery.Event.prototype = { - constructor: jQuery.Event, - isDefaultPrevented: returnFalse, - isPropagationStopped: returnFalse, - isImmediatePropagationStopped: returnFalse, - isSimulated: false, - - preventDefault: function() { - var e = this.originalEvent; - - this.isDefaultPrevented = returnTrue; - - if ( e && !this.isSimulated ) { - e.preventDefault(); - } - }, - stopPropagation: function() { - var e = this.originalEvent; - - this.isPropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopPropagation(); - } - }, - stopImmediatePropagation: function() { - var e = this.originalEvent; - - this.isImmediatePropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopImmediatePropagation(); - } - - this.stopPropagation(); - } -}; - -// Includes all common event props including KeyEvent and MouseEvent specific props -jQuery.each( { - altKey: true, - bubbles: true, - cancelable: true, - changedTouches: true, - ctrlKey: true, - detail: true, - eventPhase: true, - metaKey: true, - pageX: true, - pageY: true, - shiftKey: true, - view: true, - "char": true, - code: true, - charCode: true, - 
key: true, - keyCode: true, - button: true, - buttons: true, - clientX: true, - clientY: true, - offsetX: true, - offsetY: true, - pointerId: true, - pointerType: true, - screenX: true, - screenY: true, - targetTouches: true, - toElement: true, - touches: true, - which: true -}, jQuery.event.addProp ); - -jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) { - jQuery.event.special[ type ] = { - - // Utilize native event if possible so blur/focus sequence is correct - setup: function() { - - // Claim the first handler - // dataPriv.set( this, "focus", ... ) - // dataPriv.set( this, "blur", ... ) - leverageNative( this, type, expectSync ); - - // Return false to allow normal processing in the caller - return false; - }, - trigger: function() { - - // Force setup before trigger - leverageNative( this, type ); - - // Return non-false to allow normal event-path propagation - return true; - }, - - // Suppress native focus or blur as it's already being fired - // in leverageNative. - _default: function() { - return true; - }, - - delegateType: delegateType - }; -} ); - -// Create mouseenter/leave events using mouseover/out and event-time checks -// so that event delegation works in jQuery. -// Do the same for pointerenter/pointerleave and pointerover/pointerout -// -// Support: Safari 7 only -// Safari sends mouseenter too often; see: -// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 -// for the description of the bug (it existed in older Chrome versions as well). -jQuery.each( { - mouseenter: "mouseover", - mouseleave: "mouseout", - pointerenter: "pointerover", - pointerleave: "pointerout" -}, function( orig, fix ) { - jQuery.event.special[ orig ] = { - delegateType: fix, - bindType: fix, - - handle: function( event ) { - var ret, - target = this, - related = event.relatedTarget, - handleObj = event.handleObj; - - // For mouseenter/leave call the handler if related is outside the target. 
- // NB: No relatedTarget if the mouse left/entered the browser window - if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { - event.type = handleObj.origType; - ret = handleObj.handler.apply( this, arguments ); - event.type = fix; - } - return ret; - } - }; -} ); - -jQuery.fn.extend( { - - on: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn ); - }, - one: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn, 1 ); - }, - off: function( types, selector, fn ) { - var handleObj, type; - if ( types && types.preventDefault && types.handleObj ) { - - // ( event ) dispatched jQuery.Event - handleObj = types.handleObj; - jQuery( types.delegateTarget ).off( - handleObj.namespace ? - handleObj.origType + "." + handleObj.namespace : - handleObj.origType, - handleObj.selector, - handleObj.handler - ); - return this; - } - if ( typeof types === "object" ) { - - // ( types-object [, selector] ) - for ( type in types ) { - this.off( type, selector, types[ type ] ); - } - return this; - } - if ( selector === false || typeof selector === "function" ) { - - // ( types [, fn] ) - fn = selector; - selector = undefined; - } - if ( fn === false ) { - fn = returnFalse; - } - return this.each( function() { - jQuery.event.remove( this, types, fn, selector ); - } ); - } -} ); - - -var - - // Support: IE <=10 - 11, Edge 12 - 13 only - // In IE/Edge using regex groups here causes severe slowdowns. - // See https://connect.microsoft.com/IE/feedback/details/1736512/ - rnoInnerhtml = /\s*$/g; - -// Prefer a tbody over its parent table for containing new rows -function manipulationTarget( elem, content ) { - if ( nodeName( elem, "table" ) && - nodeName( content.nodeType !== 11 ? 
content : content.firstChild, "tr" ) ) { - - return jQuery( elem ).children( "tbody" )[ 0 ] || elem; - } - - return elem; -} - -// Replace/restore the type attribute of script elements for safe DOM manipulation -function disableScript( elem ) { - elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; - return elem; -} -function restoreScript( elem ) { - if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { - elem.type = elem.type.slice( 5 ); - } else { - elem.removeAttribute( "type" ); - } - - return elem; -} - -function cloneCopyEvent( src, dest ) { - var i, l, type, pdataOld, udataOld, udataCur, events; - - if ( dest.nodeType !== 1 ) { - return; - } - - // 1. Copy private data: events, handlers, etc. - if ( dataPriv.hasData( src ) ) { - pdataOld = dataPriv.get( src ); - events = pdataOld.events; - - if ( events ) { - dataPriv.remove( dest, "handle events" ); - - for ( type in events ) { - for ( i = 0, l = events[ type ].length; i < l; i++ ) { - jQuery.event.add( dest, type, events[ type ][ i ] ); - } - } - } - } - - // 2. Copy user data - if ( dataUser.hasData( src ) ) { - udataOld = dataUser.access( src ); - udataCur = jQuery.extend( {}, udataOld ); - - dataUser.set( dest, udataCur ); - } -} - -// Fix IE bugs, see support tests -function fixInput( src, dest ) { - var nodeName = dest.nodeName.toLowerCase(); - - // Fails to persist the checked state of a cloned checkbox or radio button. 
- if ( nodeName === "input" && rcheckableType.test( src.type ) ) { - dest.checked = src.checked; - - // Fails to return the selected option to the default selected state when cloning options - } else if ( nodeName === "input" || nodeName === "textarea" ) { - dest.defaultValue = src.defaultValue; - } -} - -function domManip( collection, args, callback, ignored ) { - - // Flatten any nested arrays - args = flat( args ); - - var fragment, first, scripts, hasScripts, node, doc, - i = 0, - l = collection.length, - iNoClone = l - 1, - value = args[ 0 ], - valueIsFunction = isFunction( value ); - - // We can't cloneNode fragments that contain checked, in WebKit - if ( valueIsFunction || - ( l > 1 && typeof value === "string" && - !support.checkClone && rchecked.test( value ) ) ) { - return collection.each( function( index ) { - var self = collection.eq( index ); - if ( valueIsFunction ) { - args[ 0 ] = value.call( this, index, self.html() ); - } - domManip( self, args, callback, ignored ); - } ); - } - - if ( l ) { - fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); - first = fragment.firstChild; - - if ( fragment.childNodes.length === 1 ) { - fragment = first; - } - - // Require either new content or an interest in ignored elements to invoke the callback - if ( first || ignored ) { - scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); - hasScripts = scripts.length; - - // Use the original fragment for the last item - // instead of the first because it can end up - // being emptied incorrectly in certain situations (#8070). 
- for ( ; i < l; i++ ) { - node = fragment; - - if ( i !== iNoClone ) { - node = jQuery.clone( node, true, true ); - - // Keep references to cloned scripts for later restoration - if ( hasScripts ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( scripts, getAll( node, "script" ) ); - } - } - - callback.call( collection[ i ], node, i ); - } - - if ( hasScripts ) { - doc = scripts[ scripts.length - 1 ].ownerDocument; - - // Reenable scripts - jQuery.map( scripts, restoreScript ); - - // Evaluate executable scripts on first document insertion - for ( i = 0; i < hasScripts; i++ ) { - node = scripts[ i ]; - if ( rscriptType.test( node.type || "" ) && - !dataPriv.access( node, "globalEval" ) && - jQuery.contains( doc, node ) ) { - - if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { - - // Optional AJAX dependency, but won't run scripts if not present - if ( jQuery._evalUrl && !node.noModule ) { - jQuery._evalUrl( node.src, { - nonce: node.nonce || node.getAttribute( "nonce" ) - }, doc ); - } - } else { - DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); - } - } - } - } - } - } - - return collection; -} - -function remove( elem, selector, keepData ) { - var node, - nodes = selector ? 
jQuery.filter( selector, elem ) : elem, - i = 0; - - for ( ; ( node = nodes[ i ] ) != null; i++ ) { - if ( !keepData && node.nodeType === 1 ) { - jQuery.cleanData( getAll( node ) ); - } - - if ( node.parentNode ) { - if ( keepData && isAttached( node ) ) { - setGlobalEval( getAll( node, "script" ) ); - } - node.parentNode.removeChild( node ); - } - } - - return elem; -} - -jQuery.extend( { - htmlPrefilter: function( html ) { - return html; - }, - - clone: function( elem, dataAndEvents, deepDataAndEvents ) { - var i, l, srcElements, destElements, - clone = elem.cloneNode( true ), - inPage = isAttached( elem ); - - // Fix IE cloning issues - if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && - !jQuery.isXMLDoc( elem ) ) { - - // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 - destElements = getAll( clone ); - srcElements = getAll( elem ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - fixInput( srcElements[ i ], destElements[ i ] ); - } - } - - // Copy the events from the original to the clone - if ( dataAndEvents ) { - if ( deepDataAndEvents ) { - srcElements = srcElements || getAll( elem ); - destElements = destElements || getAll( clone ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - cloneCopyEvent( srcElements[ i ], destElements[ i ] ); - } - } else { - cloneCopyEvent( elem, clone ); - } - } - - // Preserve script evaluation history - destElements = getAll( clone, "script" ); - if ( destElements.length > 0 ) { - setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); - } - - // Return the cloned set - return clone; - }, - - cleanData: function( elems ) { - var data, elem, type, - special = jQuery.event.special, - i = 0; - - for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { - if ( acceptData( elem ) ) { - if ( ( data = elem[ dataPriv.expando ] ) ) { - if ( data.events ) { - for ( type in data.events ) { - if ( special[ type ] ) { - jQuery.event.remove( 
elem, type ); - - // This is a shortcut to avoid jQuery.event.remove's overhead - } else { - jQuery.removeEvent( elem, type, data.handle ); - } - } - } - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataPriv.expando ] = undefined; - } - if ( elem[ dataUser.expando ] ) { - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataUser.expando ] = undefined; - } - } - } - } -} ); - -jQuery.fn.extend( { - detach: function( selector ) { - return remove( this, selector, true ); - }, - - remove: function( selector ) { - return remove( this, selector ); - }, - - text: function( value ) { - return access( this, function( value ) { - return value === undefined ? - jQuery.text( this ) : - this.empty().each( function() { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - this.textContent = value; - } - } ); - }, null, value, arguments.length ); - }, - - append: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.appendChild( elem ); - } - } ); - }, - - prepend: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.insertBefore( elem, target.firstChild ); - } - } ); - }, - - before: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this ); - } - } ); - }, - - after: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this.nextSibling ); - } - } ); - }, - - empty: function() { - var elem, - i = 0; - - for ( ; ( elem = this[ i ] ) != null; i++ ) { - if ( elem.nodeType === 1 ) { - - // 
Prevent memory leaks - jQuery.cleanData( getAll( elem, false ) ); - - // Remove any remaining nodes - elem.textContent = ""; - } - } - - return this; - }, - - clone: function( dataAndEvents, deepDataAndEvents ) { - dataAndEvents = dataAndEvents == null ? false : dataAndEvents; - deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; - - return this.map( function() { - return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); - } ); - }, - - html: function( value ) { - return access( this, function( value ) { - var elem = this[ 0 ] || {}, - i = 0, - l = this.length; - - if ( value === undefined && elem.nodeType === 1 ) { - return elem.innerHTML; - } - - // See if we can take a shortcut and just use innerHTML - if ( typeof value === "string" && !rnoInnerhtml.test( value ) && - !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { - - value = jQuery.htmlPrefilter( value ); - - try { - for ( ; i < l; i++ ) { - elem = this[ i ] || {}; - - // Remove element nodes and prevent memory leaks - if ( elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem, false ) ); - elem.innerHTML = value; - } - } - - elem = 0; - - // If using innerHTML throws an exception, use the fallback method - } catch ( e ) {} - } - - if ( elem ) { - this.empty().append( value ); - } - }, null, value, arguments.length ); - }, - - replaceWith: function() { - var ignored = []; - - // Make the changes, replacing each non-ignored context element with the new content - return domManip( this, arguments, function( elem ) { - var parent = this.parentNode; - - if ( jQuery.inArray( this, ignored ) < 0 ) { - jQuery.cleanData( getAll( this ) ); - if ( parent ) { - parent.replaceChild( elem, this ); - } - } - - // Force callback invocation - }, ignored ); - } -} ); - -jQuery.each( { - appendTo: "append", - prependTo: "prepend", - insertBefore: "before", - insertAfter: "after", - replaceAll: "replaceWith" -}, function( name, original ) { - jQuery.fn[ name ] = 
function( selector ) { - var elems, - ret = [], - insert = jQuery( selector ), - last = insert.length - 1, - i = 0; - - for ( ; i <= last; i++ ) { - elems = i === last ? this : this.clone( true ); - jQuery( insert[ i ] )[ original ]( elems ); - - // Support: Android <=4.0 only, PhantomJS 1 only - // .get() because push.apply(_, arraylike) throws on ancient WebKit - push.apply( ret, elems.get() ); - } - - return this.pushStack( ret ); - }; -} ); -var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); - -var getStyles = function( elem ) { - - // Support: IE <=11 only, Firefox <=30 (#15098, #14150) - // IE throws on elements created in popups - // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" - var view = elem.ownerDocument.defaultView; - - if ( !view || !view.opener ) { - view = window; - } - - return view.getComputedStyle( elem ); - }; - -var swap = function( elem, options, callback ) { - var ret, name, - old = {}; - - // Remember the old values, and insert the new ones - for ( name in options ) { - old[ name ] = elem.style[ name ]; - elem.style[ name ] = options[ name ]; - } - - ret = callback.call( elem ); - - // Revert the old values - for ( name in options ) { - elem.style[ name ] = old[ name ]; - } - - return ret; -}; - - -var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); - - - -( function() { - - // Executing both pixelPosition & boxSizingReliable tests require only one layout - // so they're executed at the same time to save the second computation. 
- function computeStyleTests() { - - // This is a singleton, we need to execute it only once - if ( !div ) { - return; - } - - container.style.cssText = "position:absolute;left:-11111px;width:60px;" + - "margin-top:1px;padding:0;border:0"; - div.style.cssText = - "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + - "margin:auto;border:1px;padding:1px;" + - "width:60%;top:1%"; - documentElement.appendChild( container ).appendChild( div ); - - var divStyle = window.getComputedStyle( div ); - pixelPositionVal = divStyle.top !== "1%"; - - // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 - reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; - - // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 - // Some styles come back with percentage values, even though they shouldn't - div.style.right = "60%"; - pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; - - // Support: IE 9 - 11 only - // Detect misreporting of content dimensions for box-sizing:border-box elements - boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; - - // Support: IE 9 only - // Detect overflow:scroll screwiness (gh-3699) - // Support: Chrome <=64 - // Don't get tricked when zoom affects offsetWidth (gh-4029) - div.style.position = "absolute"; - scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; - - documentElement.removeChild( container ); - - // Nullify the div so it wouldn't be stored in the memory and - // it will also be a sign that checks already performed - div = null; - } - - function roundPixelMeasures( measure ) { - return Math.round( parseFloat( measure ) ); - } - - var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, - reliableTrDimensionsVal, reliableMarginLeftVal, - container = document.createElement( "div" ), - div = document.createElement( "div" ); - - // Finish early in limited (non-browser) environments - if ( !div.style ) { - return; - } - - 
// Support: IE <=9 - 11 only - // Style of cloned element affects source element cloned (#8908) - div.style.backgroundClip = "content-box"; - div.cloneNode( true ).style.backgroundClip = ""; - support.clearCloneStyle = div.style.backgroundClip === "content-box"; - - jQuery.extend( support, { - boxSizingReliable: function() { - computeStyleTests(); - return boxSizingReliableVal; - }, - pixelBoxStyles: function() { - computeStyleTests(); - return pixelBoxStylesVal; - }, - pixelPosition: function() { - computeStyleTests(); - return pixelPositionVal; - }, - reliableMarginLeft: function() { - computeStyleTests(); - return reliableMarginLeftVal; - }, - scrollboxSize: function() { - computeStyleTests(); - return scrollboxSizeVal; - }, - - // Support: IE 9 - 11+, Edge 15 - 18+ - // IE/Edge misreport `getComputedStyle` of table rows with width/height - // set in CSS while `offset*` properties report correct values. - // Behavior in IE 9 is more subtle than in newer versions & it passes - // some versions of this test; make sure not to make it pass there! - // - // Support: Firefox 70+ - // Only Firefox includes border widths - // in computed dimensions. (gh-4529) - reliableTrDimensions: function() { - var table, tr, trChild, trStyle; - if ( reliableTrDimensionsVal == null ) { - table = document.createElement( "table" ); - tr = document.createElement( "tr" ); - trChild = document.createElement( "div" ); - - table.style.cssText = "position:absolute;left:-11111px;border-collapse:separate"; - tr.style.cssText = "border:1px solid"; - - // Support: Chrome 86+ - // Height set through cssText does not get applied. - // Computed height then comes back as 0. - tr.style.height = "1px"; - trChild.style.height = "9px"; - - // Support: Android 8 Chrome 86+ - // In our bodyBackground.html iframe, - // display for all div elements is set to "inline", - // which causes a problem only in Android 8 Chrome 86. - // Ensuring the div is display: block - // gets around this issue. 
- trChild.style.display = "block"; - - documentElement - .appendChild( table ) - .appendChild( tr ) - .appendChild( trChild ); - - trStyle = window.getComputedStyle( tr ); - reliableTrDimensionsVal = ( parseInt( trStyle.height, 10 ) + - parseInt( trStyle.borderTopWidth, 10 ) + - parseInt( trStyle.borderBottomWidth, 10 ) ) === tr.offsetHeight; - - documentElement.removeChild( table ); - } - return reliableTrDimensionsVal; - } - } ); -} )(); - - -function curCSS( elem, name, computed ) { - var width, minWidth, maxWidth, ret, - - // Support: Firefox 51+ - // Retrieving style before computed somehow - // fixes an issue with getting wrong values - // on detached elements - style = elem.style; - - computed = computed || getStyles( elem ); - - // getPropertyValue is needed for: - // .css('filter') (IE 9 only, #12537) - // .css('--customProperty) (#3144) - if ( computed ) { - ret = computed.getPropertyValue( name ) || computed[ name ]; - - if ( ret === "" && !isAttached( elem ) ) { - ret = jQuery.style( elem, name ); - } - - // A tribute to the "awesome hack by Dean Edwards" - // Android Browser returns percentage for some values, - // but width seems to be reliably pixels. - // This is against the CSSOM draft spec: - // https://drafts.csswg.org/cssom/#resolved-values - if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { - - // Remember the original values - width = style.width; - minWidth = style.minWidth; - maxWidth = style.maxWidth; - - // Put in the new values to get a computed value out - style.minWidth = style.maxWidth = style.width = ret; - ret = computed.width; - - // Revert the changed values - style.width = width; - style.minWidth = minWidth; - style.maxWidth = maxWidth; - } - } - - return ret !== undefined ? - - // Support: IE <=9 - 11 only - // IE returns zIndex value as an integer. 
- ret + "" : - ret; -} - - -function addGetHookIf( conditionFn, hookFn ) { - - // Define the hook, we'll check on the first run if it's really needed. - return { - get: function() { - if ( conditionFn() ) { - - // Hook not needed (or it's not possible to use it due - // to missing dependency), remove it. - delete this.get; - return; - } - - // Hook needed; redefine it so that the support test is not executed again. - return ( this.get = hookFn ).apply( this, arguments ); - } - }; -} - - -var cssPrefixes = [ "Webkit", "Moz", "ms" ], - emptyStyle = document.createElement( "div" ).style, - vendorProps = {}; - -// Return a vendor-prefixed property or undefined -function vendorPropName( name ) { - - // Check for vendor prefixed names - var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), - i = cssPrefixes.length; - - while ( i-- ) { - name = cssPrefixes[ i ] + capName; - if ( name in emptyStyle ) { - return name; - } - } -} - -// Return a potentially-mapped jQuery.cssProps or vendor prefixed property -function finalPropName( name ) { - var final = jQuery.cssProps[ name ] || vendorProps[ name ]; - - if ( final ) { - return final; - } - if ( name in emptyStyle ) { - return name; - } - return vendorProps[ name ] = vendorPropName( name ) || name; -} - - -var - - // Swappable if display is none or starts with table - // except "table", "table-cell", or "table-caption" - // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display - rdisplayswap = /^(none|table(?!-c[ea]).+)/, - rcustomProp = /^--/, - cssShow = { position: "absolute", visibility: "hidden", display: "block" }, - cssNormalTransform = { - letterSpacing: "0", - fontWeight: "400" - }; - -function setPositiveNumber( _elem, value, subtract ) { - - // Any relative (+/-) values have already been - // normalized at this point - var matches = rcssNum.exec( value ); - return matches ? 
- - // Guard against undefined "subtract", e.g., when used as in cssHooks - Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : - value; -} - -function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { - var i = dimension === "width" ? 1 : 0, - extra = 0, - delta = 0; - - // Adjustment may not be necessary - if ( box === ( isBorderBox ? "border" : "content" ) ) { - return 0; - } - - for ( ; i < 4; i += 2 ) { - - // Both box models exclude margin - if ( box === "margin" ) { - delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); - } - - // If we get here with a content-box, we're seeking "padding" or "border" or "margin" - if ( !isBorderBox ) { - - // Add padding - delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - - // For "border" or "margin", add border - if ( box !== "padding" ) { - delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - - // But still keep track of it otherwise - } else { - extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - - // If we get here with a border-box (content + padding + border), we're seeking "content" or - // "padding" or "margin" - } else { - - // For "content", subtract padding - if ( box === "content" ) { - delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - } - - // For "content" or "padding", subtract border - if ( box !== "margin" ) { - delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - } - } - - // Account for positive content-box scroll gutter when requested by providing computedVal - if ( !isBorderBox && computedVal >= 0 ) { - - // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border - // Assuming integer scroll gutter, subtract the rest and round down - delta += Math.max( 0, Math.ceil( - elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - - computedVal - - delta - - 
extra - - 0.5 - - // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter - // Use an explicit zero to avoid NaN (gh-3964) - ) ) || 0; - } - - return delta; -} - -function getWidthOrHeight( elem, dimension, extra ) { - - // Start with computed style - var styles = getStyles( elem ), - - // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). - // Fake content-box until we know it's needed to know the true value. - boxSizingNeeded = !support.boxSizingReliable() || extra, - isBorderBox = boxSizingNeeded && - jQuery.css( elem, "boxSizing", false, styles ) === "border-box", - valueIsBorderBox = isBorderBox, - - val = curCSS( elem, dimension, styles ), - offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); - - // Support: Firefox <=54 - // Return a confounding non-pixel value or feign ignorance, as appropriate. - if ( rnumnonpx.test( val ) ) { - if ( !extra ) { - return val; - } - val = "auto"; - } - - - // Support: IE 9 - 11 only - // Use offsetWidth/offsetHeight for when box sizing is unreliable. - // In those cases, the computed value can be trusted to be border-box. - if ( ( !support.boxSizingReliable() && isBorderBox || - - // Support: IE 10 - 11+, Edge 15 - 18+ - // IE/Edge misreport `getComputedStyle` of table rows with width/height - // set in CSS while `offset*` properties report correct values. - // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
- !support.reliableTrDimensions() && nodeName( elem, "tr" ) || - - // Fall back to offsetWidth/offsetHeight when value is "auto" - // This happens for inline elements with no explicit setting (gh-3571) - val === "auto" || - - // Support: Android <=4.1 - 4.3 only - // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) - !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && - - // Make sure the element is visible & connected - elem.getClientRects().length ) { - - isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; - - // Where available, offsetWidth/offsetHeight approximate border box dimensions. - // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the - // retrieved value as a content box dimension. - valueIsBorderBox = offsetProp in elem; - if ( valueIsBorderBox ) { - val = elem[ offsetProp ]; - } - } - - // Normalize "" and auto - val = parseFloat( val ) || 0; - - // Adjust for the element's box model - return ( val + - boxModelAdjustment( - elem, - dimension, - extra || ( isBorderBox ? "border" : "content" ), - valueIsBorderBox, - styles, - - // Provide the current computed size to request scroll gutter calculation (gh-3589) - val - ) - ) + "px"; -} - -jQuery.extend( { - - // Add in style property hooks for overriding the default - // behavior of getting and setting a style property - cssHooks: { - opacity: { - get: function( elem, computed ) { - if ( computed ) { - - // We should always get a number back from opacity - var ret = curCSS( elem, "opacity" ); - return ret === "" ? 
"1" : ret; - } - } - } - }, - - // Don't automatically add "px" to these possibly-unitless properties - cssNumber: { - "animationIterationCount": true, - "columnCount": true, - "fillOpacity": true, - "flexGrow": true, - "flexShrink": true, - "fontWeight": true, - "gridArea": true, - "gridColumn": true, - "gridColumnEnd": true, - "gridColumnStart": true, - "gridRow": true, - "gridRowEnd": true, - "gridRowStart": true, - "lineHeight": true, - "opacity": true, - "order": true, - "orphans": true, - "widows": true, - "zIndex": true, - "zoom": true - }, - - // Add in properties whose names you wish to fix before - // setting or getting the value - cssProps: {}, - - // Get and set the style property on a DOM Node - style: function( elem, name, value, extra ) { - - // Don't set styles on text and comment nodes - if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { - return; - } - - // Make sure that we're working with the right name - var ret, type, hooks, - origName = camelCase( name ), - isCustomProp = rcustomProp.test( name ), - style = elem.style; - - // Make sure that we're working with the right name. We don't - // want to query the value if it is a CSS custom property - // since they are user-defined. 
- if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Gets hook for the prefixed version, then unprefixed version - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // Check if we're setting a value - if ( value !== undefined ) { - type = typeof value; - - // Convert "+=" or "-=" to relative numbers (#7345) - if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { - value = adjustCSS( elem, name, ret ); - - // Fixes bug #9237 - type = "number"; - } - - // Make sure that null and NaN values aren't set (#7116) - if ( value == null || value !== value ) { - return; - } - - // If a number was passed in, add the unit (except for certain CSS properties) - // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append - // "px" to a few hardcoded values. - if ( type === "number" && !isCustomProp ) { - value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); - } - - // background-* props affect original clone's values - if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { - style[ name ] = "inherit"; - } - - // If a hook was provided, use that value, otherwise just set the specified value - if ( !hooks || !( "set" in hooks ) || - ( value = hooks.set( elem, value, extra ) ) !== undefined ) { - - if ( isCustomProp ) { - style.setProperty( name, value ); - } else { - style[ name ] = value; - } - } - - } else { - - // If a hook was provided get the non-computed value from there - if ( hooks && "get" in hooks && - ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { - - return ret; - } - - // Otherwise just get the value from the style object - return style[ name ]; - } - }, - - css: function( elem, name, extra, styles ) { - var val, num, hooks, - origName = camelCase( name ), - isCustomProp = rcustomProp.test( name ); - - // Make sure that we're working with the right name. 
We don't - // want to modify the value if it is a CSS custom property - // since they are user-defined. - if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Try prefixed name followed by the unprefixed name - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // If a hook was provided get the computed value from there - if ( hooks && "get" in hooks ) { - val = hooks.get( elem, true, extra ); - } - - // Otherwise, if a way to get the computed value exists, use that - if ( val === undefined ) { - val = curCSS( elem, name, styles ); - } - - // Convert "normal" to computed value - if ( val === "normal" && name in cssNormalTransform ) { - val = cssNormalTransform[ name ]; - } - - // Make numeric if forced or a qualifier was provided and val looks numeric - if ( extra === "" || extra ) { - num = parseFloat( val ); - return extra === true || isFinite( num ) ? num || 0 : val; - } - - return val; - } -} ); - -jQuery.each( [ "height", "width" ], function( _i, dimension ) { - jQuery.cssHooks[ dimension ] = { - get: function( elem, computed, extra ) { - if ( computed ) { - - // Certain elements can have dimension info if we invisibly show them - // but it must have a current display style that would benefit - return rdisplayswap.test( jQuery.css( elem, "display" ) ) && - - // Support: Safari 8+ - // Table columns in Safari have non-zero offsetWidth & zero - // getBoundingClientRect().width unless display is changed. - // Support: IE <=11 only - // Running getBoundingClientRect on a disconnected node - // in IE throws an error. - ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? - swap( elem, cssShow, function() { - return getWidthOrHeight( elem, dimension, extra ); - } ) : - getWidthOrHeight( elem, dimension, extra ); - } - }, - - set: function( elem, value, extra ) { - var matches, - styles = getStyles( elem ), - - // Only read styles.position if the test has a chance to fail - // to avoid forcing a reflow. 
- scrollboxSizeBuggy = !support.scrollboxSize() && - styles.position === "absolute", - - // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) - boxSizingNeeded = scrollboxSizeBuggy || extra, - isBorderBox = boxSizingNeeded && - jQuery.css( elem, "boxSizing", false, styles ) === "border-box", - subtract = extra ? - boxModelAdjustment( - elem, - dimension, - extra, - isBorderBox, - styles - ) : - 0; - - // Account for unreliable border-box dimensions by comparing offset* to computed and - // faking a content-box to get border and padding (gh-3699) - if ( isBorderBox && scrollboxSizeBuggy ) { - subtract -= Math.ceil( - elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - - parseFloat( styles[ dimension ] ) - - boxModelAdjustment( elem, dimension, "border", false, styles ) - - 0.5 - ); - } - - // Convert to pixels if value adjustment is needed - if ( subtract && ( matches = rcssNum.exec( value ) ) && - ( matches[ 3 ] || "px" ) !== "px" ) { - - elem.style[ dimension ] = value; - value = jQuery.css( elem, dimension ); - } - - return setPositiveNumber( elem, value, subtract ); - } - }; -} ); - -jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, - function( elem, computed ) { - if ( computed ) { - return ( parseFloat( curCSS( elem, "marginLeft" ) ) || - elem.getBoundingClientRect().left - - swap( elem, { marginLeft: 0 }, function() { - return elem.getBoundingClientRect().left; - } ) - ) + "px"; - } - } -); - -// These hooks are used by animate to expand properties -jQuery.each( { - margin: "", - padding: "", - border: "Width" -}, function( prefix, suffix ) { - jQuery.cssHooks[ prefix + suffix ] = { - expand: function( value ) { - var i = 0, - expanded = {}, - - // Assumes a single number if not a string - parts = typeof value === "string" ? 
value.split( " " ) : [ value ]; - - for ( ; i < 4; i++ ) { - expanded[ prefix + cssExpand[ i ] + suffix ] = - parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; - } - - return expanded; - } - }; - - if ( prefix !== "margin" ) { - jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; - } -} ); - -jQuery.fn.extend( { - css: function( name, value ) { - return access( this, function( elem, name, value ) { - var styles, len, - map = {}, - i = 0; - - if ( Array.isArray( name ) ) { - styles = getStyles( elem ); - len = name.length; - - for ( ; i < len; i++ ) { - map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); - } - - return map; - } - - return value !== undefined ? - jQuery.style( elem, name, value ) : - jQuery.css( elem, name ); - }, name, value, arguments.length > 1 ); - } -} ); - - -function Tween( elem, options, prop, end, easing ) { - return new Tween.prototype.init( elem, options, prop, end, easing ); -} -jQuery.Tween = Tween; - -Tween.prototype = { - constructor: Tween, - init: function( elem, options, prop, end, easing, unit ) { - this.elem = elem; - this.prop = prop; - this.easing = easing || jQuery.easing._default; - this.options = options; - this.start = this.now = this.cur(); - this.end = end; - this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" ); - }, - cur: function() { - var hooks = Tween.propHooks[ this.prop ]; - - return hooks && hooks.get ? 
- hooks.get( this ) : - Tween.propHooks._default.get( this ); - }, - run: function( percent ) { - var eased, - hooks = Tween.propHooks[ this.prop ]; - - if ( this.options.duration ) { - this.pos = eased = jQuery.easing[ this.easing ]( - percent, this.options.duration * percent, 0, 1, this.options.duration - ); - } else { - this.pos = eased = percent; - } - this.now = ( this.end - this.start ) * eased + this.start; - - if ( this.options.step ) { - this.options.step.call( this.elem, this.now, this ); - } - - if ( hooks && hooks.set ) { - hooks.set( this ); - } else { - Tween.propHooks._default.set( this ); - } - return this; - } -}; - -Tween.prototype.init.prototype = Tween.prototype; - -Tween.propHooks = { - _default: { - get: function( tween ) { - var result; - - // Use a property on the element directly when it is not a DOM element, - // or when there is no matching style property that exists. - if ( tween.elem.nodeType !== 1 || - tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { - return tween.elem[ tween.prop ]; - } - - // Passing an empty string as a 3rd parameter to .css will automatically - // attempt a parseFloat and fallback to a string if the parse fails. - // Simple values such as "10px" are parsed to Float; - // complex values such as "rotate(1rad)" are returned as-is. - result = jQuery.css( tween.elem, tween.prop, "" ); - - // Empty strings, null, undefined and "auto" are converted to 0. - return !result || result === "auto" ? 0 : result; - }, - set: function( tween ) { - - // Use step hook for back compat. - // Use cssHook if its there. - // Use .style if available and use plain properties where available. 
- if ( jQuery.fx.step[ tween.prop ] ) { - jQuery.fx.step[ tween.prop ]( tween ); - } else if ( tween.elem.nodeType === 1 && ( - jQuery.cssHooks[ tween.prop ] || - tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { - jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); - } else { - tween.elem[ tween.prop ] = tween.now; - } - } - } -}; - -// Support: IE <=9 only -// Panic based approach to setting things on disconnected nodes -Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { - set: function( tween ) { - if ( tween.elem.nodeType && tween.elem.parentNode ) { - tween.elem[ tween.prop ] = tween.now; - } - } -}; - -jQuery.easing = { - linear: function( p ) { - return p; - }, - swing: function( p ) { - return 0.5 - Math.cos( p * Math.PI ) / 2; - }, - _default: "swing" -}; - -jQuery.fx = Tween.prototype.init; - -// Back compat <1.8 extension point -jQuery.fx.step = {}; - - - - -var - fxNow, inProgress, - rfxtypes = /^(?:toggle|show|hide)$/, - rrun = /queueHooks$/; - -function schedule() { - if ( inProgress ) { - if ( document.hidden === false && window.requestAnimationFrame ) { - window.requestAnimationFrame( schedule ); - } else { - window.setTimeout( schedule, jQuery.fx.interval ); - } - - jQuery.fx.tick(); - } -} - -// Animations created synchronously will run synchronously -function createFxNow() { - window.setTimeout( function() { - fxNow = undefined; - } ); - return ( fxNow = Date.now() ); -} - -// Generate parameters to create a standard animation -function genFx( type, includeWidth ) { - var which, - i = 0, - attrs = { height: type }; - - // If we include width, step value is 1 to do all cssExpand values, - // otherwise step value is 2 to skip over Left and Right - includeWidth = includeWidth ? 
1 : 0; - for ( ; i < 4; i += 2 - includeWidth ) { - which = cssExpand[ i ]; - attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; - } - - if ( includeWidth ) { - attrs.opacity = attrs.width = type; - } - - return attrs; -} - -function createTween( value, prop, animation ) { - var tween, - collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), - index = 0, - length = collection.length; - for ( ; index < length; index++ ) { - if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { - - // We're done with this property - return tween; - } - } -} - -function defaultPrefilter( elem, props, opts ) { - var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, - isBox = "width" in props || "height" in props, - anim = this, - orig = {}, - style = elem.style, - hidden = elem.nodeType && isHiddenWithinTree( elem ), - dataShow = dataPriv.get( elem, "fxshow" ); - - // Queue-skipping animations hijack the fx hooks - if ( !opts.queue ) { - hooks = jQuery._queueHooks( elem, "fx" ); - if ( hooks.unqueued == null ) { - hooks.unqueued = 0; - oldfire = hooks.empty.fire; - hooks.empty.fire = function() { - if ( !hooks.unqueued ) { - oldfire(); - } - }; - } - hooks.unqueued++; - - anim.always( function() { - - // Ensure the complete handler is called before this completes - anim.always( function() { - hooks.unqueued--; - if ( !jQuery.queue( elem, "fx" ).length ) { - hooks.empty.fire(); - } - } ); - } ); - } - - // Detect show/hide animations - for ( prop in props ) { - value = props[ prop ]; - if ( rfxtypes.test( value ) ) { - delete props[ prop ]; - toggle = toggle || value === "toggle"; - if ( value === ( hidden ? 
"hide" : "show" ) ) { - - // Pretend to be hidden if this is a "show" and - // there is still data from a stopped show/hide - if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { - hidden = true; - - // Ignore all other no-op show/hide data - } else { - continue; - } - } - orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); - } - } - - // Bail out if this is a no-op like .hide().hide() - propTween = !jQuery.isEmptyObject( props ); - if ( !propTween && jQuery.isEmptyObject( orig ) ) { - return; - } - - // Restrict "overflow" and "display" styles during box animations - if ( isBox && elem.nodeType === 1 ) { - - // Support: IE <=9 - 11, Edge 12 - 15 - // Record all 3 overflow attributes because IE does not infer the shorthand - // from identically-valued overflowX and overflowY and Edge just mirrors - // the overflowX value there. - opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; - - // Identify a display type, preferring old show/hide data over the CSS cascade - restoreDisplay = dataShow && dataShow.display; - if ( restoreDisplay == null ) { - restoreDisplay = dataPriv.get( elem, "display" ); - } - display = jQuery.css( elem, "display" ); - if ( display === "none" ) { - if ( restoreDisplay ) { - display = restoreDisplay; - } else { - - // Get nonempty value(s) by temporarily forcing visibility - showHide( [ elem ], true ); - restoreDisplay = elem.style.display || restoreDisplay; - display = jQuery.css( elem, "display" ); - showHide( [ elem ] ); - } - } - - // Animate inline elements as inline-block - if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { - if ( jQuery.css( elem, "float" ) === "none" ) { - - // Restore the original display value at the end of pure show/hide animations - if ( !propTween ) { - anim.done( function() { - style.display = restoreDisplay; - } ); - if ( restoreDisplay == null ) { - display = style.display; - restoreDisplay = display === "none" ? 
"" : display; - } - } - style.display = "inline-block"; - } - } - } - - if ( opts.overflow ) { - style.overflow = "hidden"; - anim.always( function() { - style.overflow = opts.overflow[ 0 ]; - style.overflowX = opts.overflow[ 1 ]; - style.overflowY = opts.overflow[ 2 ]; - } ); - } - - // Implement show/hide animations - propTween = false; - for ( prop in orig ) { - - // General show/hide setup for this element animation - if ( !propTween ) { - if ( dataShow ) { - if ( "hidden" in dataShow ) { - hidden = dataShow.hidden; - } - } else { - dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); - } - - // Store hidden/visible for toggle so `.stop().toggle()` "reverses" - if ( toggle ) { - dataShow.hidden = !hidden; - } - - // Show elements before animating them - if ( hidden ) { - showHide( [ elem ], true ); - } - - /* eslint-disable no-loop-func */ - - anim.done( function() { - - /* eslint-enable no-loop-func */ - - // The final step of a "hide" animation is actually hiding the element - if ( !hidden ) { - showHide( [ elem ] ); - } - dataPriv.remove( elem, "fxshow" ); - for ( prop in orig ) { - jQuery.style( elem, prop, orig[ prop ] ); - } - } ); - } - - // Per-property setup - propTween = createTween( hidden ? 
dataShow[ prop ] : 0, prop, anim ); - if ( !( prop in dataShow ) ) { - dataShow[ prop ] = propTween.start; - if ( hidden ) { - propTween.end = propTween.start; - propTween.start = 0; - } - } - } -} - -function propFilter( props, specialEasing ) { - var index, name, easing, value, hooks; - - // camelCase, specialEasing and expand cssHook pass - for ( index in props ) { - name = camelCase( index ); - easing = specialEasing[ name ]; - value = props[ index ]; - if ( Array.isArray( value ) ) { - easing = value[ 1 ]; - value = props[ index ] = value[ 0 ]; - } - - if ( index !== name ) { - props[ name ] = value; - delete props[ index ]; - } - - hooks = jQuery.cssHooks[ name ]; - if ( hooks && "expand" in hooks ) { - value = hooks.expand( value ); - delete props[ name ]; - - // Not quite $.extend, this won't overwrite existing keys. - // Reusing 'index' because we have the correct "name" - for ( index in value ) { - if ( !( index in props ) ) { - props[ index ] = value[ index ]; - specialEasing[ index ] = easing; - } - } - } else { - specialEasing[ name ] = easing; - } - } -} - -function Animation( elem, properties, options ) { - var result, - stopped, - index = 0, - length = Animation.prefilters.length, - deferred = jQuery.Deferred().always( function() { - - // Don't match elem in the :animated selector - delete tick.elem; - } ), - tick = function() { - if ( stopped ) { - return false; - } - var currentTime = fxNow || createFxNow(), - remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), - - // Support: Android 2.3 only - // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) - temp = remaining / animation.duration || 0, - percent = 1 - temp, - index = 0, - length = animation.tweens.length; - - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( percent ); - } - - deferred.notifyWith( elem, [ animation, percent, remaining ] ); - - // If there's more to do, yield - if ( percent < 1 && length ) { - return 
remaining; - } - - // If this was an empty animation, synthesize a final progress notification - if ( !length ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - } - - // Resolve the animation and report its conclusion - deferred.resolveWith( elem, [ animation ] ); - return false; - }, - animation = deferred.promise( { - elem: elem, - props: jQuery.extend( {}, properties ), - opts: jQuery.extend( true, { - specialEasing: {}, - easing: jQuery.easing._default - }, options ), - originalProperties: properties, - originalOptions: options, - startTime: fxNow || createFxNow(), - duration: options.duration, - tweens: [], - createTween: function( prop, end ) { - var tween = jQuery.Tween( elem, animation.opts, prop, end, - animation.opts.specialEasing[ prop ] || animation.opts.easing ); - animation.tweens.push( tween ); - return tween; - }, - stop: function( gotoEnd ) { - var index = 0, - - // If we are going to the end, we want to run all the tweens - // otherwise we skip this part - length = gotoEnd ? 
animation.tweens.length : 0; - if ( stopped ) { - return this; - } - stopped = true; - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( 1 ); - } - - // Resolve when we played the last frame; otherwise, reject - if ( gotoEnd ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - deferred.resolveWith( elem, [ animation, gotoEnd ] ); - } else { - deferred.rejectWith( elem, [ animation, gotoEnd ] ); - } - return this; - } - } ), - props = animation.props; - - propFilter( props, animation.opts.specialEasing ); - - for ( ; index < length; index++ ) { - result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); - if ( result ) { - if ( isFunction( result.stop ) ) { - jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = - result.stop.bind( result ); - } - return result; - } - } - - jQuery.map( props, createTween, animation ); - - if ( isFunction( animation.opts.start ) ) { - animation.opts.start.call( elem, animation ); - } - - // Attach callbacks from options - animation - .progress( animation.opts.progress ) - .done( animation.opts.done, animation.opts.complete ) - .fail( animation.opts.fail ) - .always( animation.opts.always ); - - jQuery.fx.timer( - jQuery.extend( tick, { - elem: elem, - anim: animation, - queue: animation.opts.queue - } ) - ); - - return animation; -} - -jQuery.Animation = jQuery.extend( Animation, { - - tweeners: { - "*": [ function( prop, value ) { - var tween = this.createTween( prop, value ); - adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); - return tween; - } ] - }, - - tweener: function( props, callback ) { - if ( isFunction( props ) ) { - callback = props; - props = [ "*" ]; - } else { - props = props.match( rnothtmlwhite ); - } - - var prop, - index = 0, - length = props.length; - - for ( ; index < length; index++ ) { - prop = props[ index ]; - Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; - Animation.tweeners[ prop ].unshift( callback ); - } - }, - 
- prefilters: [ defaultPrefilter ], - - prefilter: function( callback, prepend ) { - if ( prepend ) { - Animation.prefilters.unshift( callback ); - } else { - Animation.prefilters.push( callback ); - } - } -} ); - -jQuery.speed = function( speed, easing, fn ) { - var opt = speed && typeof speed === "object" ? jQuery.extend( {}, speed ) : { - complete: fn || !fn && easing || - isFunction( speed ) && speed, - duration: speed, - easing: fn && easing || easing && !isFunction( easing ) && easing - }; - - // Go to the end state if fx are off - if ( jQuery.fx.off ) { - opt.duration = 0; - - } else { - if ( typeof opt.duration !== "number" ) { - if ( opt.duration in jQuery.fx.speeds ) { - opt.duration = jQuery.fx.speeds[ opt.duration ]; - - } else { - opt.duration = jQuery.fx.speeds._default; - } - } - } - - // Normalize opt.queue - true/undefined/null -> "fx" - if ( opt.queue == null || opt.queue === true ) { - opt.queue = "fx"; - } - - // Queueing - opt.old = opt.complete; - - opt.complete = function() { - if ( isFunction( opt.old ) ) { - opt.old.call( this ); - } - - if ( opt.queue ) { - jQuery.dequeue( this, opt.queue ); - } - }; - - return opt; -}; - -jQuery.fn.extend( { - fadeTo: function( speed, to, easing, callback ) { - - // Show any hidden elements after setting opacity to 0 - return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() - - // Animate to the value specified - .end().animate( { opacity: to }, speed, easing, callback ); - }, - animate: function( prop, speed, easing, callback ) { - var empty = jQuery.isEmptyObject( prop ), - optall = jQuery.speed( speed, easing, callback ), - doAnimation = function() { - - // Operate on a copy of prop so per-property easing won't be lost - var anim = Animation( this, jQuery.extend( {}, prop ), optall ); - - // Empty animations, or finishing resolves immediately - if ( empty || dataPriv.get( this, "finish" ) ) { - anim.stop( true ); - } - }; - - doAnimation.finish = doAnimation; - - return empty || 
optall.queue === false ? - this.each( doAnimation ) : - this.queue( optall.queue, doAnimation ); - }, - stop: function( type, clearQueue, gotoEnd ) { - var stopQueue = function( hooks ) { - var stop = hooks.stop; - delete hooks.stop; - stop( gotoEnd ); - }; - - if ( typeof type !== "string" ) { - gotoEnd = clearQueue; - clearQueue = type; - type = undefined; - } - if ( clearQueue ) { - this.queue( type || "fx", [] ); - } - - return this.each( function() { - var dequeue = true, - index = type != null && type + "queueHooks", - timers = jQuery.timers, - data = dataPriv.get( this ); - - if ( index ) { - if ( data[ index ] && data[ index ].stop ) { - stopQueue( data[ index ] ); - } - } else { - for ( index in data ) { - if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { - stopQueue( data[ index ] ); - } - } - } - - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && - ( type == null || timers[ index ].queue === type ) ) { - - timers[ index ].anim.stop( gotoEnd ); - dequeue = false; - timers.splice( index, 1 ); - } - } - - // Start the next in the queue if the last step wasn't forced. - // Timers currently will call their complete callbacks, which - // will dequeue but only if they were gotoEnd. - if ( dequeue || !gotoEnd ) { - jQuery.dequeue( this, type ); - } - } ); - }, - finish: function( type ) { - if ( type !== false ) { - type = type || "fx"; - } - return this.each( function() { - var index, - data = dataPriv.get( this ), - queue = data[ type + "queue" ], - hooks = data[ type + "queueHooks" ], - timers = jQuery.timers, - length = queue ? 
queue.length : 0; - - // Enable finishing flag on private data - data.finish = true; - - // Empty the queue first - jQuery.queue( this, type, [] ); - - if ( hooks && hooks.stop ) { - hooks.stop.call( this, true ); - } - - // Look for any active animations, and finish them - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && timers[ index ].queue === type ) { - timers[ index ].anim.stop( true ); - timers.splice( index, 1 ); - } - } - - // Look for any animations in the old queue and finish them - for ( index = 0; index < length; index++ ) { - if ( queue[ index ] && queue[ index ].finish ) { - queue[ index ].finish.call( this ); - } - } - - // Turn off finishing flag - delete data.finish; - } ); - } -} ); - -jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { - var cssFn = jQuery.fn[ name ]; - jQuery.fn[ name ] = function( speed, easing, callback ) { - return speed == null || typeof speed === "boolean" ? - cssFn.apply( this, arguments ) : - this.animate( genFx( name, true ), speed, easing, callback ); - }; -} ); - -// Generate shortcuts for custom animations -jQuery.each( { - slideDown: genFx( "show" ), - slideUp: genFx( "hide" ), - slideToggle: genFx( "toggle" ), - fadeIn: { opacity: "show" }, - fadeOut: { opacity: "hide" }, - fadeToggle: { opacity: "toggle" } -}, function( name, props ) { - jQuery.fn[ name ] = function( speed, easing, callback ) { - return this.animate( props, speed, easing, callback ); - }; -} ); - -jQuery.timers = []; -jQuery.fx.tick = function() { - var timer, - i = 0, - timers = jQuery.timers; - - fxNow = Date.now(); - - for ( ; i < timers.length; i++ ) { - timer = timers[ i ]; - - // Run the timer and safely remove it when done (allowing for external removal) - if ( !timer() && timers[ i ] === timer ) { - timers.splice( i--, 1 ); - } - } - - if ( !timers.length ) { - jQuery.fx.stop(); - } - fxNow = undefined; -}; - -jQuery.fx.timer = function( timer ) { - jQuery.timers.push( timer ); - 
jQuery.fx.start(); -}; - -jQuery.fx.interval = 13; -jQuery.fx.start = function() { - if ( inProgress ) { - return; - } - - inProgress = true; - schedule(); -}; - -jQuery.fx.stop = function() { - inProgress = null; -}; - -jQuery.fx.speeds = { - slow: 600, - fast: 200, - - // Default speed - _default: 400 -}; - - -// Based off of the plugin by Clint Helfers, with permission. -// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ -jQuery.fn.delay = function( time, type ) { - time = jQuery.fx ? jQuery.fx.speeds[ time ] || time : time; - type = type || "fx"; - - return this.queue( type, function( next, hooks ) { - var timeout = window.setTimeout( next, time ); - hooks.stop = function() { - window.clearTimeout( timeout ); - }; - } ); -}; - - -( function() { - var input = document.createElement( "input" ), - select = document.createElement( "select" ), - opt = select.appendChild( document.createElement( "option" ) ); - - input.type = "checkbox"; - - // Support: Android <=4.3 only - // Default value for a checkbox should be "on" - support.checkOn = input.value !== ""; - - // Support: IE <=11 only - // Must access selectedIndex to make default options select - support.optSelected = opt.selected; - - // Support: IE <=11 only - // An input loses its value after becoming a radio - input = document.createElement( "input" ); - input.value = "t"; - input.type = "radio"; - support.radioValue = input.value === "t"; -} )(); - - -var boolHook, - attrHandle = jQuery.expr.attrHandle; - -jQuery.fn.extend( { - attr: function( name, value ) { - return access( this, jQuery.attr, name, value, arguments.length > 1 ); - }, - - removeAttr: function( name ) { - return this.each( function() { - jQuery.removeAttr( this, name ); - } ); - } -} ); - -jQuery.extend( { - attr: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set attributes on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || 
nType === 2 ) { - return; - } - - // Fallback to prop when attributes are not supported - if ( typeof elem.getAttribute === "undefined" ) { - return jQuery.prop( elem, name, value ); - } - - // Attribute hooks are determined by the lowercase version - // Grab necessary hook if one is defined - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - hooks = jQuery.attrHooks[ name.toLowerCase() ] || - ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); - } - - if ( value !== undefined ) { - if ( value === null ) { - jQuery.removeAttr( elem, name ); - return; - } - - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - elem.setAttribute( name, value + "" ); - return value; - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - ret = jQuery.find.attr( elem, name ); - - // Non-existent attributes return null, we normalize to undefined - return ret == null ? undefined : ret; - }, - - attrHooks: { - type: { - set: function( elem, value ) { - if ( !support.radioValue && value === "radio" && - nodeName( elem, "input" ) ) { - var val = elem.value; - elem.setAttribute( "type", value ); - if ( val ) { - elem.value = val; - } - return value; - } - } - } - }, - - removeAttr: function( elem, value ) { - var name, - i = 0, - - // Attribute names can contain non-HTML whitespace characters - // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 - attrNames = value && value.match( rnothtmlwhite ); - - if ( attrNames && elem.nodeType === 1 ) { - while ( ( name = attrNames[ i++ ] ) ) { - elem.removeAttribute( name ); - } - } - } -} ); - -// Hooks for boolean attributes -boolHook = { - set: function( elem, value, name ) { - if ( value === false ) { - - // Remove boolean attributes when set to false - jQuery.removeAttr( elem, name ); - } else { - elem.setAttribute( name, name ); - } - return name; - } -}; - -jQuery.each( 
jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { - var getter = attrHandle[ name ] || jQuery.find.attr; - - attrHandle[ name ] = function( elem, name, isXML ) { - var ret, handle, - lowercaseName = name.toLowerCase(); - - if ( !isXML ) { - - // Avoid an infinite loop by temporarily removing this function from the getter - handle = attrHandle[ lowercaseName ]; - attrHandle[ lowercaseName ] = ret; - ret = getter( elem, name, isXML ) != null ? - lowercaseName : - null; - attrHandle[ lowercaseName ] = handle; - } - return ret; - }; -} ); - - - - -var rfocusable = /^(?:input|select|textarea|button)$/i, - rclickable = /^(?:a|area)$/i; - -jQuery.fn.extend( { - prop: function( name, value ) { - return access( this, jQuery.prop, name, value, arguments.length > 1 ); - }, - - removeProp: function( name ) { - return this.each( function() { - delete this[ jQuery.propFix[ name ] || name ]; - } ); - } -} ); - -jQuery.extend( { - prop: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set properties on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || nType === 2 ) { - return; - } - - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - - // Fix name and attach hooks - name = jQuery.propFix[ name ] || name; - hooks = jQuery.propHooks[ name ]; - } - - if ( value !== undefined ) { - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - return ( elem[ name ] = value ); - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - return elem[ name ]; - }, - - propHooks: { - tabIndex: { - get: function( elem ) { - - // Support: IE <=9 - 11 only - // elem.tabIndex doesn't always return the - // correct value when it hasn't been explicitly set - // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ - // Use 
proper attribute retrieval(#12072) - var tabindex = jQuery.find.attr( elem, "tabindex" ); - - if ( tabindex ) { - return parseInt( tabindex, 10 ); - } - - if ( - rfocusable.test( elem.nodeName ) || - rclickable.test( elem.nodeName ) && - elem.href - ) { - return 0; - } - - return -1; - } - } - }, - - propFix: { - "for": "htmlFor", - "class": "className" - } -} ); - -// Support: IE <=11 only -// Accessing the selectedIndex property -// forces the browser to respect setting selected -// on the option -// The getter ensures a default option is selected -// when in an optgroup -// eslint rule "no-unused-expressions" is disabled for this code -// since it considers such accessions noop -if ( !support.optSelected ) { - jQuery.propHooks.selected = { - get: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent && parent.parentNode ) { - parent.parentNode.selectedIndex; - } - return null; - }, - set: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent ) { - parent.selectedIndex; - - if ( parent.parentNode ) { - parent.parentNode.selectedIndex; - } - } - } - }; -} - -jQuery.each( [ - "tabIndex", - "readOnly", - "maxLength", - "cellSpacing", - "cellPadding", - "rowSpan", - "colSpan", - "useMap", - "frameBorder", - "contentEditable" -], function() { - jQuery.propFix[ this.toLowerCase() ] = this; -} ); - - - - - // Strip and collapse whitespace according to HTML spec - // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace - function stripAndCollapse( value ) { - var tokens = value.match( rnothtmlwhite ) || []; - return tokens.join( " " ); - } - - -function getClass( elem ) { - return elem.getAttribute && elem.getAttribute( "class" ) || ""; -} - -function classesToArray( value ) { - if ( Array.isArray( value ) ) { - return value; - } - if ( typeof value === "string" ) { - return value.match( rnothtmlwhite ) || []; - } - return []; -} - 
-jQuery.fn.extend( { - addClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - classes = classesToArray( value ); - - if ( classes.length ) { - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - if ( cur.indexOf( " " + clazz + " " ) < 0 ) { - cur += clazz + " "; - } - } - - // Only assign if different to avoid unneeded rendering. - finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - removeClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - if ( !arguments.length ) { - return this.attr( "class", "" ); - } - - classes = classesToArray( value ); - - if ( classes.length ) { - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - - // This expression is here for better compressibility (see addClass) - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - - // Remove *all* instances - while ( cur.indexOf( " " + clazz + " " ) > -1 ) { - cur = cur.replace( " " + clazz + " ", " " ); - } - } - - // Only assign if different to avoid unneeded rendering. 
- finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - toggleClass: function( value, stateVal ) { - var type = typeof value, - isValidValue = type === "string" || Array.isArray( value ); - - if ( typeof stateVal === "boolean" && isValidValue ) { - return stateVal ? this.addClass( value ) : this.removeClass( value ); - } - - if ( isFunction( value ) ) { - return this.each( function( i ) { - jQuery( this ).toggleClass( - value.call( this, i, getClass( this ), stateVal ), - stateVal - ); - } ); - } - - return this.each( function() { - var className, i, self, classNames; - - if ( isValidValue ) { - - // Toggle individual class names - i = 0; - self = jQuery( this ); - classNames = classesToArray( value ); - - while ( ( className = classNames[ i++ ] ) ) { - - // Check each className given, space separated list - if ( self.hasClass( className ) ) { - self.removeClass( className ); - } else { - self.addClass( className ); - } - } - - // Toggle whole class name - } else if ( value === undefined || type === "boolean" ) { - className = getClass( this ); - if ( className ) { - - // Store className if set - dataPriv.set( this, "__className__", className ); - } - - // If the element has a class name or if we're passed `false`, - // then remove the whole classname (if there was one, the above saved it). - // Otherwise bring back whatever was previously saved (if anything), - // falling back to the empty string if nothing was stored. - if ( this.setAttribute ) { - this.setAttribute( "class", - className || value === false ? 
- "" : - dataPriv.get( this, "__className__" ) || "" - ); - } - } - } ); - }, - - hasClass: function( selector ) { - var className, elem, - i = 0; - - className = " " + selector + " "; - while ( ( elem = this[ i++ ] ) ) { - if ( elem.nodeType === 1 && - ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { - return true; - } - } - - return false; - } -} ); - - - - -var rreturn = /\r/g; - -jQuery.fn.extend( { - val: function( value ) { - var hooks, ret, valueIsFunction, - elem = this[ 0 ]; - - if ( !arguments.length ) { - if ( elem ) { - hooks = jQuery.valHooks[ elem.type ] || - jQuery.valHooks[ elem.nodeName.toLowerCase() ]; - - if ( hooks && - "get" in hooks && - ( ret = hooks.get( elem, "value" ) ) !== undefined - ) { - return ret; - } - - ret = elem.value; - - // Handle most common string cases - if ( typeof ret === "string" ) { - return ret.replace( rreturn, "" ); - } - - // Handle cases where value is null/undef or number - return ret == null ? "" : ret; - } - - return; - } - - valueIsFunction = isFunction( value ); - - return this.each( function( i ) { - var val; - - if ( this.nodeType !== 1 ) { - return; - } - - if ( valueIsFunction ) { - val = value.call( this, i, jQuery( this ).val() ); - } else { - val = value; - } - - // Treat null/undefined as ""; convert numbers to string - if ( val == null ) { - val = ""; - - } else if ( typeof val === "number" ) { - val += ""; - - } else if ( Array.isArray( val ) ) { - val = jQuery.map( val, function( value ) { - return value == null ? "" : value + ""; - } ); - } - - hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; - - // If set returns undefined, fall back to normal setting - if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { - this.value = val; - } - } ); - } -} ); - -jQuery.extend( { - valHooks: { - option: { - get: function( elem ) { - - var val = jQuery.find.attr( elem, "value" ); - return val != null ? 
- val : - - // Support: IE <=10 - 11 only - // option.text throws exceptions (#14686, #14858) - // Strip and collapse whitespace - // https://html.spec.whatwg.org/#strip-and-collapse-whitespace - stripAndCollapse( jQuery.text( elem ) ); - } - }, - select: { - get: function( elem ) { - var value, option, i, - options = elem.options, - index = elem.selectedIndex, - one = elem.type === "select-one", - values = one ? null : [], - max = one ? index + 1 : options.length; - - if ( index < 0 ) { - i = max; - - } else { - i = one ? index : 0; - } - - // Loop through all the selected options - for ( ; i < max; i++ ) { - option = options[ i ]; - - // Support: IE <=9 only - // IE8-9 doesn't update selected after form reset (#2551) - if ( ( option.selected || i === index ) && - - // Don't return options that are disabled or in a disabled optgroup - !option.disabled && - ( !option.parentNode.disabled || - !nodeName( option.parentNode, "optgroup" ) ) ) { - - // Get the specific value for the option - value = jQuery( option ).val(); - - // We don't need an array for one selects - if ( one ) { - return value; - } - - // Multi-Selects return an array - values.push( value ); - } - } - - return values; - }, - - set: function( elem, value ) { - var optionSet, option, - options = elem.options, - values = jQuery.makeArray( value ), - i = options.length; - - while ( i-- ) { - option = options[ i ]; - - /* eslint-disable no-cond-assign */ - - if ( option.selected = - jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 - ) { - optionSet = true; - } - - /* eslint-enable no-cond-assign */ - } - - // Force browsers to behave consistently when non-matching value is set - if ( !optionSet ) { - elem.selectedIndex = -1; - } - return values; - } - } - } -} ); - -// Radios and checkboxes getter/setter -jQuery.each( [ "radio", "checkbox" ], function() { - jQuery.valHooks[ this ] = { - set: function( elem, value ) { - if ( Array.isArray( value ) ) { - return ( elem.checked = 
jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); - } - } - }; - if ( !support.checkOn ) { - jQuery.valHooks[ this ].get = function( elem ) { - return elem.getAttribute( "value" ) === null ? "on" : elem.value; - }; - } -} ); - - - - -// Return jQuery for attributes-only inclusion - - -support.focusin = "onfocusin" in window; - - -var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, - stopPropagationCallback = function( e ) { - e.stopPropagation(); - }; - -jQuery.extend( jQuery.event, { - - trigger: function( event, data, elem, onlyHandlers ) { - - var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, - eventPath = [ elem || document ], - type = hasOwn.call( event, "type" ) ? event.type : event, - namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; - - cur = lastElement = tmp = elem = elem || document; - - // Don't do events on text and comment nodes - if ( elem.nodeType === 3 || elem.nodeType === 8 ) { - return; - } - - // focus/blur morphs to focusin/out; ensure we're not firing them right now - if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { - return; - } - - if ( type.indexOf( "." ) > -1 ) { - - // Namespaced trigger; create a regexp to match event type in handle() - namespaces = type.split( "." ); - type = namespaces.shift(); - namespaces.sort(); - } - ontype = type.indexOf( ":" ) < 0 && "on" + type; - - // Caller can pass in a jQuery.Event object, Object, or just an event type string - event = event[ jQuery.expando ] ? - event : - new jQuery.Event( type, typeof event === "object" && event ); - - // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) - event.isTrigger = onlyHandlers ? 2 : 3; - event.namespace = namespaces.join( "." ); - event.rnamespace = event.namespace ? 
- new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : - null; - - // Clean up the event in case it is being reused - event.result = undefined; - if ( !event.target ) { - event.target = elem; - } - - // Clone any incoming data and prepend the event, creating the handler arg list - data = data == null ? - [ event ] : - jQuery.makeArray( data, [ event ] ); - - // Allow special events to draw outside the lines - special = jQuery.event.special[ type ] || {}; - if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { - return; - } - - // Determine event propagation path in advance, per W3C events spec (#9951) - // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) - if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { - - bubbleType = special.delegateType || type; - if ( !rfocusMorph.test( bubbleType + type ) ) { - cur = cur.parentNode; - } - for ( ; cur; cur = cur.parentNode ) { - eventPath.push( cur ); - tmp = cur; - } - - // Only add window if we got to document (e.g., not plain obj or detached DOM) - if ( tmp === ( elem.ownerDocument || document ) ) { - eventPath.push( tmp.defaultView || tmp.parentWindow || window ); - } - } - - // Fire handlers on the event path - i = 0; - while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { - lastElement = cur; - event.type = i > 1 ? 
- bubbleType : - special.bindType || type; - - // jQuery handler - handle = ( dataPriv.get( cur, "events" ) || Object.create( null ) )[ event.type ] && - dataPriv.get( cur, "handle" ); - if ( handle ) { - handle.apply( cur, data ); - } - - // Native handler - handle = ontype && cur[ ontype ]; - if ( handle && handle.apply && acceptData( cur ) ) { - event.result = handle.apply( cur, data ); - if ( event.result === false ) { - event.preventDefault(); - } - } - } - event.type = type; - - // If nobody prevented the default action, do it now - if ( !onlyHandlers && !event.isDefaultPrevented() ) { - - if ( ( !special._default || - special._default.apply( eventPath.pop(), data ) === false ) && - acceptData( elem ) ) { - - // Call a native DOM method on the target with the same name as the event. - // Don't do default actions on window, that's where global variables be (#6170) - if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { - - // Don't re-trigger an onFOO event when we call its FOO() method - tmp = elem[ ontype ]; - - if ( tmp ) { - elem[ ontype ] = null; - } - - // Prevent re-triggering of the same event, since we already bubbled it above - jQuery.event.triggered = type; - - if ( event.isPropagationStopped() ) { - lastElement.addEventListener( type, stopPropagationCallback ); - } - - elem[ type ](); - - if ( event.isPropagationStopped() ) { - lastElement.removeEventListener( type, stopPropagationCallback ); - } - - jQuery.event.triggered = undefined; - - if ( tmp ) { - elem[ ontype ] = tmp; - } - } - } - } - - return event.result; - }, - - // Piggyback on a donor event to simulate a different one - // Used only for `focus(in | out)` events - simulate: function( type, elem, event ) { - var e = jQuery.extend( - new jQuery.Event(), - event, - { - type: type, - isSimulated: true - } - ); - - jQuery.event.trigger( e, null, elem ); - } - -} ); - -jQuery.fn.extend( { - - trigger: function( type, data ) { - return this.each( function() { - 
jQuery.event.trigger( type, data, this ); - } ); - }, - triggerHandler: function( type, data ) { - var elem = this[ 0 ]; - if ( elem ) { - return jQuery.event.trigger( type, data, elem, true ); - } - } -} ); - - -// Support: Firefox <=44 -// Firefox doesn't have focus(in | out) events -// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 -// -// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 -// focus(in | out) events fire after focus & blur events, -// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order -// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 -if ( !support.focusin ) { - jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { - - // Attach a single capturing handler on the document while someone wants focusin/focusout - var handler = function( event ) { - jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); - }; - - jQuery.event.special[ fix ] = { - setup: function() { - - // Handle: regular nodes (via `this.ownerDocument`), window - // (via `this.document`) & document (via `this`). 
- var doc = this.ownerDocument || this.document || this, - attaches = dataPriv.access( doc, fix ); - - if ( !attaches ) { - doc.addEventListener( orig, handler, true ); - } - dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); - }, - teardown: function() { - var doc = this.ownerDocument || this.document || this, - attaches = dataPriv.access( doc, fix ) - 1; - - if ( !attaches ) { - doc.removeEventListener( orig, handler, true ); - dataPriv.remove( doc, fix ); - - } else { - dataPriv.access( doc, fix, attaches ); - } - } - }; - } ); -} -var location = window.location; - -var nonce = { guid: Date.now() }; - -var rquery = ( /\?/ ); - - - -// Cross-browser xml parsing -jQuery.parseXML = function( data ) { - var xml, parserErrorElem; - if ( !data || typeof data !== "string" ) { - return null; - } - - // Support: IE 9 - 11 only - // IE throws on parseFromString with invalid input. - try { - xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); - } catch ( e ) {} - - parserErrorElem = xml && xml.getElementsByTagName( "parsererror" )[ 0 ]; - if ( !xml || parserErrorElem ) { - jQuery.error( "Invalid XML: " + ( - parserErrorElem ? - jQuery.map( parserErrorElem.childNodes, function( el ) { - return el.textContent; - } ).join( "\n" ) : - data - ) ); - } - return xml; -}; - - -var - rbracket = /\[\]$/, - rCRLF = /\r?\n/g, - rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, - rsubmittable = /^(?:input|select|textarea|keygen)/i; - -function buildParams( prefix, obj, traditional, add ) { - var name; - - if ( Array.isArray( obj ) ) { - - // Serialize array item. - jQuery.each( obj, function( i, v ) { - if ( traditional || rbracket.test( prefix ) ) { - - // Treat each array item as a scalar. - add( prefix, v ); - - } else { - - // Item is non-scalar (array or object), encode its numeric index. - buildParams( - prefix + "[" + ( typeof v === "object" && v != null ? 
i : "" ) + "]", - v, - traditional, - add - ); - } - } ); - - } else if ( !traditional && toType( obj ) === "object" ) { - - // Serialize object item. - for ( name in obj ) { - buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); - } - - } else { - - // Serialize scalar item. - add( prefix, obj ); - } -} - -// Serialize an array of form elements or a set of -// key/values into a query string -jQuery.param = function( a, traditional ) { - var prefix, - s = [], - add = function( key, valueOrFunction ) { - - // If value is a function, invoke it and use its return value - var value = isFunction( valueOrFunction ) ? - valueOrFunction() : - valueOrFunction; - - s[ s.length ] = encodeURIComponent( key ) + "=" + - encodeURIComponent( value == null ? "" : value ); - }; - - if ( a == null ) { - return ""; - } - - // If an array was passed in, assume that it is an array of form elements. - if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { - - // Serialize the form elements - jQuery.each( a, function() { - add( this.name, this.value ); - } ); - - } else { - - // If traditional, encode the "old" way (the way 1.3.2 or older - // did it), otherwise encode params recursively. - for ( prefix in a ) { - buildParams( prefix, a[ prefix ], traditional, add ); - } - } - - // Return the resulting serialization - return s.join( "&" ); -}; - -jQuery.fn.extend( { - serialize: function() { - return jQuery.param( this.serializeArray() ); - }, - serializeArray: function() { - return this.map( function() { - - // Can add propHook for "elements" to filter or add form elements - var elements = jQuery.prop( this, "elements" ); - return elements ? 
jQuery.makeArray( elements ) : this; - } ).filter( function() { - var type = this.type; - - // Use .is( ":disabled" ) so that fieldset[disabled] works - return this.name && !jQuery( this ).is( ":disabled" ) && - rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && - ( this.checked || !rcheckableType.test( type ) ); - } ).map( function( _i, elem ) { - var val = jQuery( this ).val(); - - if ( val == null ) { - return null; - } - - if ( Array.isArray( val ) ) { - return jQuery.map( val, function( val ) { - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ); - } - - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ).get(); - } -} ); - - -var - r20 = /%20/g, - rhash = /#.*$/, - rantiCache = /([?&])_=[^&]*/, - rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, - - // #7653, #8125, #8152: local protocol detection - rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, - rnoContent = /^(?:GET|HEAD)$/, - rprotocol = /^\/\//, - - /* Prefilters - * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) - * 2) These are called: - * - BEFORE asking for a transport - * - AFTER param serialization (s.data is a string if s.processData is true) - * 3) key is the dataType - * 4) the catchall symbol "*" can be used - * 5) execution will start with transport dataType and THEN continue down to "*" if needed - */ - prefilters = {}, - - /* Transports bindings - * 1) key is the dataType - * 2) the catchall symbol "*" can be used - * 3) selection will start with transport dataType and THEN go to "*" if needed - */ - transports = {}, - - // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression - allTypes = "*/".concat( "*" ), - - // Anchor tag for parsing the document origin - originAnchor = document.createElement( "a" ); - -originAnchor.href = location.href; - -// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport -function 
addToPrefiltersOrTransports( structure ) { - - // dataTypeExpression is optional and defaults to "*" - return function( dataTypeExpression, func ) { - - if ( typeof dataTypeExpression !== "string" ) { - func = dataTypeExpression; - dataTypeExpression = "*"; - } - - var dataType, - i = 0, - dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; - - if ( isFunction( func ) ) { - - // For each dataType in the dataTypeExpression - while ( ( dataType = dataTypes[ i++ ] ) ) { - - // Prepend if requested - if ( dataType[ 0 ] === "+" ) { - dataType = dataType.slice( 1 ) || "*"; - ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); - - // Otherwise append - } else { - ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); - } - } - } - }; -} - -// Base inspection function for prefilters and transports -function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { - - var inspected = {}, - seekingTransport = ( structure === transports ); - - function inspect( dataType ) { - var selected; - inspected[ dataType ] = true; - jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { - var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); - if ( typeof dataTypeOrTransport === "string" && - !seekingTransport && !inspected[ dataTypeOrTransport ] ) { - - options.dataTypes.unshift( dataTypeOrTransport ); - inspect( dataTypeOrTransport ); - return false; - } else if ( seekingTransport ) { - return !( selected = dataTypeOrTransport ); - } - } ); - return selected; - } - - return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); -} - -// A special extend for ajax options -// that takes "flat" options (not to be deep extended) -// Fixes #9887 -function ajaxExtend( target, src ) { - var key, deep, - flatOptions = jQuery.ajaxSettings.flatOptions || {}; - - for ( key in src ) { - if ( src[ key ] !== undefined ) { - ( flatOptions[ key ] ? 
target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; - } - } - if ( deep ) { - jQuery.extend( true, target, deep ); - } - - return target; -} - -/* Handles responses to an ajax request: - * - finds the right dataType (mediates between content-type and expected dataType) - * - returns the corresponding response - */ -function ajaxHandleResponses( s, jqXHR, responses ) { - - var ct, type, finalDataType, firstDataType, - contents = s.contents, - dataTypes = s.dataTypes; - - // Remove auto dataType and get content-type in the process - while ( dataTypes[ 0 ] === "*" ) { - dataTypes.shift(); - if ( ct === undefined ) { - ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); - } - } - - // Check if we're dealing with a known content-type - if ( ct ) { - for ( type in contents ) { - if ( contents[ type ] && contents[ type ].test( ct ) ) { - dataTypes.unshift( type ); - break; - } - } - } - - // Check to see if we have a response for the expected dataType - if ( dataTypes[ 0 ] in responses ) { - finalDataType = dataTypes[ 0 ]; - } else { - - // Try convertible dataTypes - for ( type in responses ) { - if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { - finalDataType = type; - break; - } - if ( !firstDataType ) { - firstDataType = type; - } - } - - // Or just use first one - finalDataType = finalDataType || firstDataType; - } - - // If we found a dataType - // We add the dataType to the list if needed - // and return the corresponding response - if ( finalDataType ) { - if ( finalDataType !== dataTypes[ 0 ] ) { - dataTypes.unshift( finalDataType ); - } - return responses[ finalDataType ]; - } -} - -/* Chain conversions given the request and the original response - * Also sets the responseXXX fields on the jqXHR instance - */ -function ajaxConvert( s, response, jqXHR, isSuccess ) { - var conv2, current, conv, tmp, prev, - converters = {}, - - // Work with a copy of dataTypes in case we need to modify it for conversion - dataTypes = 
s.dataTypes.slice(); - - // Create converters map with lowercased keys - if ( dataTypes[ 1 ] ) { - for ( conv in s.converters ) { - converters[ conv.toLowerCase() ] = s.converters[ conv ]; - } - } - - current = dataTypes.shift(); - - // Convert to each sequential dataType - while ( current ) { - - if ( s.responseFields[ current ] ) { - jqXHR[ s.responseFields[ current ] ] = response; - } - - // Apply the dataFilter if provided - if ( !prev && isSuccess && s.dataFilter ) { - response = s.dataFilter( response, s.dataType ); - } - - prev = current; - current = dataTypes.shift(); - - if ( current ) { - - // There's only work to do if current dataType is non-auto - if ( current === "*" ) { - - current = prev; - - // Convert response if prev dataType is non-auto and differs from current - } else if ( prev !== "*" && prev !== current ) { - - // Seek a direct converter - conv = converters[ prev + " " + current ] || converters[ "* " + current ]; - - // If none found, seek a pair - if ( !conv ) { - for ( conv2 in converters ) { - - // If conv2 outputs current - tmp = conv2.split( " " ); - if ( tmp[ 1 ] === current ) { - - // If prev can be converted to accepted input - conv = converters[ prev + " " + tmp[ 0 ] ] || - converters[ "* " + tmp[ 0 ] ]; - if ( conv ) { - - // Condense equivalence converters - if ( conv === true ) { - conv = converters[ conv2 ]; - - // Otherwise, insert the intermediate dataType - } else if ( converters[ conv2 ] !== true ) { - current = tmp[ 0 ]; - dataTypes.unshift( tmp[ 1 ] ); - } - break; - } - } - } - } - - // Apply converter (if not an equivalence) - if ( conv !== true ) { - - // Unless errors are allowed to bubble, catch and return them - if ( conv && s.throws ) { - response = conv( response ); - } else { - try { - response = conv( response ); - } catch ( e ) { - return { - state: "parsererror", - error: conv ? 
e : "No conversion from " + prev + " to " + current - }; - } - } - } - } - } - } - - return { state: "success", data: response }; -} - -jQuery.extend( { - - // Counter for holding the number of active queries - active: 0, - - // Last-Modified header cache for next request - lastModified: {}, - etag: {}, - - ajaxSettings: { - url: location.href, - type: "GET", - isLocal: rlocalProtocol.test( location.protocol ), - global: true, - processData: true, - async: true, - contentType: "application/x-www-form-urlencoded; charset=UTF-8", - - /* - timeout: 0, - data: null, - dataType: null, - username: null, - password: null, - cache: null, - throws: false, - traditional: false, - headers: {}, - */ - - accepts: { - "*": allTypes, - text: "text/plain", - html: "text/html", - xml: "application/xml, text/xml", - json: "application/json, text/javascript" - }, - - contents: { - xml: /\bxml\b/, - html: /\bhtml/, - json: /\bjson\b/ - }, - - responseFields: { - xml: "responseXML", - text: "responseText", - json: "responseJSON" - }, - - // Data converters - // Keys separate source (or catchall "*") and destination types with a single space - converters: { - - // Convert anything to text - "* text": String, - - // Text to html (true = no transformation) - "text html": true, - - // Evaluate text as a json expression - "text json": JSON.parse, - - // Parse text as xml - "text xml": jQuery.parseXML - }, - - // For options that shouldn't be deep extended: - // you can add your own custom options here if - // and when you create one that shouldn't be - // deep extended (see ajaxExtend) - flatOptions: { - url: true, - context: true - } - }, - - // Creates a full fledged settings object into target - // with both ajaxSettings and settings fields. - // If target is omitted, writes into ajaxSettings. - ajaxSetup: function( target, settings ) { - return settings ? 
- - // Building a settings object - ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : - - // Extending ajaxSettings - ajaxExtend( jQuery.ajaxSettings, target ); - }, - - ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), - ajaxTransport: addToPrefiltersOrTransports( transports ), - - // Main method - ajax: function( url, options ) { - - // If url is an object, simulate pre-1.5 signature - if ( typeof url === "object" ) { - options = url; - url = undefined; - } - - // Force options to be an object - options = options || {}; - - var transport, - - // URL without anti-cache param - cacheURL, - - // Response headers - responseHeadersString, - responseHeaders, - - // timeout handle - timeoutTimer, - - // Url cleanup var - urlAnchor, - - // Request state (becomes false upon send and true upon completion) - completed, - - // To know if global events are to be dispatched - fireGlobals, - - // Loop variable - i, - - // uncached part of the url - uncached, - - // Create the final options object - s = jQuery.ajaxSetup( {}, options ), - - // Callbacks context - callbackContext = s.context || s, - - // Context for global events is callbackContext if it is a DOM node or jQuery collection - globalEventContext = s.context && - ( callbackContext.nodeType || callbackContext.jquery ) ? 
- jQuery( callbackContext ) : - jQuery.event, - - // Deferreds - deferred = jQuery.Deferred(), - completeDeferred = jQuery.Callbacks( "once memory" ), - - // Status-dependent callbacks - statusCode = s.statusCode || {}, - - // Headers (they are sent all at once) - requestHeaders = {}, - requestHeadersNames = {}, - - // Default abort message - strAbort = "canceled", - - // Fake xhr - jqXHR = { - readyState: 0, - - // Builds headers hashtable if needed - getResponseHeader: function( key ) { - var match; - if ( completed ) { - if ( !responseHeaders ) { - responseHeaders = {}; - while ( ( match = rheaders.exec( responseHeadersString ) ) ) { - responseHeaders[ match[ 1 ].toLowerCase() + " " ] = - ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) - .concat( match[ 2 ] ); - } - } - match = responseHeaders[ key.toLowerCase() + " " ]; - } - return match == null ? null : match.join( ", " ); - }, - - // Raw string - getAllResponseHeaders: function() { - return completed ? responseHeadersString : null; - }, - - // Caches the header - setRequestHeader: function( name, value ) { - if ( completed == null ) { - name = requestHeadersNames[ name.toLowerCase() ] = - requestHeadersNames[ name.toLowerCase() ] || name; - requestHeaders[ name ] = value; - } - return this; - }, - - // Overrides response content-type header - overrideMimeType: function( type ) { - if ( completed == null ) { - s.mimeType = type; - } - return this; - }, - - // Status-dependent callbacks - statusCode: function( map ) { - var code; - if ( map ) { - if ( completed ) { - - // Execute the appropriate callbacks - jqXHR.always( map[ jqXHR.status ] ); - } else { - - // Lazy-add the new callbacks in a way that preserves old ones - for ( code in map ) { - statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; - } - } - } - return this; - }, - - // Cancel the request - abort: function( statusText ) { - var finalText = statusText || strAbort; - if ( transport ) { - transport.abort( finalText ); - } - done( 
0, finalText ); - return this; - } - }; - - // Attach deferreds - deferred.promise( jqXHR ); - - // Add protocol if not provided (prefilters might expect it) - // Handle falsy url in the settings object (#10093: consistency with old signature) - // We also use the url parameter if available - s.url = ( ( url || s.url || location.href ) + "" ) - .replace( rprotocol, location.protocol + "//" ); - - // Alias method option to type as per ticket #12004 - s.type = options.method || options.type || s.method || s.type; - - // Extract dataTypes list - s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; - - // A cross-domain request is in order when the origin doesn't match the current origin. - if ( s.crossDomain == null ) { - urlAnchor = document.createElement( "a" ); - - // Support: IE <=8 - 11, Edge 12 - 15 - // IE throws exception on accessing the href property if url is malformed, - // e.g. http://example.com:80x/ - try { - urlAnchor.href = s.url; - - // Support: IE <=8 - 11 only - // Anchor's host property isn't correctly set when s.url is relative - urlAnchor.href = urlAnchor.href; - s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== - urlAnchor.protocol + "//" + urlAnchor.host; - } catch ( e ) { - - // If there is an error parsing the URL, assume it is crossDomain, - // it can be rejected by the transport if it is invalid - s.crossDomain = true; - } - } - - // Convert data if not already a string - if ( s.data && s.processData && typeof s.data !== "string" ) { - s.data = jQuery.param( s.data, s.traditional ); - } - - // Apply prefilters - inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); - - // If request was aborted inside a prefilter, stop there - if ( completed ) { - return jqXHR; - } - - // We can fire global events as of now if asked to - // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) - fireGlobals = jQuery.event && s.global; - - // Watch for a new set of 
requests - if ( fireGlobals && jQuery.active++ === 0 ) { - jQuery.event.trigger( "ajaxStart" ); - } - - // Uppercase the type - s.type = s.type.toUpperCase(); - - // Determine if request has content - s.hasContent = !rnoContent.test( s.type ); - - // Save the URL in case we're toying with the If-Modified-Since - // and/or If-None-Match header later on - // Remove hash to simplify url manipulation - cacheURL = s.url.replace( rhash, "" ); - - // More options handling for requests with no content - if ( !s.hasContent ) { - - // Remember the hash so we can put it back - uncached = s.url.slice( cacheURL.length ); - - // If data is available and should be processed, append data to url - if ( s.data && ( s.processData || typeof s.data === "string" ) ) { - cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; - - // #9682: remove data so that it's not used in an eventual retry - delete s.data; - } - - // Add or update anti-cache param if needed - if ( s.cache === false ) { - cacheURL = cacheURL.replace( rantiCache, "$1" ); - uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + - uncached; - } - - // Put hash and anti-cache on the URL that will be requested (gh-1732) - s.url = cacheURL + uncached; - - // Change '%20' to '+' if this is encoded form body content (gh-2658) - } else if ( s.data && s.processData && - ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { - s.data = s.data.replace( r20, "+" ); - } - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
- if ( s.ifModified ) { - if ( jQuery.lastModified[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); - } - if ( jQuery.etag[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); - } - } - - // Set the correct header, if data is being sent - if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { - jqXHR.setRequestHeader( "Content-Type", s.contentType ); - } - - // Set the Accepts header for the server, depending on the dataType - jqXHR.setRequestHeader( - "Accept", - s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? - s.accepts[ s.dataTypes[ 0 ] ] + - ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) : - s.accepts[ "*" ] - ); - - // Check for headers option - for ( i in s.headers ) { - jqXHR.setRequestHeader( i, s.headers[ i ] ); - } - - // Allow custom headers/mimetypes and early abort - if ( s.beforeSend && - ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { - - // Abort if not done already and return - return jqXHR.abort(); - } - - // Aborting is no longer a cancellation - strAbort = "abort"; - - // Install callbacks on deferreds - completeDeferred.add( s.complete ); - jqXHR.done( s.success ); - jqXHR.fail( s.error ); - - // Get transport - transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); - - // If no transport, we auto-abort - if ( !transport ) { - done( -1, "No Transport" ); - } else { - jqXHR.readyState = 1; - - // Send global event - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); - } - - // If request was aborted inside ajaxSend, stop there - if ( completed ) { - return jqXHR; - } - - // Timeout - if ( s.async && s.timeout > 0 ) { - timeoutTimer = window.setTimeout( function() { - jqXHR.abort( "timeout" ); - }, s.timeout ); - } - - try { - completed = false; - transport.send( requestHeaders, done ); - } catch ( e ) { - - // Rethrow post-completion 
exceptions - if ( completed ) { - throw e; - } - - // Propagate others as results - done( -1, e ); - } - } - - // Callback for when everything is done - function done( status, nativeStatusText, responses, headers ) { - var isSuccess, success, error, response, modified, - statusText = nativeStatusText; - - // Ignore repeat invocations - if ( completed ) { - return; - } - - completed = true; - - // Clear timeout if it exists - if ( timeoutTimer ) { - window.clearTimeout( timeoutTimer ); - } - - // Dereference transport for early garbage collection - // (no matter how long the jqXHR object will be used) - transport = undefined; - - // Cache response headers - responseHeadersString = headers || ""; - - // Set readyState - jqXHR.readyState = status > 0 ? 4 : 0; - - // Determine if successful - isSuccess = status >= 200 && status < 300 || status === 304; - - // Get response data - if ( responses ) { - response = ajaxHandleResponses( s, jqXHR, responses ); - } - - // Use a noop converter for missing script but not if jsonp - if ( !isSuccess && - jQuery.inArray( "script", s.dataTypes ) > -1 && - jQuery.inArray( "json", s.dataTypes ) < 0 ) { - s.converters[ "text script" ] = function() {}; - } - - // Convert no matter what (that way responseXXX fields are always set) - response = ajaxConvert( s, response, jqXHR, isSuccess ); - - // If successful, handle type chaining - if ( isSuccess ) { - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
- if ( s.ifModified ) { - modified = jqXHR.getResponseHeader( "Last-Modified" ); - if ( modified ) { - jQuery.lastModified[ cacheURL ] = modified; - } - modified = jqXHR.getResponseHeader( "etag" ); - if ( modified ) { - jQuery.etag[ cacheURL ] = modified; - } - } - - // if no content - if ( status === 204 || s.type === "HEAD" ) { - statusText = "nocontent"; - - // if not modified - } else if ( status === 304 ) { - statusText = "notmodified"; - - // If we have data, let's convert it - } else { - statusText = response.state; - success = response.data; - error = response.error; - isSuccess = !error; - } - } else { - - // Extract error from statusText and normalize for non-aborts - error = statusText; - if ( status || !statusText ) { - statusText = "error"; - if ( status < 0 ) { - status = 0; - } - } - } - - // Set data for the fake xhr object - jqXHR.status = status; - jqXHR.statusText = ( nativeStatusText || statusText ) + ""; - - // Success/Error - if ( isSuccess ) { - deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); - } else { - deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); - } - - // Status-dependent callbacks - jqXHR.statusCode( statusCode ); - statusCode = undefined; - - if ( fireGlobals ) { - globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", - [ jqXHR, s, isSuccess ? 
success : error ] ); - } - - // Complete - completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); - - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); - - // Handle the global AJAX counter - if ( !( --jQuery.active ) ) { - jQuery.event.trigger( "ajaxStop" ); - } - } - } - - return jqXHR; - }, - - getJSON: function( url, data, callback ) { - return jQuery.get( url, data, callback, "json" ); - }, - - getScript: function( url, callback ) { - return jQuery.get( url, undefined, callback, "script" ); - } -} ); - -jQuery.each( [ "get", "post" ], function( _i, method ) { - jQuery[ method ] = function( url, data, callback, type ) { - - // Shift arguments if data argument was omitted - if ( isFunction( data ) ) { - type = type || callback; - callback = data; - data = undefined; - } - - // The url can be an options object (which then must have .url) - return jQuery.ajax( jQuery.extend( { - url: url, - type: method, - dataType: type, - data: data, - success: callback - }, jQuery.isPlainObject( url ) && url ) ); - }; -} ); - -jQuery.ajaxPrefilter( function( s ) { - var i; - for ( i in s.headers ) { - if ( i.toLowerCase() === "content-type" ) { - s.contentType = s.headers[ i ] || ""; - } - } -} ); - - -jQuery._evalUrl = function( url, options, doc ) { - return jQuery.ajax( { - url: url, - - // Make this explicit, since user can override this through ajaxSetup (#11264) - type: "GET", - dataType: "script", - cache: true, - async: false, - global: false, - - // Only evaluate the response if it is successful (gh-4126) - // dataFilter is not invoked for failure responses, so using it instead - // of the default converter is kludgy but it works. 
- converters: { - "text script": function() {} - }, - dataFilter: function( response ) { - jQuery.globalEval( response, options, doc ); - } - } ); -}; - - -jQuery.fn.extend( { - wrapAll: function( html ) { - var wrap; - - if ( this[ 0 ] ) { - if ( isFunction( html ) ) { - html = html.call( this[ 0 ] ); - } - - // The elements to wrap the target around - wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); - - if ( this[ 0 ].parentNode ) { - wrap.insertBefore( this[ 0 ] ); - } - - wrap.map( function() { - var elem = this; - - while ( elem.firstElementChild ) { - elem = elem.firstElementChild; - } - - return elem; - } ).append( this ); - } - - return this; - }, - - wrapInner: function( html ) { - if ( isFunction( html ) ) { - return this.each( function( i ) { - jQuery( this ).wrapInner( html.call( this, i ) ); - } ); - } - - return this.each( function() { - var self = jQuery( this ), - contents = self.contents(); - - if ( contents.length ) { - contents.wrapAll( html ); - - } else { - self.append( html ); - } - } ); - }, - - wrap: function( html ) { - var htmlIsFunction = isFunction( html ); - - return this.each( function( i ) { - jQuery( this ).wrapAll( htmlIsFunction ? 
html.call( this, i ) : html ); - } ); - }, - - unwrap: function( selector ) { - this.parent( selector ).not( "body" ).each( function() { - jQuery( this ).replaceWith( this.childNodes ); - } ); - return this; - } -} ); - - -jQuery.expr.pseudos.hidden = function( elem ) { - return !jQuery.expr.pseudos.visible( elem ); -}; -jQuery.expr.pseudos.visible = function( elem ) { - return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); -}; - - - - -jQuery.ajaxSettings.xhr = function() { - try { - return new window.XMLHttpRequest(); - } catch ( e ) {} -}; - -var xhrSuccessStatus = { - - // File protocol always yields status code 0, assume 200 - 0: 200, - - // Support: IE <=9 only - // #1450: sometimes IE returns 1223 when it should be 204 - 1223: 204 - }, - xhrSupported = jQuery.ajaxSettings.xhr(); - -support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); -support.ajax = xhrSupported = !!xhrSupported; - -jQuery.ajaxTransport( function( options ) { - var callback, errorCallback; - - // Cross domain only allowed if supported through XMLHttpRequest - if ( support.cors || xhrSupported && !options.crossDomain ) { - return { - send: function( headers, complete ) { - var i, - xhr = options.xhr(); - - xhr.open( - options.type, - options.url, - options.async, - options.username, - options.password - ); - - // Apply custom fields if provided - if ( options.xhrFields ) { - for ( i in options.xhrFields ) { - xhr[ i ] = options.xhrFields[ i ]; - } - } - - // Override mime type if needed - if ( options.mimeType && xhr.overrideMimeType ) { - xhr.overrideMimeType( options.mimeType ); - } - - // X-Requested-With header - // For cross-domain requests, seeing as conditions for a preflight are - // akin to a jigsaw puzzle, we simply never set it to be sure. - // (it can always be set on a per-request basis or even using ajaxSetup) - // For same-domain requests, won't change header if already provided. 
- if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { - headers[ "X-Requested-With" ] = "XMLHttpRequest"; - } - - // Set headers - for ( i in headers ) { - xhr.setRequestHeader( i, headers[ i ] ); - } - - // Callback - callback = function( type ) { - return function() { - if ( callback ) { - callback = errorCallback = xhr.onload = - xhr.onerror = xhr.onabort = xhr.ontimeout = - xhr.onreadystatechange = null; - - if ( type === "abort" ) { - xhr.abort(); - } else if ( type === "error" ) { - - // Support: IE <=9 only - // On a manual native abort, IE9 throws - // errors on any property access that is not readyState - if ( typeof xhr.status !== "number" ) { - complete( 0, "error" ); - } else { - complete( - - // File: protocol always yields status 0; see #8605, #14207 - xhr.status, - xhr.statusText - ); - } - } else { - complete( - xhrSuccessStatus[ xhr.status ] || xhr.status, - xhr.statusText, - - // Support: IE <=9 only - // IE9 has no XHR2 but throws on binary (trac-11426) - // For XHR2 non-text, let the caller handle it (gh-2498) - ( xhr.responseType || "text" ) !== "text" || - typeof xhr.responseText !== "string" ? 
- { binary: xhr.response } : - { text: xhr.responseText }, - xhr.getAllResponseHeaders() - ); - } - } - }; - }; - - // Listen to events - xhr.onload = callback(); - errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); - - // Support: IE 9 only - // Use onreadystatechange to replace onabort - // to handle uncaught aborts - if ( xhr.onabort !== undefined ) { - xhr.onabort = errorCallback; - } else { - xhr.onreadystatechange = function() { - - // Check readyState before timeout as it changes - if ( xhr.readyState === 4 ) { - - // Allow onerror to be called first, - // but that will not handle a native abort - // Also, save errorCallback to a variable - // as xhr.onerror cannot be accessed - window.setTimeout( function() { - if ( callback ) { - errorCallback(); - } - } ); - } - }; - } - - // Create the abort callback - callback = callback( "abort" ); - - try { - - // Do send the request (this may raise an exception) - xhr.send( options.hasContent && options.data || null ); - } catch ( e ) { - - // #14683: Only rethrow if this hasn't been notified as an error yet - if ( callback ) { - throw e; - } - } - }, - - abort: function() { - if ( callback ) { - callback(); - } - } - }; - } -} ); - - - - -// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) -jQuery.ajaxPrefilter( function( s ) { - if ( s.crossDomain ) { - s.contents.script = false; - } -} ); - -// Install script dataType -jQuery.ajaxSetup( { - accepts: { - script: "text/javascript, application/javascript, " + - "application/ecmascript, application/x-ecmascript" - }, - contents: { - script: /\b(?:java|ecma)script\b/ - }, - converters: { - "text script": function( text ) { - jQuery.globalEval( text ); - return text; - } - } -} ); - -// Handle cache's special case and crossDomain -jQuery.ajaxPrefilter( "script", function( s ) { - if ( s.cache === undefined ) { - s.cache = false; - } - if ( s.crossDomain ) { - s.type = "GET"; - } -} ); - -// Bind script tag hack 
transport -jQuery.ajaxTransport( "script", function( s ) { - - // This transport only deals with cross domain or forced-by-attrs requests - if ( s.crossDomain || s.scriptAttrs ) { - var script, callback; - return { - send: function( _, complete ) { - script = jQuery( " + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    ebmstate: An R Package For Disease Progression Analysis Under Empirical Bayes Cox Models

    + + + +

    The new R package ebmstate is a package for multi-state survival +analysis. It is suitable for high-dimensional data and allows point +and interval estimation of relative transition hazards, cumulative +transition hazards and state occupation probabilities, under +clock-forward and clock-reset Cox models. Our package extends the +package mstate in a threefold manner: it transforms the Cox regression +model into an empirical Bayes model that can handle high-dimensional +data; it introduces an analytical, Fourier transform-based estimator +of state occupation probabilities for clock-reset models that is much +faster than the corresponding, simulation-based estimator in mstate; +and it replaces asymptotic confidence intervals meant for the +low-dimensional setting by non-parametric bootstrap confidence +intervals. Our package supports multi-state models of arbitrary +structure, but the estimators of state occupation probabilities are +valid for transition structures without cycles only. Once the input +data is in the required format, estimation is handled automatically. +The present paper includes a tutorial on how to use ebmstate to +estimate transition hazards and state occupation probabilities, as +well as a simulation study showing how it outperforms mstate in +higher-dimensional settings.

    +
    + + + +
    +
    +

    1 Introduction

    +

    Multi-state models based on transition hazard functions are often used +in the statistical analysis of longitudinal data, in particular disease +progression data (Hougaard 1999). The multi-state model framework is +particularly suitable to accommodate the growing level of detail of +modern clinical data: as long as a clinical history can be framed as a +random process which, at any moment in time, occupies one of a few +states, a multi-state model is applicable. Another strong point of this +framework is that it can incorporate a regression model, i.e., a set +of assumptions on how covariates, possibly time-dependent ones, affect +the risk of transitioning between any two states of the disease. Once +estimated, multi-state models with regression features allow the +stratification of patients according to their transition hazards. In +addition, it is possible, under some models, to generate disease outcome +predictions. These come in the form of state occupation probability +estimates, meaning estimates of the probability of being in each state +of the disease over a given time frame.

    +

    The survival analysis ‘task view’ of the Comprehensive R Archive Network +lists seven R packages that are able to fit general multi-state models +and, at the same time, feature some kind of regression model or +algorithm: flexsurv (Jackson 2016), +msm (Jackson 2011), +SemiMarkov +(Listwon and Saint-Pierre 2015), +survival +(Therneau 2015), +mstate (Wreede et al. 2010), +mboost +(Hothorn et al. 2020) – as extended by +gamboostMSM +(Reulen 2014) – and +penMSM +(Reulen 2015). All of them implement relative risk regression models +(as defined in Aalen et al. 2008 133). The only exceptions are +survival, which also +fits Aalen’s additive regression model (Aalen 1989), and flexsurv, +which also implements accelerated failure time models .

    +

    Recall that a Cox regression model is a semi-parametric model in which +every transition hazard is assumed to be the product of a baseline +hazard function of unspecified form (the non-parametric component) and +an exponential relative risk function (the parametric component) +(Aalen et al. 2008 133). Generally, the relative risk regression models +implemented in these packages are Cox regression models. However, some +models in flexsurv, as well as those in +msm and +SemiMarkov, also +restrict the baseline hazards to specific parametric families, i.e. they +are fully parametric. In +msm and +SemiMarkov, the +stronger assumptions regarding the functional form of the hazard are +leveraged to do away with other common assumptions: +SemiMarkov drops +the usual Markov property to implement homogeneous semi-Markov models; +msm is suitable for panel +data, i.e., data in which the state of each individual is known only at +a finite series of times.

    +

    Packages penMSM and +gamboostMSM are +the best suited to deal with higher-dimensional covariate data. The +first of these packages relies on a structured fusion lasso method, +while the second implements (jointly with +mboost) a boosting +algorithm. Both methods induce sparsity in the number of non-zero +covariate effects, as well as equality among the different transition +effects of each covariate, and are thus especially useful to reduce +complicated multi-state models to more interpretable ones. The remaining +packages assume standard, fixed effects relative risk regression models +and do not include regularisation or variable selection features.

    +

    It is also illustrative to order the seven packages mentioned according +to how extensive their analysis workflow is. Packages +SemiMarkov and +penMSM are intended for +the estimation of relative transition hazards only (i.e., for estimating +the impact of covariates on each transition hazard). With the package +mboost (as extended by +gamboostMSM) it is +also possible to estimate the baseline transition hazards. Finally, a +more complete workflow including estimates of both relative and +cumulative transition hazards, as well as state occupation +probabilities, is implemented in flexsurv, +msm and +mstate, and has been +under implementation in +survival (version 3.0 +or later).

    +

    The present paper provides an introduction to +ebmstate, a new R +package for multi-state survival analysis available for download on the +Comprehensive R Archive Network (CRAN). The main goal of +ebmstate is to +provide an analysis framework for the Cox model that performs better +with higher-dimensional covariate data and is also complete, in the +sense of being able to generate point and interval estimates of relative +transition hazards, cumulative transition hazards and state occupation +probabilities, both under clock-forward and clock-reset models. A +fundamental characteristic of +ebmstate is that it +re-implements and extends the analysis framework of +mstate, which is +complete in the sense just mentioned. In fact, to a large extent, our +package was built by importing, adapting and replacing functions from +the mstate package. +This not only eliminates redundancies, but also makes our package more +accessible to the numerous users of +mstate (the three +papers associated with +mstate have jointly +over 2000 citations).

    +

    To improve the performance of +mstate’s multi-state +Cox model when dealing with higher-dimensional covariate data, a +ridge-type regularisation feature was added. We allow the regression +coefficients of the model to be partitioned into groups, with each group +having its own Gaussian prior. A group can gather, for example, all the +regression coefficients for a given transition. Or, within a given +transition, coefficients can be grouped according to the covariate type +they refer to (for example, demographic, clinical or genomic type). The +resulting hierarchical Bayes model is empirical in that a full prior +elicitation is not required (the mean and variance hyper-parameters of +the Gaussian are estimated from the data). Model fitting relies on the +iterative algorithm introduced by Schall (1991), which typically converges +after a small number of steps. A simulation study showing that Schall’s +algorithm performance compares well with that of other algorithms for +ridge penalty optimisation, including one based on cross-validation, can +be found in Perperoglou (2014).

    +

    The asymptotic confidence intervals generated by +mstate are applicable +when the number of observations is much larger than the number of +parameters to be estimated (see section 3.3 +below). To preserve the completeness of +mstate’s framework in +higher-dimensional settings, we therefore implemented non-parametric +bootstrap intervals of regression coefficients, cumulative transition +hazards and state occupation probabilities.

    +

    The high computational cost implied by the non-parametric bootstrap +motivated a third extension to +mstate. We developed an +estimator of state occupation probabilities under clock-reset Cox models +that is based on a convolution argument (as in Spitoni et al. 2012) and the +Fast Fourier transform (FFT). At present, the estimation of such +probabilities for clock-forward Cox models can be carried out using the +efficient, product-limit based algorithm available in +mstate. However, for +clock-reset Cox models, only a simulation-based estimator is available +in this package (see also the flexsurv package for a similar, +simulation-based estimator). The FFT estimator in +ebmstate was +conceived as a faster alternative to this simulation-based estimator, +but its scope is currently restricted to multi-state models with +transition structures that have no cycles, i.e. in which a transition +between two states is either not possible or follows a unique sequence +of states. Figure 1 provides a short +graphical summary of +ebmstate, with the +main inputs – a genomic-clinical data set and an empirical Bayes +multi-state Cox model – and the main outputs – the estimates of +relative hazards and state occupation probabilities (cumulative +transition hazards are omitted).

    +

    As already mentioned, our empirical Bayes method improves estimator +performance in models with larger numbers of covariates (see section +4 on estimator performance). Also, as a +ridge-type regression method, it can be used as an alternative to the +lasso method of penMSM +in two particular cases: when the levels of correlation between +covariates are high enough to compromise the stability of lasso-based +covariate selection; or simply to improve prediction accuracy when +interpretability is not essential and the number of covariates is not +greater than the number of observations (Zou and Hastie 2005). In addition, and +perhaps more importantly, +ebmstate goes beyond +the regularised estimation of transition hazards offered by +penMSM and +gamboostMSM: point +and interval estimates of state occupation probabilities under the +regularised Cox model can also be computed.

    +

    2 Models

    +

    A multi-state Cox model is a continuous-time stochastic process with a +finite (and usually small) state space \(\mathcal{S}\). To better describe +the models implemented in +ebmstate, we define +the following notation. We let \(t\) denote the time since some initiating +event (usually diagnosis or disease onset). For +\(t \in \left[0, \infty\right)\), we define the following random +variables: \(X(t)\) represents the disease state of the patient, \(S(t)\) +the time spent in the current state, and \(\vec{Z}\left(t\right)\) the +value of a covariate vector. The realisation of each component of the +process \(\lbrace\vec{Z}\left(t\right)\rbrace\) is a step function, +possibly approximating the evolution in time of a continuous covariate. +In addition, \(\lbrace\vec{Z}\left(t\right)\rbrace\) is assumed +not-adapted to the filtration generated by +\(\lbrace X\left(t\right)\rbrace\) (an adapted covariate is one whose path +until \(t\) is known once \(\lbrace X \left(u\right)\rbrace\), \(u \leq t\), +is known). The transition hazard rate of a patient from state \(i\) to +state \(j\) (\(i\neq j\)) at time \(t\), conditional on the sojourn time and +the covariate vector, is defined as +\[\begin{aligned} +&\alpha_{ij}\left(t|\mathbf{z},s \right):=\lim_{h \downarrow 0}\frac{1}{h}\mathrm{P}\left[X(t+h)=j\,|\,X(t)=i,S(t)=s,\vec{Z}(t)=\mathbf{z} \right]\;, \;s\in \left[0,\infty\right)\;,\;t\in \left[s,\infty\right)\;. +\end{aligned}\] +Independent right-censoring and left-truncation are assumed throughout +(Aalen et al. 2008 57). The purpose of the present section is to give a (not +necessarily exhaustive) description of the scope of +mstate and +ebmstate with respect +to the multi-state Cox model. Using the terminology in de Wreede et al. 
(2011), a +Cox model is termed a ‘clock-reset’ model when +\[\begin{aligned} +\label{eq:clock_reset_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}, s\right)&=\lambda_{ij}^{(0)}\left(s\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad, +\end{aligned} \tag{1}\] +and it is termed a ‘clock-forward’ model when +\[\begin{aligned} +\label{eq:clock_forward_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}\right)&=\alpha_{ij}^{(0)}\left(t\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad. +\end{aligned} \tag{2}\] +In both cases, \(i,j \in \mathcal{S}\), with \(i\neq j\); +\(\boldsymbol{\beta}_{\scriptscriptstyle ij}\) is an unknown vector of +regression coefficient parameters, and both +\(\lambda^{\scriptscriptstyle (0)}_{ij}(\cdot)\) and +\(\alpha^{\scriptscriptstyle (0)}_{ij}(\cdot)\) are unknown (baseline +hazard) functions, non-negative on \(\mathbb{R}^{+}\). When, as in +equation (1), +\(\alpha_{ij}\left(t|\mathbf{z},s\right)\) is the same for all \(t\geq s\), +we simplify its notation to \(\lambda_{ij}\left(s|\mathbf{z}\right)\). As +can be seen from equations (1) and +(2), the ‘clock-reset’ and ‘clock-forward’ +models are models for how the transition hazard rates are affected by +time. In the former case, the only relevant time scale is the time \(s\) +spent in the current state, whereas in the latter only the time \(t\) +since the initiating event matters. While the ‘clock-forward’ model is +arguably the default one in multi-state survival analysis +(Andersen et al. 1993; Aalen et al. 2008), in some cases the ‘clock-reset’ model is +more appropriate. For example, in some forms of cancer, it can be +sensible to assume that the transition hazards from the state of +complete remission depend on the sojourn time, rather than on the time +since the initial diagnosis.

    +

    Relative transition hazards

    +

    The parametric component of the transition hazard from \(i\) to \(j\), +written +\(\exp\left[\boldsymbol{\beta}^{\intercal}_{ij} \,\mathbf{z}\right]\), is +termed the relative transition hazard. In +mstate and +ebmstate, estimating +the relative transition hazard amounts to estimating the regression +coefficient vector \(\boldsymbol{\beta}_{ij}\,\). In +mstate, these +parameters are assumed to be non-random. With +ebmstate, the +following prior distributions can be imposed.

    +

    Define \(\mathcal{P}\) as the set of all pairs of states between which a +direct transition is possible. Let +\(\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ij} \rbrace\), for all +\((i, j) \in \mathcal{P}\), be a partition of \(\boldsymbol \beta\), a +vector containing the regression coefficients for all direct transitions +allowed. Each \(\boldsymbol{\beta}_{\scriptscriptstyle ij}\) is further +partitioned into +\(\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ijk} \rbrace\), for +\(k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace\). In +ebmstate, the most +general model regarding the prior distribution of \(\boldsymbol{\beta}\) +makes two assumptions: a) the scalar components of \(\boldsymbol{\beta}\) +are independent and normally distributed; b) the scalar components of +\(\boldsymbol{\beta}_{\scriptscriptstyle i j k}\) have a common (and +undetermined) mean \(\mu_{\scriptscriptstyle ijk}\) and a common (and also +undetermined) variance \(\sigma^{2}_{\scriptscriptstyle ijk}\;\).

    +

    The purpose of the framework just described is to allow the clustering +of covariate effects according to their prior distribution. If there is +no prior knowledge about how this clustering should be done, a single +Gaussian prior can be imposed on all regression coefficients at once. If +prior knowledge allows the grouping of effects according to the +transition they refer to, a different Gaussian prior can be assigned to +the coefficients of each transition. Even within each transition, +different groups of coefficients can be assigned different prior +distributions. In the analysis of biomedical data, for example, there +can be a split between genes which are known to affect the transition +hazard, and other genes whose effect is unknown.

    +

    Cumulative transition hazard functions

    +

    Our package imports from +mstate a Breslow +estimator of two types of cumulative transition hazard: one on a global +time scale, defined as +\[\begin{aligned} +\mathrm{A}_{ij}\left(t\,|\,\mathbf{z}\right)&:=\int_{0}^{t}\alpha_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad, +\end{aligned}\] +and another on a sojourn time scale, defined as +\[\begin{aligned} +&\Lambda_{ij}(s\,|\,\mathbf{z}):=\int_{0}^{s}\lambda_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad. +\end{aligned}\] +Note that, in either case, the covariate vector is assumed to remain +constant.

    +

    State occupation probabilities

    +

    By state occupation probability, we mean the probability that a patient +in state \(i\) at time \(0\) finds herself in state \(j\) at time \(t\). The +estimates of these probabilities can be seen as functionals of the +estimated cumulative transition hazard functions. For this reason, the +restriction to models with time-fixed covariates, which was just seen to +be applicable to the estimators of cumulative transition hazards, +carries over to the estimation of state occupation probabilities.

    +

    When conditioning on a given covariate path (time-fixed or not), state occupation probability estimates are not valid unless the covariates are external (Aalen et al. 2008 142; Cortese and Andersen 2010). Note that a vector of covariates \(\lbrace \vec{Z}(u)\rbrace_{u\geq 0}\) is said to be external if, for all \(t \in \left[0,\infty\right)\), each transition hazard at \(t\), conditional on \(\vec{Z}(t)\), is independent of \(\lbrace \vec{Z}(u)\rbrace_{u>t}\) (i.e. independent of the future path of the covariate). Otherwise, it is said to be internal (for more details on the distinction between internal and external covariates, see Kalbfleisch and Prentice 2002 6). When one does not wish (or it is not possible, due to \(\vec{Z}\) being internal) to condition on the future path of the covariate process, the uncertainty introduced by this process needs to be accounted for. This can be done by extending the state space of the disease process, so that it includes information on both the disease and the covariate process (Andersen et al. 1993 170). For example, to include a dichotomous transplant covariate (an internal covariate) in a simple survival model with two states, the state space is expanded from \(\lbrace\)alive, deceased\(\rbrace\) to \(\lbrace\)alive without transplant, alive with transplant, deceased\(\rbrace\). One can then either assume that transplanted patients have a different baseline death hazard or, more simply, that transplantation scales the death hazard by some constant \(\exp \left( \gamma\right)\). A similar but more detailed example can be found in Wreede et al. (2010 2.3.2, ‘model 3’).

    +

    3 Estimation

    +

    In the current section, we present the estimation methods underlying the +extensions of mstate +implemented in +ebmstate. +

    +

    Relative and cumulative hazard functions

    +

    Let \(\boldsymbol{\mu}_{\scriptscriptstyle ij}\), with +\(\left(i,j\right) \in \mathcal{P}\) (the set of direct transitions +allowed), denote a vector whose scalar components are the parameters +\(\mu_{\scriptscriptstyle ijk}\), +\(k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace\). +Similarly, let \(\boldsymbol{\sigma}^{2}_{\scriptscriptstyle ij}\) be +composed of the parameters +\(\left\lbrace \sigma^{2}_{\scriptscriptstyle ijk}\right\rbrace_{k}\). The +estimation of \(\boldsymbol{\beta}\), +\(\boldsymbol{\mu}:=\lbrace\boldsymbol{\mu}_{\scriptscriptstyle{ij}}\rbrace\) +and +\(\boldsymbol{\sigma}^2:=\lbrace\boldsymbol{\sigma}^2_{\scriptscriptstyle ij }\rbrace\) +relies on the restricted maximum-likelihood (REML) type algorithm +described in (Perperoglou 2014), and introduced by (Schall 1991). The +resulting estimate of \(\boldsymbol{\beta}\) is a maximum a posteriori +estimate; the estimates of \(\boldsymbol{\mu}\) and +\(\boldsymbol{\sigma}^{2}\) are empirical Bayes estimates. In +ebmstate, the +estimator based on this algorithm is implemented in the function +CoxRFX . The results of a simulation study showing its consistency are +included in the Supporting Scripts and Data (file ESM_1.html, section +1).

    +

    The computation of cumulative hazard rates for given covariate values +and an estimated regression coefficient vector relies on the function +msfit_generic, which is essentially a wrapper for the function +mstate::msfit (see section 5.3). +For the mathematical details of this computation, we refer therefore the +reader to Wreede et al. (2010).

    +

    State occupation probabilities

    +

    The package mstate +includes a simulation-based estimator that can take as input either +\(\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)\) or +\(\hat{\Lambda}_{ij}\left(\cdot\,|\,\mathbf{z}\right)\) to generate +estimates of state occupation probabilities under the clock-forward or +the clock-reset model respectively. Another available estimator, an +Aalen-Johansen-type estimator based on product integration, is far more +efficient computationally and takes as input +\(\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)\) only. As the +scope of this estimator has been restricted to clock-forward Cox models +(Andersen et al. 1993; Aalen et al. 2008), in our package we implemented a +convolution-based estimator as a computationally efficient alternative +(for models with a transition structure that has no cycles).

    +

    For convenience, let the sequence of states from \(0\) to \(n\) have the +labels \(0,1,2,...,n\,\), where \(0\) is the initial state by definition, +and \(n\) is some state that might (eventually) be reached by the process. +In addition, define \(X_{0}:=X(0)\) and \(T_{0}:=0\), and let +\(\left(X_{i},T_{i}\right)\), \(i \in \left\lbrace 1,2,... \right\rbrace\), +denote the marked point process associated with +\(\left\lbrace X(t)\right\rbrace\), so that \(T_{i}\) is the time of the +\(i^{th}\) transition and \(X_{i}\) is the state the process jumps to at +time \(T_{i}\). The inter-transition times are denoted by +\(\tau_{ij}:=T_{j}-T_{i}\), for \(j>i\). We can write the probability that a +patient in state \(0\) at time \(0\) finds herself in state \(n\) at time \(t\), +conditional on \(\vec{Z}(u)=\mathbf{z}\) for all \(u \geq 0\), as +\[\begin{aligned} +&\mathrm{P}\left[X(t)=n\,|\,X(0)=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right]\\ +&\,=\mathrm{P}\left[X_{n}=n,\tau_{0,n} < t,\tau_{n,n+1}\geq t- \tau_{0,n} |X_{0}=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right] \,.\nonumber +\end{aligned}\]

    +

    Recall that \(\lambda_{i,i+1}\left(s\,|\, \mathbf{z}\right)\) denotes the +hazard rate of a transition to state \(i+1\) at time \(s\) since arrival in +state \(i\), for a patient that has covariate vector \(\mathbf{z}\). The +cumulative hazard for the same transition between sojourn times \(0\) and +\(s\), if the patient’s covariate vector remains constant at \(\mathbf{z}\), +is represented by +\(\Lambda_{i,i+1}\left(s \,|\, \mathbf{z}\right):=\int_{0}^{s}\lambda_{i,i+1}\left(x\,|\, \mathbf{z}\right)\mathrm{d}x\). +Similarly, we let \(\lambda_{i}\left(s\,|\, \mathbf{z}\right)\) represent +the hazard rate of going to any state that can be reached directly from +\(i\), at time \(s\) since arrival in state \(i\), for a patient with +covariate vector \(\mathbf{z}\). The cumulative hazard for the same event +between sojourn times \(0\) and \(s\), if the patient’s covariate vector +remains constant at \(\mathbf{z}\), is represented by +\(\Lambda_{i}\left(s \,|\, \mathbf{z}\right)\). The expressions +\(\hat{\Lambda}_{i}\left(s \,|\, \mathbf{z}\right)\) and +\(\hat{\Lambda}_{i,i+1}\left(s \,|\, \mathbf{z}\right)\) denote the +Breslow estimators of the cumulative hazards just defined. In what +follows, all references to probabilities, hazard rates and cumulative +hazards are to be understood as conditional on +\(\vec{Z}(u)=\mathbf{z}\,\), for \(u\geq 0\): this condition is omitted to +simplify the notation.

    +

    In ebmstate, the +function probtrans_ebmstate generates a set of state occupation +probability estimates at equally spaced time points: +\[\begin{aligned} +&\left\lbrace \hat{p}_{0n}\left(k\right)\right\rbrace_{k} :=\left\lbrace \hat{\mathrm{P}}\left[X_{n}=n,\tau_{0,n} < t_{k},\tau_{n,n+1}\geq t_{k}- \tau_{0,n}\,|\, X_{0}=0 \right] \right\rbrace_{k}\;,\; k=0,1,2,...,K\,;\, t_{k}=k\times \Delta t \;. +\end{aligned}\] +The number \(K\) of time intervals is \(10,000\) by default and \(t_{K}\) is a +parameter set by the user. Defining the functions +\[\begin{aligned} +q_{ij}\left(k\right):=\mathrm{P}\left[X_{j}=j, \tau_{ij}\in \left[t_{k},t_{k+1}\right)\,|\,X_{i}=i\right] +\end{aligned}\] +and +\[\begin{aligned} +r_{i}\left(k\right):=\mathrm{P}\left[\tau_{i,i+1} > t_{k} \,|\,X_{i}=i\right]\;, +\end{aligned}\] +and the finite difference +\[\begin{aligned} + \Delta \hat{\Lambda}_{i,i+1}\left(t_{k}\right):=\hat{\Lambda}_{i,i+1}\left(t_{k+1}\right)-\hat{\Lambda}_{i,i+1}\left(t_{k}\right)\;, +\end{aligned}\] +the algorithm behind probtrans_ebmstate can be described as follows:

    +
      +
    1. For \(j=1,2,...,n\), compute
\[\begin{aligned}
\label{eq:est1}
\hat{q}_{j-1,j}\left(k\right)&:=\exp \left[-\hat{\Lambda}_{j-1}\left(t_{k}\right)\right]\Delta \hat{\Lambda}_{j-1,j}\left(t_{k}\right)&&
\end{aligned} \tag{3}\]
for \(k=0,1,...,K-1\).

    2. For \(j=2,3,...,n\), compute (iteratively)
\[\begin{aligned}
\label{eq:est2}
\hat{q}_{0j}\left(k\right):=&\sum_{l=0}^{k-1} \hat{q}_{j-1,j}\left(k-l-1\right) \hat{q}_{0,j-1} \left(l\right) &&
\end{aligned} \tag{4}\]
for \(k=0,1,...,K-1\).

    3. Finally, use the estimates obtained in the last iteration of step 2 to compute
\[\begin{aligned}
\label{eq:est4}
\hat{p}_{0n}\left(k\right):=&\sum_{l=0}^{k-1} \hat{r}_{n}\left(k-l-1\right) \hat{q}_{0,n}\left(l\right)&&
\end{aligned} \tag{5}\]
for \(k=0,1,...,K\), where \(\hat{r}_{n}\left(\cdot\right):=\exp \left[-\hat{\Lambda}_{n}\left(t_{\scriptscriptstyle\left(\cdot\right)}\right)\right]\,\).
    +

    Substituting \(:=\) for \(\approx\) and removing the ‘hats’ in definitions +(3) to (5), we get the approximate equalities that +justify the algorithm. These approximate equalities are derived in the +Supporting Scripts and Data (file ESM_1.html, section 2).

    +

    Apart from probtrans_ebmstate, the function probtrans_fft is also +based on the convolution argument just shown. However, this function +makes use of the convolution theorem, i.e., of the fact that the +convolution of two (vectorized) functions in the time domain is +equivalent to a pointwise product of the same functions in the frequency +domain. The estimation of state occupation probabilities is thus +simplified to +\[\begin{aligned} +\hat{p}_{0n}:=&\mathcal{F}^{\scriptscriptstyle -1}\left\lbrace \hat{\mathrm q}_{0,1} \boldsymbol{\cdot} \hat{\mathrm q}_{1,2}\boldsymbol{\cdot} \mathrm{...}\boldsymbol{\cdot}\hat{\mathrm q}_{n-1,n}\boldsymbol \cdot \hat{\mathrm r}_{n}\right\rbrace\;, +\end{aligned}\] +where \(\mathcal{F}\) denotes the discrete Fourier transform, +\(\hat{\mathrm{q}}_{j-1,j}:=\mathcal{F}(\hat{q}_{j-1,j})\) and +\(\hat{\mathrm{r}}_{n}:=\mathcal{F}(\hat{r}_{n})\). Conversion to and from +the frequency domain is carried out using the fast Fourier transform +algorithm implemented in the fft function of the base package stats. +The Supporting Scripts and Data contain a short simulation study +checking that state occupation probabilities can be accurately estimated +with probtrans_ebmstate and probtrans_fft (see file ESM_1.html, +sections 3 and 4).

    +

    Figure 2 consists of a grid of plots with estimated curves of state occupation probabilities. It compares, in terms of speed and accuracy, the estimator in probtrans_fft with an estimator in mstate::mssample that has the same target, but is simulation-based. Each plot contains a black curve and a superimposed red curve. The red curves in any given column of the grid are all based on the same run of a function: columns 1 to 3 are based on runs of mssample with the number of samples \(n\) equal to \(100\), \(1000\) and \(10,000\) respectively, while column 4 is based on a run of probtrans_fft. Each column in the grid reproduces the same 4 black curves. These are based on a single run of mssample with \(n=100,000\) and serve as benchmark. All function runs are based on the same input: a set of cumulative transition hazard estimates for a multi-state model with the ‘linear’ transition structure given in the leftmost diagram of figure 3. Plots in a given row refer to the same state of the model. The running times on top of each column refer to the estimation of the red curves. The main conclusion suggested by this analysis of simulated data is that probtrans_fft is as accurate as mssample with \(n=10,000\), but it is almost 100 times faster (columns 3 and 4). With \(n=1000\), mssample achieves a good approximation to the true state occupation probabilities, but is still roughly 9 times slower. The details on how figure 2 and its underlying data were generated are given in the Supporting Scripts and Data (file ESM_1.html, section 5).

    +

    Interval estimation

    +

    Under any model estimated by ebmstate – as in general under a Bayesian model –, one can, if the sample size is large enough, approximate the posterior by a normal distribution with mean equal to the maximum a posteriori estimate and covariance matrix equal to the inverse of the generalised observed Fisher information (see, for example, Gelman et al. 2014 83–84). This approximation has first-order accuracy and is thus outperformed by Laplace’s method, which has second-order accuracy (Carlin and Louis 2009 110–111). However, as Carlin and Louis (2009 112) observe, “for moderate- to high-dimensional \(\boldsymbol\theta\) (say, bigger than 10), Laplace’s method will rarely be of sufficient accuracy [...]”. Carlin and Louis (2009 244–251) also describe three methods of interval estimation in empirical Bayes settings, but all of them are designed for fully parametric models. These reasons, along with the fact that regularised methods such as the one implemented in ebmstate are typically used to fit models with more than a dozen covariates, led us to choose the non-parametric bootstrap as the interval estimation method in ebmstate. Note that the non-parametric bootstrap can be given a Bayesian interpretation. Its interval estimates are approximately the same as those of a Bayesian model that assumes: a) a multinomial distribution for the data; and b) a non-informative Dirichlet prior distribution for the probability assigned to each category in the multinomial distribution. This is a specific case of the so-called Bayesian bootstrap (Hastie et al. 2009 272). Further research is needed to determine the theoretical properties of the non-parametric bootstrap in the present setting, but this falls beyond the scope of the present paper. Interval estimates of regression coefficients, cumulative hazards and state occupation probabilities are implemented in the function boot_ebmstate.

    +

    4 Estimator performance

    +

    It is a well-documented fact in the statistical literature that standard +least-squares or maximum-likelihood estimators can often be improved by +regularisation or shrinkage (see, for example, Samworth 2012). This +improvement comes about when the model dimensionality is high enough +that the bias introduced by regularisation is outweighed by the +reduction in the estimator variance. In the current setting, one might +therefore ask: what kind of dimensionality does a semi-parametric, +multi-state Cox model need to have to be outperformed by its empirical +Bayes counterpart? A simulation study we carried out offers a tentative +answer to this question, by comparing estimators under both Cox models +for an increasing number of covariates. The study also features a third +method, based on a fully non-parametric model, as a null model method. +This was included to give an idea of how many covariates the empirical +Bayes model can deal with before it becomes no better than a simple +non-regressive model.

    +

    Simulation setup

    +

    We assessed the performance of all estimators defined by the tuple \(\left[a,m, G, n,p(n)\right]\), where \(a\in \lbrace\)regression coefficients, relative hazards, state occupation probabilities\(\rbrace\) is the target of estimation, \(m\in \lbrace\)standard Cox, empirical Bayes Cox, null\(\rbrace\) is the assumed hazard model, \(G \in \lbrace\)linear, competing risks, ‘m’ structure\(\rbrace\) is the transition structure of the model (illustrated in figure 3) and \(n\in \lbrace 100,1000\rbrace\) is the number of patients/disease histories in the training data set; the variable \(p\) denotes the number of coefficients/covariates per transition in the true model and its range depends on \(n\): \(p\left(100\right) \in \lbrace 10,40,70,100 \rbrace\) whereas \(p\left(1000\right) \in \lbrace 10,100,200,300,400,500\rbrace\). By ‘relative hazards’ and ‘state occupation probabilities’, we mean here the relative transition hazards of an out-of-sample patient, and her state occupation probabilities at 7 chosen time points. We generated a batch of 300 independent absolute error observations (‘NA’ estimates included) for each estimator, where each observation is recorded after training the estimator on a newly simulated data set. Each boxplot in figures 6 (\(n=100\)) and 7 (\(n=1000\)) is based on one of these batches. As all estimators are vector estimators, each absolute error is actually an average absolute error, where the average is taken over the components of the vector.

    +

    All training data sets were simulated from clock-reset Cox models. Apart +from \(G\) (the model transition structure), \(n\) and \(p\), also the true +baseline hazards are held fixed within each batch of 300 training data +sets. The coefficient vectors used in the simulation are always +non-sparse and are scaled by \(\sqrt{\frac{10}{p}}\) to keep the +log-hazard variance constant when the dimensionality grows. All +covariates are dichotomous and mutually independent. To compute the +coefficient errors for the non-parametric (null) model method, we think +of it as a degenerate Cox model in which all regression coefficient +estimates are fixed at zero. The estimation of regression coefficients +under the standard Cox and the empirical Bayes Cox models was performed +with survival::coxph and ebmstate::CoxRFX respectively; the +estimation of state occupation probabilities is based on +mstate::probtrans for the null model and on ebmstate::probtrans_fft +for both the standard Cox and the empirical Bayes Cox models.

    +

    The reason we did not consider simulation scenarios with more than 500 covariates per transition, in data sets of 1000 patients, was simply computational cost. For example, generating the data and error observations for the scenario with \(n=1000\), \(p=100\) and \(G=\)’m’ structure took less than one hour using 20 CPU cores in parallel; the same scenario but with \(p=500\) took 6.5 days using 25 CPU cores. More details about the simulation setup can be found in the Supporting Scripts and Data (file ESM_1.html, section 6, subsection ‘sample script’).

    +

    Missing values

    +

    Whenever an estimator was able to compute a valid estimate of its target +for each training data set, i.e., when it did not return any ‘NA’ +estimates, its boxplots are based on 300 valid error observations. This +was always the case with non-parametric estimators: the estimates of +regression coefficients and relative hazards of this type of estimators +are trivial (fixed at zero and one respectively) and hence it is also +straightforward to compute absolute errors. It also happened that +non-parametric estimators of state occupation probabilities had no ‘NA’ +estimates (see file ESM_1.html, section 6, figure 6.3, in the Supporting +Scripts and Data). The situation was similar for the empirical Bayes Cox +model estimators, which showed no more than 5\(\%\) missing estimates in +any of the simulation scenarios studied (ibid., figures 6.1 and 6.2). +However, for the standard Cox model ones, the number of ‘NA’ estimates +depends to a large extent on the number of patients in the data set, as +well as on the dimensionality and transition structure of the model +(figures 4 and +5). In data sets of 100 +patients, it fares well in models with fewer than 10 covariates per +transition, or in models with up to 40 covariates, if the transition +structure is linear. Otherwise its failure rates range from roughly +25\(\%\) to nearly 100\(\%\). In data sets of 1000 patients, the proportion +of ‘NA’ estimates is never above 10\(\%\), if the transition structure is +linear, but it can climb above 60\(\%\) for other transition structures.

    +

    Comparison of estimators

    +

    With respect to the performance of the three methods studied, the +boxplots in figures +6 and +7 suggest the +following conclusions:

    +
      +
    • As \(p/n\) grows, the empirical Bayes estimators quickly outperform +the standard Cox model ones. They already fare substantially better +at \(p/n=0.1\) for both \(n=100\) and \(n=1000\) and for all estimation +targets. At the same time, the relative performance of the empirical +Bayes method with respect to the null model one decreases. At +\(p/n=0.5\), the difference between these two methods is already +rather small for all simulation scenarios.

    • +
    • The relative performance of the empirical Bayes method with respect +to the null method decreases as the number of co-occurring +transition hazards in the model grows. All other things equal, the +empirical Bayes method has the best performance under the ‘linear’ +structure model, which has no competing transitions; it performs +less well under the ‘m’ structure transition model, where two +transition hazards can co-occur; and has the worst relative +performance under the ‘competing risks’ model, where three +transition hazards co-occur. This trend is clearer for \(n=100\) +(figure 6) +but can also be detected in the relative hazard errors for \(n=1000\) +(figure 7). +In any case, the empirical Bayes method seems to be far more robust +than the standard Cox model against increases in the number of +co-occurring transition hazards.

    • +
    • Having as target the regression coefficients or the state occupation +probabilities, instead of relative hazards, makes the empirical +Bayes method better in comparison to the null method. In fact, as +\(p/n\) grows, the empirical Bayes method is never outperformed by the +null method except in the estimation of relative hazards.

    • +
    +

    5 Survival analysis workflow

    +

    The features of mstate were illustrated in de Wreede et al. (2010) using a simple +workflow. The starting point of this workflow is a data set in ‘long +format’. Such a data set can be fed into survival::coxph to obtain +estimates of the regression coefficients of a multi-state Cox model. The +resulting model fit object can be passed on to mstate::msfit, along +with a vector of covariates of a particular patient, to get personalised +estimates of the cumulative hazard functions. Finally, state occupation +probabilities for the same patient can be estimated if the object +created by mstate::msfit is fed into mstate::probtrans. In this +section, we describe how +ebmstate extends the +scope of this workflow, i.e., how it uses the packages +survival and +mstate to generate +estimates under a multi-state empirical Bayes Cox model. A diagram +summarising the extension is shown in figure 8. In +the 5.5 subsection, we give some +recommendations on how to assess and compare models, but for more +detailed tutorials on how to analyse multi-state data using models +defined by transition hazards, we refer the reader to +Putter et al. (2007) and Putter (2011).

    +

    The main steps of the +ebmstate workflow are +here illustrated using a data set of patients with myelodysplastic +syndromes (MDS) which has been described and studied in +Papaemmanuil et al. (2013). A myelodysplastic syndrome is a form of leukemia in +which the bone marrow is not able to produce enough mature blood cells, +and which sometimes develops into a cancer of white blood cells with a +quick and aggressive progression, i.e., into acute myeloid leukemia +(AML). Figure 9a illustrates an illness-death +type model for MDS patients and also gives a breakdown of the number of +transition events. The conversion to a model with a transition structure +that has no cycles (i.e., that can be handled by our convolution-based +estimators) is shown in figure 9b. The data +set used for model estimation, obtained after a number of pre-processing +steps, contains the disease history of 576 patients, as well as +measurements on 30 covariates. Of these 30 covariates, 11 are mutation +covariates and the remaining are clinical or demographic (see figure +9c). The running time for the estimation of +relative transition hazards does not exceed 10 seconds on a standard +laptop computer. The same holds for the estimation of cumulative +transition hazards or state occupation probabilities for a given +patient. The complete R code underlying the data analysis in the current +section can be found in the Supporting Scripts and Data (file +ESM_2.html). To run only the R snippets shown below and reproduce +their results, the best option is to use the R script in file ESM_3.R of +the Supporting Scripts and Data.

    +

    Input data

    +

    Table 1 shows a fragment of the MDS data +set. The data is in ‘long format’, which means that each row refers to a +period of risk for a given transition and patient. For example, row \(i\) +tells us that, at time Tstart[i], patient id[i] entered state +from[i], and thereby began to be at risk for transition trans[i], +i.e., at risk of going from state from[i] to state to[i]. If the +first transition of patient id[i] after time Tstart[i] occurs before +the last follow-up time for this patient, Tstop[i] records the time of +this transition (regardless of whether the patient moved to state +to[i] or not). Otherwise, Tstop[i] is set to the last follow-up +time. The value of status[i] is set to 1 if and only if the first +transition of patient id[i] after Tstart[i] is to state to[i] and +occurs before the last follow-up (otherwise it is set to 0). The value +of time[i] is defined simply as Tstop[i]\(-\)Tstart[i], and +strata[i] is the stratum of the baseline hazard for transition +trans[i] (more about this variable in the following section). For x +\(\in \left\lbrace \right.\) ASXL1, DNMT3A, +\(\dots \left. \right \rbrace\), x[i] denotes the level of covariate x +between Tstart[i] and Tstop[i] in patient id[i]. (In the MDS data +set, we assume that the relative hazard of a patient is determined by +her covariate vector at \(t=0\), i.e., we assume all covariates to be +time-fixed.) If a patient enters a new state, and this state +communicates directly with \(n\) other states, then, as long as the +patient actually spends time in the new state (i.e. the time of +transition is not the same as the last follow-up time), \(n\) rows must be +added to the data set, with each row corresponding to a different +possible transition.

    +

    From table 1, we know that patient 77 +entered state 1 (‘MDS’) at time 0 and remained in this state until time +2029, when she moved to state 3 (‘death before AML’). There are no rows +to describe the evolution of patient 77 after entering state 3, as this +state is an absorbing state. As to patient 78, she remained in state 1 +until time 332, and moved from there to state 2 (‘AML’). She lived with +AML for 1117 days and moved to state 4 (‘death after AML’) at time 1449.

    +
    +
    id  from to trans Tstart Tstop time status  strata ASXL1 DNMT3A [...]  
    +77     1  2     1      0  2029 2029      0       1     0      0    .
    +77     1  3     2      0  2029 2029      1       2     0      0    .
    +78     1  2     1      0   332  332      1       1     1      0    .
    +78     1  3     2      0   332  332      0       2     1      0    .
    +78     2  4     3    332  1449 1117      1       3     1      0    .
    +

    Table 1: A 5-row fragment of the MDS data set (in long format)

    +
    +

    Fitting an empirical Bayes Cox model

    +

    Once the data is in ‘long format’, the estimation of an empirical Bayes +model can be carried out using the function CoxRFX. A simple example +of the first argument of CoxRFX, denoted ‘Z’, is a data frame +gathering the trans, strata and covariate columns of the data in +long format:

    +
    outcome_covs <- c("id","from","to","trans","Tstart","Tstop","time","status",
    +                  "strata")
    +Z <- mstate_data[!names(mstate_data) %in% outcome_covs]
    +#(`mstate_data' has the data in long format) 
    +

    The strata column determines which baseline hazard functions are +assumed to be equal. In table 1, each +transition is assumed to have a (potentially) different baseline hazard. +The model’s assumptions regarding how covariates affect the hazard are +reflected in the format of the covariate columns of Z. When the Z +argument is the one created in the previous block of code, CoxRFX +returns a single regression coefficient estimate for each covariate. In +other words, the impact of any covariate is assumed to be the same for +every transition.

    +

    There are however ways of relaxing this assumption. One can replace the +ASXL1 column in Z (or any other covariate column) by several +‘type-specific’ ASXL1 columns: the ASXL1 column specific for type +\(i\) would show the mutation status of ASXL1 in rows belonging to +transition of type \(i\), and show zero in all other rows. This would +force CoxRFX to estimate a (potentially) different ASXL1 coefficient +for each transition type. This process of covariate expansion by type +can be based on any partition of the set of transitions. When each type +corresponds to a single transition, we refer to it simply as ‘covariate +expansion by transition’. The output shown below illustrates the effect +of expanding the covariates in ‘mstate_data’ by transition.

    +
    # Columns `id' and `trans' from `mstate_data' together with the first
    +# two expanded covariates (patients 77 and 78):
    +    id trans ASXL1.1 ASXL1.2 ASXL1.3 DNMT3A.1 DNMT3A.2 DNMT3A.3  [...]  
    +    77     1       0       0       0        0        0        0     .
    +    77     2       0       0       0        0        0        0     .
    +    78     1       1       0       0        0        0        0     .
    +    78     2       0       1       0        0        0        0     .
    +    78     3       0       0       1        0        0        0     .
    +

    The example code given below shows how to use +mstate to expand +covariates by transition and how to create a Z argument that makes +CoxRFX estimate a regression coefficient for each covariate for +transitions 1 and 2, and assume a fully non-parametric hazard for +transition 3.

    +
    # To expand covariates by transition using mstate::expand.covs, 
    +# first set the class of `mstate_data' as
    +class(mstate_data) <- c("data.frame","msdata")
    +
    +# then add the transition matrix as attribute:
    +attr(mstate_data,"trans") <- tmat 
    +#(`tmat' is the output of mstate::transMat)
    +
    +# Expand covariates by transition:
    +covariates_expanded_123 <- mstate::expand.covs(
    +    mstate_data,
    +    covs = names(mstate_data)[! names(mstate_data) %in% outcome_covs],
    +    append = F
    +)
    +
    +# remove all covariates for transition 3 from `covariates_expanded_123'
    +# to fit a fully non-parametric model on this transition:
    +covariates_expanded_12 <- covariates_expanded_123[
    +    !grepl(".3",names(covariates_expanded_123),fixed = T)
    +]
    +
    +#argument `Z' of coxrfx
    +Z_12 <- data.frame(covariates_expanded_12,strata = mstate_data$trans,
    +                   trans = mstate_data$trans)
    +

    The second argument of CoxRFX (‘surv’) is a survival object that can +easily be built by feeding the outcome variable columns of the data to +the function Surv (from the package +survival). Whether +CoxRFX fits a clock-forward model or a clock-reset model depends on +the kind of survival object:

    +
    #argument `surv' for a  clock-forward model
    +surv <- Surv(mstate_data$Tstart,mstate_data$Tstop,mstate_data$status)
    +
    +#argument `surv' for a clock-reset model
    +surv <- Surv(mstate_data$time,mstate_data$status)
    +

    The argument groups of CoxRFX is a vector whose length equals the +number of covariates in the data. In other words, the length of groups +is ncol(Z)-2, since the argument Z must include both the covariate +data and the strata and trans columns. If, for \(i \neq j\), +groups[i]=groups[j] \(=\text{`foo'}\), this means that the regression +coefficients of the \(i^{th}\) and \(j^{th}\) covariates of Z both belong +to a group named ‘foo’ of coefficients with the same prior. For the Z +object built above, the groups argument created in the following block +of code embodies the assumption that all coefficients associated with a +given transition have the same prior distribution. The final line of +code fits the empirical Bayes model.

    +
    #argument `groups' of coxrfx
    +groups_12 <- paste0(rep("group",ncol(Z_12)-2),c("_1","_2"))
    +
    +#fit random effects model
    +model_12 <- CoxRFX(Z_12,surv,groups_12,tmat)
    +

    Figure 10 shows regression coefficient point +estimates for a clock-reset, empirical Bayes model fitted with the code +above. Also shown are 95% non-parametric bootstrap confidence intervals +computed using ebmstate::boot_ebmstate. The \(x\)-axis scale is +logarithmic to allow estimates to be read as relative hazards more +easily. For example, a mutation in RUNX1 is associated with a twofold +increase in the hazard of progression from MDS to AML, and treatment +centre 4 is associated with a 3-fold increase in the hazard of dying +before progressing to AML, when compared to the baseline value of +‘treatment centre’ (treatment centre = 2 or 5). In covariates that have +been log-transformed (age, platelet count and neutrophil count) or +logit-transformed (proportions of myeloblasts and ring sideroblasts in +the bone marrow), the interpretation of estimates is different. For +example, an increase in age by a factor of \(e\) (\(\approx 2.72\)) almost +triples the hazard of dying before AML; the same increase in the ratio +\(bm\_blasts/(1-bm\_blasts)\) (where bm_blasts is the proportion of +myeloblasts in the bone marrow) is associated with an increment in the +hazard of dying before AML of approximately \(16\%\).

    +

    Computing cumulative transition hazard estimates

    +

    The function msfit_generic is the generic function in +ebmstate that +computes cumulative transition hazards for a given set of covariate +values and an estimated Cox model. It calls a different method according +to the class of its object argument. The default method corresponds to +the original msfit function of the +mstate package and is +appropriate for objects of class coxph, i.e., objects that contain the +fit of a Cox model with fixed effects. The other available method for +msfit_generic, msfit_generic.coxrfx, is just the original msfit +function, (slightly) adapted to deal with objects generated by CoxRFX. +Quite importantly, msfit_generic.coxrfx does not allow the variance of +the cumulative hazards to be computed, as this computation relies on +asymptotic results which may not be valid for an empirical Bayes model. +As a result, it only has two other arguments apart from the object of +class coxrfx: a data frame with the covariate values of the patient +whose cumulative hazards we want to compute; and a transition matrix +describing the states and transitions in the model (such as the one that +can be generated using transMat from the package +mstate). The following +block of code exemplifies how these objects can be built and generates +the msfit object containing the cumulative transition hazard estimates +for a sample patient. Note that the object with the patient data must +include a row for each transition, as well as a column specifying the +transition stratum of each row of covariates.

    +
    # Build `patient_data' data frame with the covariate values for which 
    +# cumulative hazards are to be computed (covariate values of patient 78):
    +patient_data <- mstate_data[mstate_data$id == 78,,drop = F][rep(1,3),]
    +patient_data$strata <- patient_data$trans <- 1:3
    +patient_data <- mstate::expand.covs(
    +    patient_data,
    +    covs = names(patient_data)[ ! names(patient_data) %in% outcome_covs],
    +    append = T
    +)
    +patient_data <- patient_data[ ! grepl(".3",names(patient_data),fixed = T)]
    +
    +# The `patient_data' data frame has only 3 rows (one for each transition).
    +# The output below shows its `id' and `trans' columns
    +# and expanded covariates ASXL1 and DNMT3A:
    +    id trans ASXL1.1 ASXL1.2 DNMT3A.1 DNMT3A.2  [...]  
    +    78     1       1       0        0        0     .
    +    78     2       0       1        0        0     .
    +    78     3       0       0        0        0     .
    +
    +# compute cumulative hazards
    +msfit_object_12 <- msfit_generic(model_12,patient_data,tmat)
    +

    Figure 11 shows three plots of estimated +cumulative transition hazards for the sampled patient, one for each +transition in the model, along with \(95\%\) non-parametric bootstrap +confidence intervals (computed with ebmstate::boot_ebmstate). +Throughout the plotted period, the ‘slope’ of the cumulative hazard +(i.e., the hazard rate) for the MDS to AML transition is lower than the +one for the MDS to death transition, and this in turn is lower than the +one for the AML to death transition. It should be recalled that the +cumulative hazard estimate is strictly non-parametric for this last +transition, i.e., it is the same for all patients. The central plot of +figure 11 suggests that, as time since +diagnosis goes by, the hazard of dying in MDS increases (possibly an +effect of age). On the other hand, the hazard of dying in AML seems to +decrease (slightly) with time (rightmost plot). Conclusions regarding +the evolution of the AML hazard are hard to draw, since the confidence +intervals for the corresponding cumulative hazard curve are very wide +(leftmost plot).

    +

    If an object generated by msfit_generic is fed to plot, and the +package mstate is +loaded, the method mstate:::plot.msfit will be called. This is an +efficient way of automatically plotting the cumulative hazard estimates +for all transitions, but confidence interval lines (separately +estimated) cannot be added.

    +

    Computing state occupation probability estimates

    +

    The functions probtrans_mstate, probtrans_ebmstate and +probtrans_fft compute estimates of state occupation probabilities for +a given msfit object. All three functions generate objects of class +probtrans that can be fed to the plot.probtrans method from the +package mstate. The +first of these functions should only be used for clock-forward models, +as it relies on product-limit calculations. It calls the method +probtrans_mstate.default, if the msfit object was generated by +msfit_generic.default, or the method probtrans_mstate.coxrfx, if it +was generated by msfit_generic.coxrfx. Both methods are identical to +the function probtrans in the +mstate package, with +the caveat that probtrans_mstate.coxrfx does not allow the +computation of the variances or covariances of the state occupation +probability estimator.

    +

    The functions probtrans_ebmstate and probtrans_fft are the functions +in ebmstate for the +computation of state occupation probability estimates under clock-reset +models with a transition structure that has no cycles. When using +probtrans_fft (the faster, but somewhat less stable, of these two +functions), three arguments must be supplied: the initial state of the +process whose state occupation probabilities one wishes to compute, the +msfit object, and the upper time limit for the generation of estimates +(max_time). Both functions are based on a discrete-time approximation +to a series of convolutions. The default argument nr_steps controls +the number of (equally spaced) time steps used in this approximation. +The arguments max_time and nr_steps should be increased until the +estimated curves become stable.

    +

    The following line of code computes point estimates of state occupation +probabilities for the sample patient.

    +
    probtrans_object_12 <- probtrans_fft("MDS",msfit_object_12, max_time = 4000)
    +

    Estimates are shown in figure 12, along +with \(95\%\) non-parametric, bootstrap confidence intervals. For this +particular patient, the estimated probability of being dead after AML +remains below 0.4 throughout a period of 10 years from the MDS +diagnosis; if the patient does reach AML, death is expected to happen +quickly thereafter, as reflected in the very low estimates for the +probability of being in AML at any point in time. The following block of +code shows how to compute confidence intervals with boot_ebmstate:

    +
    # Creating the object arguments for boot_ebmstate()
    +
    +# `groups' arguments was already created, but we need to add names to it
    +names(groups_12) <- names(covariates_expanded_12)
    +    
    +# `mstate_data_expanded' argument (similar to `covariates_expanded' but
    +# including outcome variables)
    +mstate_data_expanded <- cbind(
    +  mstate_data[names(mstate_data) %in% outcome_covs],
    +  covariates_expanded_12
    +)
    +
    +# create the non-parametric bootstrap confidence intervals
    +boot_ebmstate_object <- boot_ebmstate(
    +  mstate_data = mstate_data_expanded,
    +  which_group = groups_12,
    +  min_nr_samples = 100,
    +  patient_data = patient_data,
    +  tmat = tmat,
    +  initial_state = "MDS",
    +  time_model = "clockreset",
    +  input_file = NULL,
    +  coxrfx_args = list(max.iter = 200),
    +  probtrans_args = list(max_time = 4000)
    +)
    +

    Model assessment

    +

    For any model fitted with +ebmstate, two +performance metrics can be easily computed: the concordance statistic +((Harrell et al. 1982); see also the help page of +survival::concordance for the definition of concordance) and the +Bayesian Information Criterion (BIC) score (Schwarz 1978). +As an example of how these two metrics can be obtained and used for +model comparison, suppose we wish to compare ‘model_12’ fitted above – +which consists of a Cox regression including all covariates for +transitions 1 and 2 and a fully non-parametric model for transition 3 – +with a model that combines Cox regressions of all covariates for each of +the three transitions (denoted ‘model_123’ below). The following code +snippet shows how to fit this second model.

    +
    # arguments `groups' and `Z' for fitting a Cox regression model on all transitions
    +Z_123 <- data.frame(
    +    covariates_expanded_123,
    +    strata = mstate_data$trans,
    +    trans = mstate_data$trans
    +)
    +groups_123 <- paste0(rep("group", ncol(Z_123) - 2), c("_1", "_2", "_3"))
    +
    +# Fit a Cox regression model for all transitions
    +model_123 <- CoxRFX(Z = Z_123, surv = surv, groups = groups_123)
    +

    Running the concordance function in the +survival package for +each model yields the following output:

    +
    > concordance(model_12)
    +    Call:
    +    concordance.coxph(object = model_12)
    +
    +    n= 1210
    +    Concordance= 0.8131 se= 0.01314
    +                concordant discordant tied.x tied.y tied.xy
    +    strata=1      18040       2783      0      1       0
    +    strata=2      37919       9678      0      7       0
    +    strata=3          0          0   1052      0       4
    +
    +> concordance(model_123)
    +    Call:
    +    concordance.coxph(object = model_123)
    +
    +    n= 1210
    +    Concordance= 0.8168 se= 0.01312
    +                concordant discordant tied.x tied.y tied.xy
    +    strata=1      18041       2782      0      1       0
    +    strata=2      37920       9677      0      7       0
    +    strata=3        784        268      0      4       0
    +

    The output shows that modelling transition 3 with a Cox model, instead +of a fully non-parametric one, has a negligible impact on the overall +concordance. However, this is due to the fact that there are far fewer +observations for this transition. The concordance for transition 3 only, +which corresponds to strata 3, is 0.5 under the fully non-parametric model +(i.e., all patients are assigned the same transition hazard) and +considerably higher under the Cox regression (\(784/(784+268)=0.75\)). +Ideally, the comparison of models of different complexity should be +carried out on a test sample rather than on the training data. For this +purpose, the test data can be input to the concordance function +(argument newdata). However, in the present case, only 61 patients +were ever at risk of dying with AML (i.e. of undergoing transition 3), +and of these only 41 actually died, so we might prefer to keep all +patients in the training data, rather than saving a fraction of them for +testing purposes. Such an option will yield more accurate coefficient +estimates, at the expense of not allowing the computation of unbiased +estimates of model performance. If the goal is only to compare models, +we can make do without test data, by using an information score that +penalises model complexity, such as the BIC. To facilitate model +comparison, the BIC score is one of the attributes of the model fit +object:

    +
    > model_12$BIC
    +    [1] 2508.37
    +> model_123$BIC
    +    [1] 2483.49
    +

    The best model is the one with the lowest score, so the choice of +‘model_123’ is confirmed.

    +

    6 Discussion

    +

    We have shown that +ebmstate is suitable +for higher-dimensional, multi-state survival analysis, and that it is +both efficient and easy-to-use. To a significant extent, the +user-friendliness of +ebmstate stems from +the fact that it was not built ‘from the ground up’. Instead, we +produced a package that is more easily accessible to the many users of +mstate by taking +advantage of whichever features of this package were useful to our +method and by eliminating redundancies. The connection between +ebmstate and +mstate is based on the +fact that the function CoxRFX takes the same type of input and +produces the same type of output as coxph from the package survival, +and the function probtrans_fft (or probtrans_ebmstate) has the same +type of input and output as probtrans from +mstate (as shown in +figure 8).

    +

    We also sought to improve our package’s user-friendliness by making it +as efficient as possible. The reduction of computational cost is based +on two features. First, our empirical Bayes method relies on an +expectation-maximisation algorithm that estimates both the parameters +and the hyper-parameters of the model, i.e., no further tuning of the +model is required. Second, in +ebmstate, the +computation of state occupation probability estimates relies on +analytical results rather than on simulation: not only for clock-forward +models, where we import from +mstate a product-limit +estimator, but also for clock-reset models, where we implement our own +estimator based on a convolution argument and the fast Fourier +transform.

    +

    To our knowledge, +ebmstate is the first +R package to put together a framework for multi-state model estimation +that is complete and suitable for higher-dimensional data. It does so by +implementing point and interval estimators of regression coefficients, +cumulative transition hazards and state occupation probabilities, under +regularised multi-state Cox models. In section +4, the results of the simulation study +suggest that for data sets with 100 patients or more and a ratio of \(p\) +(coefficients per transition) to \(n\) (patients) greater than 0.1, the +standard Cox model estimator is clearly outperformed by the empirical +Bayes one when it comes to the estimation of relative hazards and state +occupation probabilities of an out-of-sample patient, or the regression +coefficients of the model. However, the same study suggests that using +an empirical Bayes method instead of a fully non-parametric one is of +limited or no value in settings where \(p/n \geq 1\). This loss of +usefulness can already happen for \(p/n\leq 1/2\) when it comes to the +estimation of the relative hazards of an out-of-sample patient, +especially for transition structures with multiple competing +transitions.

    +

    As mentioned in previous sections, +ebmstate imports a +product-limit estimator from +mstate that targets the +state occupation probabilities of patients with time-fixed covariate +vectors. However, these estimators are extendible to models with +time-dependent covariates, as long as these are external and the +estimates are conditional on specific covariate paths (Aalen et al. 2008, p. 142). For piecewise constant covariates, it is likely that such an +adaptation could be obtained by combining transition probability +estimates obtained for each period in which the covariates are fixed. +While no significant theoretical obstacles are foreseen in this matter, +the computer implementation for more than a single piecewise constant +covariate is likely to be a laborious task. We have therefore left it +for future work.

    +

    Acknowledgements

    +

    The authors are supported by grant NNF17OC0027594 from the Novo Nordisk +Foundation. We thank an anonymous reviewer for their constructive +comments and helpful suggestions which led to a much-improved +manuscript.

    +

    Supporting Scripts and Data

    +

    In the Supporting Scripts and Data, the file ESM_1.html contains +additional simulation results and theoretical demonstrations. Additional +details on the analysis of the MDS data set are given in the file +ESM_2.html. The MDS data set is in files MDS.TPD.20Nov2012.csv and +mds.paper.clin.txt. The file ESM_3.R contains a simplified R script +to run the code snippets in the present paper. The +ebmstate package is +available on CRAN.

    +

    7 Conflict of interest

    +

    The authors have declared no conflict of interest.

    +

    Figures

    +
    +
    +graphic without alt text +

    +Figure 1: Summary of inputs and outputs of the package ebmstate. The input data set should be one that violates the assumption – commonly used in survival analysis – that the number of observations is much larger than the number of parameters to be estimated (a genomic-clinical data set is shown as a typical example). The input model is a multi-state Cox model defined by a transition structure and a prior distribution on the regression coefficients. This prior distribution is defined by partitioning the vector of regression coefficients into groups of regression coefficients, with each group having its own Gaussian prior with undetermined mean and variance. The outputs of ebmstate include estimates of the relative transition hazards associated with each covariate, as well as estimates of the probability that a specific patient (with specific covariate measurements) has of occupying each state of the model over some time period. Estimates of cumulative transition hazards are omitted from the figure. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 2: Comparison of running times and estimation accuracy of mssample and probtrans_fft. Each plot in the grid shows two estimated curves of state occupation probabilities. The black curves are based on a single run of mstate::mssample with n=100,000 observations (approximately 17 minutes of running time) and are the same across columns. They serve as benchmark for precision assessment. In columns 1 to 3 of the grid, the superimposed red curves are based on a run of mssample with respectively 100, 1000, and 10,000 observations. In the rightmost column, the red curves are based on a run of probtrans_fft. All functions have as input the same set of cumulative transition hazards. These were estimated using a non-parametric multi-state model and a data set of 1000 patients generated according to a clock-reset Cox model with a ‘linear’ transition structure (leftmost diagram of figure 3). Plots in the same row refer to the same state of the model, while those in the same column refer to the same run of a function. Running times and, where appropriate, number of simulations (n) are given on top of each column. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 3: Model transition structures. We studied the performance of Cox model estimators, empirical Bayes Cox model estimators and fully non-parametric estimators with respect to these 3 transition structures. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 4: Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 6 (100 patients per simulated data set). +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 5: Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 7 (1000 patients per simulated data set). +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 6: Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 100 observations each. In the figure grid there is a boxplot corresponding to every tuple \((a,m, G, p)\) such that \(a\in \lbrace\)regression coefficients, relative hazards, state occupation probabilities\(\rbrace\) is the target of estimation, \(m\in \lbrace\)standard Cox, empirical Bayes Cox, null\(\rbrace\) is the hazard model, \(G \in \lbrace\)linear, competing risks, ‘m’ structure\(\rbrace\) is the transition structure of the model, and \(p \in \lbrace 10,40,70,100 \rbrace\) is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 4, together with figures 6.1 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, show the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 7: Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 1000 observations each. In the figure grid there is a boxplot corresponding to every tuple \((a,m, G, p)\) such that \(a\in \lbrace\)regression coefficients, relative hazards, state occupation probabilities\(\rbrace\) is the target of estimation, \(m\in \lbrace\)standard Cox, empirical Bayes Cox, null\(\rbrace\) is the hazard model, \(G \in \lbrace\)linear, competing risks, ‘m’ structure\(\rbrace\) is the transition structure of the model, and \(p \in \lbrace 10,100,200,300,400,500 \rbrace\) is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 5, together with figures 6.2 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, show the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 8: Extension of the mstate analysis framework by ebmstate. Arrows correspond to functions. Boxes correspond to inputs or outputs of functions. Functions CoxRFX and probtrans_fft from ebmstate compute point estimates only. Interval estimates can be obtained using the non-parametric bootstrap algorithm implemented in the function ebmstate::boot_ebmstate. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 9: a: transition model implied by the data set of patients with myelodysplastic syndromes, together with transition event numbers; b: conversion to a transition structure without cycles; c: transformations applied to the MDS covariate data and summary statistics for the data before transformation. MDS stands for myelodysplastic syndromes; AML stands for acute myeloid leukemia. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 10: Point estimates of regression coefficients for the Cox model fitted to the MDS data, along with 95% non-parametric bootstrap confidence intervals. The x-axis scale is logarithmic so that coefficient estimates can be read as relative hazard estimates. If \(\gamma_{ij}\) is the element of \(\hat{\boldsymbol{\beta}}_{ij}\) associated with a given covariate, \(\exp\left(\gamma_{ij}\right)\) is the estimated relative hazard for this covariate in transition \(\left(i,j\right)\). In general, a relative hazard estimate \(r\) for a covariate \(z\) in transition \(\left(i,j\right)\) means that a one-unit increase in z is associated with an r-fold increase in the hazard of this transition. If z was obtained by log-transformation (as in age, platelet counts and neutrophil counts), a one-unit increase in z corresponds to scaling the original covariate by \(e\approx 2.72\). In case z was obtained by logit-transformation (as in bone marrow blasts and sideroblasts proportions), the same one-unit increase corresponds to scaling the odds of the original covariate by e. +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 11: Point estimates of cumulative transition hazards for a sample patient with MDS (black curve), along with \(95\%\) non-parametric confidence intervals (dashed red lines). +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 12: Point estimates of state occupation probabilities for a sample patient with MDS (black curve), along with \(95\%\) non-parametric confidence intervals (dashed red lines). +

    +
    +
    +
    +
    +

    8 Supplementary materials

    +

    Supplementary materials are available in addition to this article. They can be downloaded at +RJ-2024-002.zip

    +

    9 CRAN packages used

    +

    msm, SemiMarkov, survival, mstate, mboost, gamboostMSM, penMSM, ebmstate

    +

    10 CRAN Task Views implied by cited packages

    +

    ClinicalTrials, Distributions, Econometrics, Epidemiology, MachineLearning, Survival

    +

    11 Note

    +

    This article is converted from a Legacy LaTeX article using the +texor package. +The pdf version is the official version. To report a problem with the html, +refer to CONTRIBUTE on the R Journal homepage.

    +
    +
    +O. O. Aalen. A linear regression model for the analysis of life times. Statistics in Medicine, 8(8): 907–925, 1989. URL https://doi.org/10.1002/sim.4780080803. +
    +
    +O. Aalen, O. Borgan and H. Gjessing. Survival and event history analysis. Springer, 2008. URL https://link.springer.com/book/10.1007/978-0-387-68560-1. +
    +
    +P. Andersen, O. Borgan, R. Gill and N. Keiding. Statistical models based on counting processes. Springer, 1993. URL https://link.springer.com/book/10.1007/978-1-4612-4348-9. +
    +
    +B. Carlin and T. Louis. Bayesian methods for data analysis. CRC Press, 2009. URL https://doi.org/10.1201/b14884. +
    +
    +G. Cortese and P. K. Andersen. Competing risks and time-dependent covariates. Biometrical Journal, 52(1): 138–158, 2010. URL https://doi.org/10.1002/bimj.200900076. +
    +
    +L. C. de Wreede, M. Fiocco and H. Putter. mstate: An R package for the analysis of competing risks and multi-state models. Journal of Statistical Software, 38(7): 1–30, 2011. URL http://www.jstatsoft.org/v38/i07/. +
    +
    +A. Gelman, J. Carlin, H. Stern, D. Dunson, A. Vehtari and D. Rubin. Bayesian data analysis. CRC Press, 2014. URL https://doi.org/10.1201/b16018. +
    +
    +F. E. Harrell Jr., R. M. Califf, D. B. Pryor, K. L. Lee and R. A. Rosati. Evaluating the Yield of Medical Tests. JAMA, 247(18): 2543–2546, 1982. URL https://doi.org/10.1001/jama.1982.03320430047030. +
    +
    +T. Hastie, R. Tibshirani and J. H. Friedman. The elements of statistical learning: Data mining, inference, and prediction. Springer, 2009. URL https://link.springer.com/book/10.1007/978-0-387-84858-7. +
    +
    +T. Hothorn, P. Buehlmann, T. Kneib, M. Schmid and B. Hofner. Mboost: Model-based boosting. R package version, 2.9–3, 2020. URL https://CRAN.R-project.org/package=mboost. +
    +
    +P. Hougaard. Multi-state models: A review. Lifetime data analysis, 5(3): 239–264, 1999. URL https://doi.org/10.1023/A:1009672031531. +
    +
    +C. Jackson. flexsurv: A platform for parametric survival modeling in R. Journal of Statistical Software, 70(8): 1–33, 2016. DOI 10.18637/jss.v070.i08. +
    +
    +C. H. Jackson. Multi-state models for panel data: The msm package for R. Journal of Statistical Software, 38(8): 1–29, 2011. URL http://www.jstatsoft.org/v38/i08/. +
    +
    +J. D. Kalbfleisch and R. L. Prentice. The statistical analysis of failure time data. John Wiley & Sons, 2002. DOI 10.1002/9781118032985. +
    +
    +A. Listwon and P. Saint-Pierre. SemiMarkov: An R Package for Parametric Estimation in Multi-State Semi-Markov Models. Journal of Statistical Software, 66(6): 784, 2015. URL https://hal.archives-ouvertes.fr/hal-00860244. +
    +
    +E. Papaemmanuil, M. Gerstung, L. Malcovati, S. Tauro, G. Gundem, P. Van Loo, C. J. Yoon, P. Ellis, D. C. Wedge, A. Pellagatti, et al. Clinical and biological implications of driver mutations in myelodysplastic syndromes. Blood, 122(22): 3616–3627, 2013. URL https://doi.org/10.1182/blood-2013-08-518886. +
    +
    +A. Perperoglou. Cox models with dynamic ridge penalties on time-varying effects of the covariates. Statistics in Medicine, 33(1): 170–180, 2014. URL https://doi.org/10.1002/sim.5921. +
    +
    +H. Putter. Tutorial in biostatistics: Competing risks and multi-state models analyses using the mstate package. Companion file for the mstate package, 2011. URL https://mirror.las.iastate.edu/CRAN/web/packages/mstate/vignettes/Tutorial.pdf. +
    +
    +H. Putter, M. Fiocco and R. B. Geskus. Tutorial in biostatistics: Competing risks and multi-state models. Statistics in Medicine, 26(11): 2389–2430, 2007. URL https://doi.org/10.1002/sim.2712. +
    +
    +H. Reulen. gamboostMSM. R package version, 1.1.87, 2014. URL https://CRAN.R-project.org/package=gamboostMSM. +
    +
    +H. Reulen. penMSM. R package version, 0.99, 2015. URL https://CRAN.R-project.org/package=penMSM. +
    +
    +R. J. Samworth. Stein’s paradox. Eureka, 62: 38–41, 2012. URL https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=7eebd55f569395544f2b5d367d6aee614901d2c1. +
    +
    +R. Schall. Estimation in generalized linear models with random effects. Biometrika, 78(4): 719–727, 1991. URL http://dx.doi.org/10.1093/biomet/78.4.719. +
    +
    +G. Schwarz. Estimating the dimension of a model. The annals of statistics, 461–464, 1978. URL https://www.jstor.org/stable/2958889. +
    +
    +C. Spitoni, M. Verduijn and H. Putter. Estimation and asymptotic theory for transition probabilities in markov renewal multi-state models. The International Journal of Biostatistics, 8(1): 2012. URL https://doi.org/10.1515/1557-4679.1375. +
    +
    +T. M. Therneau. A package for survival analysis in S. 2015. URL https://CRAN.R-project.org/package=survival. version 2.38. +
    +
    +L. C. de Wreede, M. Fiocco and H. Putter. The mstate package for estimation and prediction in non- and semi-parametric multi-state and competing risks models. Computer Methods and Programs in Biomedicine, 99(3): 261–274, 2010. URL http://www.sciencedirect.com/science/article/pii/S0169260710000027. +
    +
    +H. Zou and T. Hastie. Regularization and variable selection via the elastic net. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 67(2): 301–320, 2005. URL https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00503.x. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Costa & Gerstung, "ebmstate: An R Package For Disease Progression Analysis Under Empirical Bayes Cox Models", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-002,
    +  author = {Costa, Rui J. and Gerstung, Moritz},
    +  title = {ebmstate: An R Package For Disease Progression Analysis Under Empirical Bayes Cox Models},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-002},
    +  doi = {10.32614/RJ-2024-002},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {15-38}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-002/RJ-2024-002.pdf b/_articles/RJ-2024-002/RJ-2024-002.pdf new file mode 100644 index 0000000000..920c9560ca Binary files /dev/null and b/_articles/RJ-2024-002/RJ-2024-002.pdf differ diff --git a/_articles/RJ-2024-002/RJ-2024-002.zip b/_articles/RJ-2024-002/RJ-2024-002.zip new file mode 100644 index 0000000000..f4cc54fc46 Binary files /dev/null and b/_articles/RJ-2024-002/RJ-2024-002.zip differ diff --git a/_articles/RJ-2024-002/RJournal.sty b/_articles/RJ-2024-002/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-002/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-002/RJwrapper.md b/_articles/RJ-2024-002/RJwrapper.md new file mode 100644 index 0000000000..cf6a65f232 --- /dev/null +++ 
b/_articles/RJ-2024-002/RJwrapper.md @@ -0,0 +1,1375 @@ +--- +abstract: | + The new R package ebmstate is a package for multi-state survival + analysis. It is suitable for high-dimensional data and allows point + and interval estimation of relative transition hazards, cumulative + transition hazards and state occupation probabilities, under + clock-forward and clock-reset Cox models. Our package extends the + package mstate in a threefold manner: it transforms the Cox regression + model into an empirical Bayes model that can handle high-dimensional + data; it introduces an analytical, Fourier transform-based estimator + of state occupation probabilities for clock-reset models that is much + faster than the corresponding, simulation-based estimator in mstate; + and it replaces asymptotic confidence intervals meant for the + low-dimensional setting by non-parametric bootstrap confidence + intervals. Our package supports multi-state models of arbitrary + structure, but the estimators of state occupation probabilities are + valid for transition structures without cycles only. Once the input + data is in the required format, estimation is handled automatically. + The present paper includes a tutorial on how to use ebmstate to + estimate transition hazards and state occupation probabilities, as + well as a simulation study showing how it outperforms mstate in + higher-dimensional settings. +address: +- | + Rui J. Costa\ + European Molecular Biology Laboratory\ + European Bioinformatics Institute (EMBL-EBI)\ + Hinxton, CB10 1SD\ + United Kingdom\ + [ruibarrigana@hotmail.com](ruibarrigana@hotmail.com){.uri} +- | + Moritz Gerstung\ + aff. 1: European Molecular Biology Laboratory\ + European Bioinformatics Institute (EMBL-EBI)\ + Hinxton, CB10 1SD\ + United Kindom\ + aff. 2: German Cancer Research Center (DKFZ)\ + Im Neuenheimer Feld 280\ + 69120 Heidelberg\ + Germany\ + [moritz.gerstung@dkfz.de](moritz.gerstung@dkfz.de){.uri} +author: +- by Rui J. 
Costa and Moritz Gerstung +bibliography: +- costa-gerstung.bib +title: "ebmstate: An R Package For Disease Progression Analysis Under + Empirical Bayes Cox Models" +--- + +::: article +## Introduction + +Multi-state models based on transition hazard functions are often used +in the statistical analysis of longitudinal data, in particular disease +progression data [@Hougaard1999]. The multi-state model framework is +particularly suitable to accommodate the growing level of detail of +modern clinical data: as long as a clinical history can be framed as a +random process which, at any moment in time, occupies one of a few +states, a multi-state model is applicable. Another strong point of this +framework is that it can incorporate a *regression model*, i.e., a set +of assumptions on how covariates, possibly time-dependent ones, affect +the risk of transitioning between any two states of the disease. Once +estimated, multi-state models with regression features allow the +stratification of patients according to their transition hazards. In +addition, it is possible, under some models, to generate disease outcome +predictions. These come in the form of *state occupation probability* +estimates, meaning estimates of the probability of being in each state +of the disease over a given time frame. 
+ +The survival analysis 'task view' of the Comprehensive R Archive Network +lists seven R packages that are able to fit *general* multi-state models +and, at the same time, feature some kind of regression model or +algorithm: `flexsurv` [@flexsurv_package], +[**msm**](https://CRAN.R-project.org/package=msm) [@Jackson2011], +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) +[@Listwon2015], +[**survival**](https://CRAN.R-project.org/package=survival) +[@survival_package], +[**mstate**](https://CRAN.R-project.org/package=mstate) [@Wreede2010], +[**mboost**](https://CRAN.R-project.org/package=mboost) +[@mboost_package] -- as extended by +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM) +[@gamboostMSM_package] -- and +[**penMSM**](https://CRAN.R-project.org/package=penMSM) +[@penMSM_package]. All of them implement relative risk regression models +[as defined in @Aalen2008 p. 133]. The only exceptions are +[**survival**](https://CRAN.R-project.org/package=survival), which also +fits Aalen's additive regression model [@Aalen1989], and `flexsurv`, +which also implements accelerated failure time models . + +Recall that a Cox regression model is a semi-parametric model in which +every transition hazard is assumed to be the product of a baseline +hazard function of unspecified form (the non-parametric component) and +an exponential relative risk function (the parametric component) +[@Aalen2008 p. 133]. Generally, the relative risk regression models +implemented in these packages are Cox regression models. However, some +models in `flexsurv`, as well as those in +[**msm**](https://CRAN.R-project.org/package=msm) and +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov), also +restrict the baseline hazards to specific parametric families, i.e. they +are fully parametric. 
In +[**msm**](https://CRAN.R-project.org/package=msm) and +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov), the +stronger assumptions regarding the functional form of the hazard are +leveraged to do away with other common assumptions: +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) drops +the usual Markov property to implement homogeneous semi-Markov models; +[**msm**](https://CRAN.R-project.org/package=msm) is suitable for *panel +data*, i.e., data in which the state of each individual is known only at +a finite series of times. + +Packages [**penMSM**](https://CRAN.R-project.org/package=penMSM) and +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM) are +the best suited to deal with higher-dimensional covariate data. The +first of these packages relies on a structured fusion lasso method, +while the second implements (jointly with +[**mboost**](https://CRAN.R-project.org/package=mboost)) a boosting +algorithm. Both methods induce sparsity in the number of non-zero +covariate effects, as well as equality among the different transition +effects of each covariate, and are thus especially useful to reduce +complicated multi-state models to more interpretable ones. The remaining +packages assume standard, fixed effects relative risk regression models +and do not include regularisation or variable selection features. + +It is also illustrative to order the seven packages mentioned according +to how extensive their analysis workflow is. Packages +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) and +[**penMSM**](https://CRAN.R-project.org/package=penMSM) are intended for +the estimation of relative transition hazards only (i.e., for estimating +the impact of covariates on each transition hazard). 
With the package +[**mboost**](https://CRAN.R-project.org/package=mboost) (as extended by +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM)) it is +also possible to estimate the baseline transition hazards. Finally, a +more complete workflow including estimates of both relative and +cumulative transition hazards, as well as state occupation +probabilities, is implemented in `flexsurv`, +[**msm**](https://CRAN.R-project.org/package=msm) and +[**mstate**](https://CRAN.R-project.org/package=mstate), and has been +under implementation in +[**survival**](https://CRAN.R-project.org/package=survival) (version 3.0 +or later). + +The present paper provides an introduction to +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), a new R +package for multi-state survival analysis available for download on the +Comprehensive R Archive Network (CRAN). The main goal of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is to +provide an analysis framework for the Cox model that performs better +with higher-dimensional covariate data and is also complete, in the +sense of being able to generate point and interval estimates of relative +transition hazards, cumulative transition hazards and state occupation +probabilities, both under clock-forward and clock-reset models. A +fundamental characteristic of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is that it +re-implements and extends the analysis framework of +[**mstate**](https://CRAN.R-project.org/package=mstate), which is +complete in the sense just mentioned. In fact, to a large extent, our +package was built by importing, adapting and replacing functions from +the [**mstate**](https://CRAN.R-project.org/package=mstate) package. 
+This not only eliminates redundancies, but also makes our package more +accessible to the numerous users of +[**mstate**](https://CRAN.R-project.org/package=mstate) (the three +papers associated with +[**mstate**](https://CRAN.R-project.org/package=mstate) have jointly +over 2000 citations). + +To improve the performance of +[**mstate**](https://CRAN.R-project.org/package=mstate)'s multi-state +Cox model when dealing with higher-dimensional covariate data, a +ridge-type regularisation feature was added. We allow the regression +coefficients of the model to be partitioned into groups, with each group +having its own Gaussian prior. A group can gather, for example, all the +regression coefficients for a given transition. Or, within a given +transition, coefficients can be grouped according to the covariate type +they refer to (for example, demographic, clinical or genomic type). The +resulting hierarchical Bayes model is *empirical* in that a full prior +elicitation is not required (the mean and variance hyper-parameters of +the Gaussian are estimated from the data). Model fitting relies on the +iterative algorithm introduced by @Schall1991, which typically converges +after a small number of steps. A simulation study showing that Schall's +algorithm performance compares well with that of other algorithms for +ridge penalty optimisation, including one based on cross-validation, can +be found in @Perperoglou2014. + +The asymptotic confidence intervals generated by +[**mstate**](https://CRAN.R-project.org/package=mstate) are applicable +when the number of observations is much larger than the number of +parameters to be estimated (see section [3.3](#sec:interval_estimation) +below). To preserve the completeness of +[**mstate**](https://CRAN.R-project.org/package=mstate)'s framework in +higher-dimensional settings, we therefore implemented non-parametric +bootstrap intervals of regression coefficients, cumulative transition +hazards and state occupation probabilities. 
+ +The high computational cost implied by the non-parametric bootstrap +motivated a third extension to +[**mstate**](https://CRAN.R-project.org/package=mstate). We developed an +estimator of state occupation probabilities under clock-reset Cox models +that is based on a convolution argument [as in @Spitoni2012] and the +Fast Fourier transform (FFT). At present, the estimation of such +probabilities for clock-forward Cox models can be carried out using the +efficient, product-limit based algorithm available in +[**mstate**](https://CRAN.R-project.org/package=mstate). However, for +clock-reset Cox models, only a simulation-based estimator is available +in this package (see also the `flexsurv` package for a similar, +simulation-based estimator). The FFT estimator in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) was +conceived as a faster alternative to this simulation-based estimator, +but its scope is currently restricted to multi-state models with +transition structures that have no cycles, i.e. in which a transition +between two states is either not possible or follows a unique sequence +of states. Figure \@ref(fig:figpackage-summary-figure) provides a short +graphical summary of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), with the +main inputs -- a genomic-clinical data set and an empirical Bayes +multi-state Cox model -- and the main outputs -- the estimates of +relative hazards and state occupation probabilities (cumulative +transition hazards are omitted). + +As already mentioned, our empirical Bayes method improves estimator +performance in models with larger numbers of covariates (see section +[4](#sec:estimator_performance) on estimator performance). 
Also, as a +ridge-type regression method, it can be used as an alternative to the +lasso method of [**penMSM**](https://CRAN.R-project.org/package=penMSM) +in two particular cases: when the levels of correlation between +covariates are high enough to compromise the stability of lasso-based +covariate selection; or simply to improve prediction accuracy when +interpretability is not essential and the number of covariates is not +greater than the number of observations [@Zou2005]. In addition, and +perhaps more importantly, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) goes beyond +the regularised estimation of transition hazards offered by +[**penMSM**](https://CRAN.R-project.org/package=penMSM) and +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM): point +and interval estimates of state occupation probabilities under the +regularised Cox model can also be computed. + +## Models + +A multi-state Cox model is a continuous-time stochastic process with a +finite (and usually small) state space $\mathcal{S}$. To better describe +the models implemented in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), we define +the following notation. We let $t$ denote the time since some initiating +event (usually diagnosis or disease onset). For +$t \in \left[0, \infty\right)$, we define the following random +variables: $X(t)$ represents the disease state of the patient, $S(t)$ +the time spent in the current state, and $\vec{Z}\left(t\right)$ the +value of a covariate vector. The realisation of each component of the +process $\lbrace\vec{Z}\left(t\right)\rbrace$ is a step function, +possibly approximating the evolution in time of a continuous covariate. +In addition, $\lbrace\vec{Z}\left(t\right)\rbrace$ is assumed +not-adapted to the filtration generated by +$\lbrace X\left(t\right)\rbrace$ (an adapted covariate is one whose path +until $t$ is known once $\lbrace X \left(u\right)\rbrace$, $u \leq t$, +is known). 
The transition hazard rate of a patient from state $i$ to +state $j$ ($i\neq j$) at time $t$, conditional on the sojourn time and +the covariate vector, is defined as +$$\begin{aligned} + &\alpha_{ij}\left(t|\mathbf{z},s \right):=\lim_{h \downarrow 0}\frac{1}{h}\mathrm{P}\left[X(t+h)=j\,|\,X(t)=i,S(t)=s,\vec{Z}(t)=\mathbf{z} \right]\;, \;s\in \left[0,\infty\right)\;,\;t\in \left[s,\infty\right)\;. +\end{aligned}$$ +Independent right-censoring and left-truncation are assumed throughout +[@Aalen2008 p. 57]. The purpose of the present section is to give a (not +necessarily exhaustive) description of the scope of +[**mstate**](https://CRAN.R-project.org/package=mstate) and +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) with respect +to the multi-state Cox model. Using the terminology in @Putter2011, a +Cox model is termed a 'clock-reset' model when +$$\begin{aligned} +\label{eq:clock_reset_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}, s\right)&=\lambda_{ij}^{(0)}\left(s\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad, +\end{aligned} (\#eq:clock-reset-Cox)$$ +and it is termed a 'clock-forward' model when +$$\begin{aligned} +\label{eq:clock_forward_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}\right)&=\alpha_{ij}^{(0)}\left(t\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad. +\end{aligned} (\#eq:clock-forward-Cox)$$ +In both cases, $i,j \in \mathcal{S}$, with $i\neq j$; +$\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is an unknown vector of +regression coefficient parameters, and both +$\lambda^{\scriptscriptstyle (0)}_{ij}(\cdot)$ and +$\alpha^{\scriptscriptstyle (0)}_{ij}(\cdot)$ are unknown (baseline +hazard) functions, non-negative on $\mathbb{R}^{+}$. When, as in +equation \@ref(eq:clock-reset-Cox), +$\alpha_{ij}\left(t|\mathbf{z},s\right)$ is the same for all $t\geq s$, +we simplify its notation to $\lambda_{ij}\left(s|\mathbf{z}\right)$. 
As +can be seen from equations \@ref(eq:clock-reset-Cox) and +\@ref(eq:clock-forward-Cox), the 'clock-reset' and 'clock-forward' +models are models for how the transition hazard rates are affected by +time. In the former case, the only relevant time scale is the time $s$ +spent in the current state, whereas in the latter only the time $t$ +since the initiating event matters. While the 'clock-forward' model is +arguably the default one in multi-state survival analysis +[@Andersen1993; @Aalen2008], in some cases the 'clock-reset' model is +more appropriate. For example, in some forms of cancer, it can be +sensible to assume that the transition hazards from the state of +complete remission depend on the sojourn time, rather than on the time +since the initial diagnosis. + +### Relative transition hazards {#sec:models_relative_hazards} + +The parametric component of the transition hazard from $i$ to $j$, +written +$\exp\left[\boldsymbol{\beta}^{\intercal}_{ij} \,\mathbf{z}\right]$, is +termed the relative transition hazard. In +[**mstate**](https://CRAN.R-project.org/package=mstate) and +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), estimating +the relative transition hazard amounts to estimating the regression +coefficient vector $\boldsymbol{\beta}_{ij}\,$. In +[**mstate**](https://CRAN.R-project.org/package=mstate), these +parameters are assumed to be non-random. With +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +following prior distributions can be imposed. + +Define $\mathcal{P}$ as the set of all pairs of states between which a +direct transition is possible. Let +$\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ij} \rbrace$, for all +$(i, j) \in \mathcal{P}$, be a partition of $\boldsymbol \beta$, a +vector containing the regression coefficients for all direct transitions +allowed. 
Each $\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is further +partitioned into +$\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ijk} \rbrace$, for +$k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$. In +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the most +general model regarding the prior distribution of $\boldsymbol{\beta}$ +makes two assumptions: a) the scalar components of $\boldsymbol{\beta}$ +are independent and normally distributed; b) the scalar components of +$\boldsymbol{\beta}_{\scriptscriptstyle i j k}$ have a common (and +undetermined) mean $\mu_{\scriptscriptstyle ijk}$ and a common (and also +undetermined) variance $\sigma^{2}_{\scriptscriptstyle ijk}\;$. + +The purpose of the framework just described is to allow the clustering +of covariate effects according to their prior distribution. If there is +no prior knowledge about how this clustering should be done, a single +Gaussian prior can be imposed on all regression coefficients at once. If +prior knowledge allows the grouping of effects according to the +transition they refer to, a different Gaussian prior can be assigned to +the coefficients of each transition. Even within each transition, +different groups of coefficients can be assigned different prior +distributions. In the analysis of biomedical data, for example, there +can be a split between genes which are known to affect the transition +hazard, and other genes whose effect is unknown. 
+ +### Cumulative transition hazard functions + +Our package imports from +[**mstate**](https://CRAN.R-project.org/package=mstate) a Breslow +estimator of two types of cumulative transition hazard: one on a global +time scale, defined as +$$\begin{aligned} +\mathrm{A}_{ij}\left(t\,|\,\mathbf{z}\right)&:=\int_{0}^{t}\alpha_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad, +\end{aligned}$$ +and another on a sojourn time scale, defined as +$$\begin{aligned} +&\Lambda_{ij}(s\,|\,\mathbf{z}):=\int_{0}^{s}\lambda_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad. +\end{aligned}$$ +Note that, in either case, the covariate vector is assumed to remain +constant. + +### State occupation probabilities + +By state occupation probability, we mean the probability that a patient +in state $i$ at time $0$ finds herself in state $j$ at time $t$. The +estimates of these probabilities can be seen as functionals of the +estimated cumulative transition hazard functions. For this reason, the +restriction to models with time-fixed covariates, which was just seen to +be applicable to the estimators of cumulative transition hazards, +carries over to the estimation of state occupation probabilities. + +When conditioning on a given covariate path (time-fixed or not), state +occupation probability estimates are not valid unless the covariates are +*external* [@Cortese2010; @Aalen2008 p. 142]. Note that a vector of +covariates $\lbrace \vec{Z}(u)\rbrace_{u\geq 0}$ is said to be +*external* if, for all $t \in \left[0,\infty\right)$, each transition +hazard at $t$, conditional on $\vec{Z}(t)$, is independent of +$\lbrace \vec{Z}(u)\rbrace_{u>t}$ (i.e. independent of the future path +of the covariate). Otherwise, it is said to be *internal* [for more +details on the distinction between internal and external covariates, see +@Kalbfleisch2002 chapter 6]. 
When one does not wish (or it is not possible
+due to $\vec{Z}$ being *internal*) to condition on a future
+path of the covariate process, the uncertainty introduced by this
+process needs to be accounted for. This can be done by extending the
+state space of the disease process, so that it includes information on
+the disease *and* the covariate process [@Andersen1993 p. 170]. For
+example, to include a dichotomous transplant covariate (an internal
+covariate) in a simple survival model with two states, the state space
+is expanded from $\lbrace$alive, deceased$\rbrace$ to $\lbrace$alive
+without transplant, alive with transplant, deceased$\rbrace$. One can
+then either assume that transplanted patients have a different baseline
+death hazard or, more simply, that transplantation scales the death
+hazard by some constant $\exp \left( \gamma\right)$. A similar but more
+detailed example can be found in @Wreede2010 [section 2.3.2, 'model
+3'].
+
+## Estimation
+
+In the current section, we present the estimation methods underlying the
+extensions of [**mstate**](https://CRAN.R-project.org/package=mstate)
+implemented in
+[**ebmstate**](https://CRAN.R-project.org/package=ebmstate).
+[]{#sec:estimation label="sec:estimation"}
+
+### Relative and cumulative hazard functions
+
+Let $\boldsymbol{\mu}_{\scriptscriptstyle ij}$, with
+$\left(i,j\right) \in \mathcal{P}$ (the set of direct transitions
+allowed), denote a vector whose scalar components are the parameters
+$\mu_{\scriptscriptstyle ijk}$,
+$k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$.
+Similarly, let $\boldsymbol{\sigma}^{2}_{\scriptscriptstyle ij}$ be
+composed of the parameters
+$\left\lbrace \sigma^{2}_{\scriptscriptstyle ijk}\right\rbrace_{k}$. 
The
+estimation of $\boldsymbol{\beta}$,
+$\boldsymbol{\mu}:=\lbrace\boldsymbol{\mu}_{\scriptscriptstyle{ij}}\rbrace$
+and
+$\boldsymbol{\sigma}^2:=\lbrace\boldsymbol{\sigma}^2_{\scriptscriptstyle ij }\rbrace$
+relies on the restricted maximum-likelihood (REML) type algorithm
+described in @Perperoglou2014 and introduced by @Schall1991. The
+resulting estimate of $\boldsymbol{\beta}$ is a maximum *a posteriori*
+estimate; the estimates of $\boldsymbol{\mu}$ and
+$\boldsymbol{\sigma}^{2}$ are empirical Bayes estimates. In
+[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the
+estimator based on this algorithm is implemented in the function
+`CoxRFX`. The results of a simulation study showing its consistency are
+included in the Supporting Scripts and Data (file ESM_1.html, section
+1).
+
+The computation of cumulative hazard rates for given covariate values
+and an estimated regression coefficient vector relies on the function
+`msfit_generic`, which is essentially a wrapper for the function
+`mstate::msfit` (see section [5.3](#sec:computing_cumulative_hazards)).
+For the mathematical details of this computation, we therefore refer the
+reader to @Wreede2010.
+
+### State occupation probabilities {#sec:trans_probs}
+
+The package [**mstate**](https://CRAN.R-project.org/package=mstate)
+includes a simulation-based estimator that can take as input either
+$\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ or
+$\hat{\Lambda}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ to generate
+estimates of state occupation probabilities under the clock-forward or
+the clock-reset model respectively. Another available estimator, an
+Aalen-Johansen-type estimator based on product integration, is far more
+efficient computationally and takes as input
+$\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ only. 
As the +scope of this estimator has been restricted to clock-forward Cox models +[@Andersen1993; @Aalen2008], in our package we implemented a +convolution-based estimator as a computationally efficient alternative +(for models with a transition structure that has no cycles). + +For convenience, let the sequence of states from $0$ to $n$ have the +labels $0,1,2,...,n\,$, where $0$ is the initial state by definition, +and $n$ is some state that might (eventually) be reached by the process. +In addition, define $X_{0}:=X(0)$ and $T_{0}:=0$, and let +$\left(X_{i},T_{i}\right)$, $i \in \left\lbrace 1,2,... \right\rbrace$, +denote the marked point process associated with +$\left\lbrace X(t)\right\rbrace$, so that $T_{i}$ is the time of the +$i^{th}$ transition and $X_{i}$ is the state the process jumps to at +time $T_{i}$. The inter-transition times are denoted by +$\tau_{ij}:=T_{j}-T_{i}$, for $j>i$. We can write the probability that a +patient in state $0$ at time $0$ finds herself in state $n$ at time $t$, +conditional on $\vec{Z}(u)=\mathbf{z}$ for all $u \geq 0$, as +$$\begin{aligned} + &\mathrm{P}\left[X(t)=n\,|\,X(0)=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right]\\ + &\,=\mathrm{P}\left[X_{n}=n,\tau_{0,n} < t,\tau_{n,n+1}\geq t- \tau_{0,n} |X_{0}=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right] \,.\nonumber +\end{aligned}$$ + +Recall that $\lambda_{i,i+1}\left(s\,|\, \mathbf{z}\right)$ denotes the +hazard rate of a transition to state $i+1$ at time $s$ since arrival in +state $i$, for a patient that has covariate vector $\mathbf{z}$. The +cumulative hazard for the same transition between sojourn times $0$ and +$s$, if the patient's covariate vector remains constant at $\mathbf{z}$, +is represented by +$\Lambda_{i,i+1}\left(s \,|\, \mathbf{z}\right):=\int_{0}^{s}\lambda_{i,i+1}\left(x\,|\, \mathbf{z}\right)\mathrm{d}x$. 
+Similarly, we let $\lambda_{i}\left(s\,|\, \mathbf{z}\right)$ represent +the hazard rate of going to any state that can be reached directly from +$i$, at time $s$ since arrival in state $i$, for a patient with +covariate vector $\mathbf{z}$. The cumulative hazard for the same event +between sojourn times $0$ and $s$, if the patient's covariate vector +remains constant at $\mathbf{z}$, is represented by +$\Lambda_{i}\left(s \,|\, \mathbf{z}\right)$. The expressions +$\hat{\Lambda}_{i}\left(s \,|\, \mathbf{z}\right)$ and +$\hat{\Lambda}_{i,i+1}\left(s \,|\, \mathbf{z}\right)$ denote the +Breslow estimators of the cumulative hazards just defined. In what +follows, all references to probabilities, hazard rates and cumulative +hazards are to be understood as conditional on +$\vec{Z}(u)=\mathbf{z}\,$, for $u\geq 0$: this condition is omitted to +simplify the notation. + +In [**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +function `probtrans_ebmstate` generates a set of state occupation +probability estimates at equally spaced time points: +$$\begin{aligned} +&\left\lbrace \hat{p}_{0n}\left(k\right)\right\rbrace_{k} :=\left\lbrace \hat{\mathrm{P}}\left[X_{n}=n,\tau_{0,n} < t_{k},\tau_{n,n+1}\geq t_{k}- \tau_{0,n}\,|\, X_{0}=0 \right] \right\rbrace_{k}\;,\; k=0,1,2,...,K\,;\, t_{k}=k\times \Delta t \;. +\end{aligned}$$ +The number $K$ of time intervals is $10,000$ by default and $t_{K}$ is a +parameter set by the user. 
Defining the functions +$$\begin{aligned} +q_{ij}\left(k\right):=\mathrm{P}\left[X_{j}=j, \tau_{ij}\in \left[t_{k},t_{k+1}\right)\,|\,X_{i}=i\right] +\end{aligned}$$ +and +$$\begin{aligned} +r_{i}\left(k\right):=\mathrm{P}\left[\tau_{i,i+1} > t_{k} \,|\,X_{i}=i\right]\;, +\end{aligned}$$ +and the finite difference +$$\begin{aligned} + \Delta \hat{\Lambda}_{i,i+1}\left(t_{k}\right):=\hat{\Lambda}_{i,i+1}\left(t_{k+1}\right)-\hat{\Lambda}_{i,i+1}\left(t_{k}\right)\;, +\end{aligned}$$ +the algorithm behind `probtrans_ebmstate` can be described as follows: + +1. For $j=1,2,...,n$, compute + $$\begin{aligned} + \label{eq:est1} + \hat{q}_{j-1,j}\left(k\right)&:=\exp \left[-\hat{\Lambda}_{j-1}\left(t_{k}\right)\right]\Delta \hat{\Lambda}_{j-1,j}\left(t_{k}\right)&& + \end{aligned} (\#eq:est1)$$ + for $k=0,1,...,K-1$. + +2. For $j=2,3,...,n$, compute (iteratively) + $$\begin{aligned} + \label{eq:est2} + \hat{q}_{0j}\left(k\right):=&\sum_{l=0}^{k-1} \hat{q}_{j-1,j}\left(k-l-1\right) \hat{q}_{0,j-1} \left(l\right) && + \end{aligned} (\#eq:est2)$$ + for $k=0,1,...,K-1$. + +3. Finally, use the estimates obtained in the last iteration of step 2 + to compute + $$\begin{aligned} + \label{eq:est4} + \hat{p}_{0n}\left(k\right):=&\sum_{l=0}^{k-1} \hat{r}_{n}\left(k-l-1\right) \hat{q}_{0,n}\left(l\right)&& + \end{aligned} (\#eq:est4)$$ + for $k=0,1,...,K$, where + $\hat{r}_{n}\left(\cdot\right):=\exp \left[-\hat{\Lambda}_{n}\left(t_{\scriptscriptstyle\left(\cdot\right)}\right)\right]\,$. + +Substituting $:=$ for $\approx$ and removing the 'hats' in definitions +\@ref(eq:est1) to \@ref(eq:est4), we get the approximate equalities that +justify the algorithm. These approximate equalities are derived in the +Supporting Scripts and Data (file ESM_1.html, section 2). + +Apart from `probtrans_ebmstate`, the function `probtrans_fft` is also +based on the convolution argument just shown. 
However, this function
+makes use of the convolution theorem, i.e., of the fact that the
+convolution of two (vectorized) functions in the time domain is
+equivalent to a pointwise product of the same functions in the frequency
+domain. The estimation of state occupation probabilities is thus
+simplified to
+$$\begin{aligned}
+ \hat{p}_{0n}:=&\mathcal{F}^{\scriptscriptstyle -1}\left\lbrace \hat{\mathrm q}_{0,1} \boldsymbol{\cdot} \hat{\mathrm q}_{1,2}\boldsymbol{\cdot} \mathrm{...}\boldsymbol{\cdot}\hat{\mathrm q}_{n-1,n}\boldsymbol \cdot \hat{\mathrm r}_{n}\right\rbrace\;,
+\end{aligned}$$
+where $\mathcal{F}$ denotes the discrete Fourier transform,
+$\hat{\mathrm{q}}_{j-1,j}:=\mathcal{F}(\hat{q}_{j-1,j})$ and
+$\hat{\mathrm{r}}_{n}:=\mathcal{F}(\hat{r}_{n})$. Conversion to and from
+the frequency domain is carried out using the fast Fourier transform
+algorithm implemented in the `fft` function of the base package `stats`.
+The Supporting Scripts and Data contain a short simulation study
+checking that state occupation probabilities can be accurately estimated
+with `probtrans_ebmstate` and `probtrans_fft` (see file ESM_1.html,
+sections 3 and 4).
+
+Figure \@ref(fig:figmssample) consists of a grid of plots with estimated
+curves of state occupation probabilities. It compares, in terms of speed
+and accuracy, the estimator in `probtrans_fft` with an estimator in
+`mstate::mssample` that has the same target, but is simulation-based.
+Each plot contains a black curve and a superimposed red curve. The red
+curves in any given column of the grid are all based on the same run of
+a function: columns 1 to 3 are based on runs of `mssample` with the
+number of samples $n$ equal to $100$, $1000$ and $10,000$ respectively,
+while column 4 is based on a run of `probtrans_fft`. Each column in the
+grid reproduces the same 4 black curves. These are based on a single run
+of `mssample` with $n=100,000$ and serve as benchmark. 
All function runs
+are based on the same input: a set of cumulative transition hazard
+estimates for a multi-state model with the 'linear' transition structure
+given in the leftmost diagram of figure
+\@ref(fig:figtransition-structures). Plots in a given row refer to the
+same state of the model. The running times on top of each column refer
+to the estimation of red curves. The main conclusion suggested by this
+analysis of simulated data is that `probtrans_fft` is as accurate as
+`mssample` with $n=10,000$, but it is almost 100 times faster (columns 3
+and 4). With $n=1000$, `mssample` achieves a good approximation to the
+true state occupation probabilities, but is still roughly 9 times
+slower. The details on how figure \@ref(fig:figmssample) and its
+underlying data were generated are given in the Supporting Scripts and
+Data (file ESM_1.html, section 5).
+
+### Interval estimation {#sec:interval_estimation}
+
+Under any model estimated by
+[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) -- as in
+general under a Bayesian model --, one can, if the sample size is large
+enough, approximate the posterior by a normal distribution with mean
+equal to the maximum *a posteriori* estimate and covariance matrix equal
+to the inverse of the generalised observed Fisher information [see, for
+example, @Gelman2014 p. 83-84]. This approximation has first-order
+accuracy and is thus outperformed by Laplace's method, which has
+second-order accuracy [@Carlin2009 p. 110-111]. However, as @Carlin2009
+[p. 112] observe, "for moderate- to high-dimensional $\boldsymbol\theta$
+(say, bigger than 10), Laplace's method will rarely be of sufficient
+accuracy\[\...\]". @Carlin2009 [p. 244-251] also describe three methods
+of interval estimation in empirical Bayes settings, but all of them are
+designed for fully parametric models. 
These reasons, along with the fact
+that regularised methods such as the one implemented in
+[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) are
+typically used to fit models with more than a dozen covariates, led us
+to choose the non-parametric bootstrap as the interval estimation method
+in [**ebmstate**](https://CRAN.R-project.org/package=ebmstate). Note
+that the non-parametric bootstrap can be given a Bayesian
+interpretation. Its interval estimates are approximately the same as
+those of a Bayesian model that assumes: a) a multinomial distribution
+for the data; and b) a non-informative Dirichlet prior distribution for
+the probability assigned to each category in the multinomial
+distribution. This is a specific case of the so-called Bayesian
+bootstrap [@Hastie2009 p. 272]. Further research is needed to determine
+the theoretical properties of the non-parametric bootstrap in the
+present setting, but this falls beyond the scope of the present paper.
+Interval estimates of regression coefficients, cumulative hazards and
+state occupation probabilities are implemented in the function
+`boot_ebmstate`.
+
+## Estimator performance {#sec:estimator_performance}
+
+It is a well-documented fact in the statistical literature that standard
+least-squares or maximum-likelihood estimators can often be improved by
+regularisation or shrinkage [see, for example, @Samworth2012]. This
+improvement comes about when the model dimensionality is high enough
+that the bias introduced by regularisation is outweighed by the
+reduction in the estimator variance. In the current setting, one might
+therefore ask: what kind of dimensionality does a semi-parametric,
+multi-state Cox model need to have to be outperformed by its empirical
+Bayes counterpart? A simulation study we carried out offers a tentative
+answer to this question, by comparing estimators under both Cox models
+for an increasing number of covariates. 
The study also features a third
+method, based on a fully non-parametric model, as a null model method.
+This was included to give an idea of how many covariates the empirical
+Bayes model can deal with before it becomes no better than a simple
+non-regressive model.
+
+### Simulation setup
+
+We assessed the performance of all estimators defined by the tuple
+$\left[a,m, G, n,p(n)\right]$, where $a\in \lbrace$regression
+coefficients, relative hazards, state occupation probabilities$\rbrace$
+is the target of estimation, $m\in \lbrace$standard Cox, empirical Bayes
+Cox, null$\rbrace$ is the assumed hazard model, $G \in \lbrace$linear,
+competing risks, 'm' structure$\rbrace$ is the transition structure of
+the model (illustrated in figure \@ref(fig:figtransition-structures))
+and $n\in \lbrace 100,1000\rbrace$ is the number of patients/disease
+histories in the training data set; the variable $p$ denotes the number
+of coefficients/covariates per transition in the true model and its
+range depends on $n$:
+$p\left(100\right) \in \lbrace 10,40,70,100 \rbrace$ whereas
+$p\left(1000\right) \in \lbrace 10,100,200,300,400,500\rbrace$. By
+'relative hazards' and 'state occupation probabilities', we mean here
+the relative transition hazards of an out-of-sample patient, and her
+state occupation probabilities at 7 chosen time points. We generated a
+batch of 300 independent absolute error observations ('NA' estimates
+included) for each estimator, where each observation is recorded after
+training the estimator on a newly simulated data set. Each boxplot in
+figures \@ref(fig:figestimator-performance-boxplots-100patients)
+($n=100$) and \@ref(fig:figestimator-performance-boxplots-1000patients)
+($n=1000$) is based on one of these batches. As all estimators are
+*vector* estimators, each absolute error is actually an *average*
+absolute error, where the average is taken over the components of the
+vector. 
+ +All training data sets were simulated from clock-reset Cox models. Apart +from $G$ (the model transition structure), $n$ and $p$, also the true +baseline hazards are held fixed within each batch of 300 training data +sets. The coefficient vectors used in the simulation are always +non-sparse and are scaled by $\sqrt{\frac{10}{p}}$ to keep the +log-hazard variance constant when the dimensionality grows. All +covariates are dichotomous and mutually independent. To compute the +coefficient errors for the non-parametric (null) model method, we think +of it as a degenerate Cox model in which all regression coefficient +estimates are fixed at zero. The estimation of regression coefficients +under the standard Cox and the empirical Bayes Cox models was performed +with `survival::coxph` and `ebmstate::CoxRFX` respectively; the +estimation of state occupation probabilities is based on +`mstate::probtrans` for the null model and on `ebmstate::probtrans_fft` +for both the standard Cox and the empirical Bayes Cox models. + +The reason we did not consider simulation scenarios with more than 500 +covariates per transition, in data sets of 1000 patients, was simply +computational cost. For example, generating the data and error +observations for the scenario with $n=1000$, $p=100$ and $G=$'m' +structure took less than one hour to generate using 20 CPU cores in +parallel; the same scenario but with $p=500$ took 6.5 days using 25 CPU +cores. More details about the simulation setup can be found in the +Supporting Scripts and Data (file ESM_1.html, section 6, subsection +'sample script'). + +### Missing values + +Whenever an estimator was able to compute a valid estimate of its target +for each training data set, i.e., when it did not return any 'NA' +estimates, its boxplots are based on 300 valid error observations. 
This +was always the case with non-parametric estimators: the estimates of +regression coefficients and relative hazards of this type of estimators +are trivial (fixed at zero and one respectively) and hence it is also +straightforward to compute absolute errors. It also happened that +non-parametric estimators of state occupation probabilities had no 'NA' +estimates (see file ESM_1.html, section 6, figure 6.3, in the Supporting +Scripts and Data). The situation was similar for the empirical Bayes Cox +model estimators, which showed no more than 5$\%$ missing estimates in +any of the simulation scenarios studied (ibid., figures 6.1 and 6.2). +However, for the standard Cox model ones, the number of 'NA' estimates +depends to a large extent on the number of patients in the data set, as +well as on the dimensionality and transition structure of the model +(figures \@ref(fig:figna-props-100patients-coxph) and +\@ref(fig:figna-props-1000patients-coxph)). In data sets of 100 +patients, it fares well in models with fewer than 10 covariates per +transition, or in models with up to 40 covariates, if the transition +structure is linear. Otherwise its failure rates range from roughly +25$\%$ to nearly 100$\%$. In data sets of 1000 patients, the proportion +of 'NA' estimates is never above 10$\%$, if the transition structure is +linear, but it can climb above 60$\%$ for other transition structures. + +### Comparison of estimators + +With respect to the performance of the three methods studied, the +boxplots in figures +\@ref(fig:figestimator-performance-boxplots-100patients) and +\@ref(fig:figestimator-performance-boxplots-1000patients) suggest the +following conclusions: + +- As $p/n$ grows, the empirical Bayes estimators quickly outperform + the standard Cox model ones. They already fare substantially better + at $p/n=0.1$ for both $n=100$ and $n=1000$ and for all estimation + targets. 
At the same time, the relative performance of the empirical + Bayes method with respect to the null model one decreases. At + $p/n=0.5$, the difference between these two methods is already + rather small for all simulation scenarios. + +- The relative performance of the empirical Bayes method with respect + to the null method decreases as the number of co-occurring + transition hazards in the model grows. All other things equal, the + empirical Bayes method has the best performance under the 'linear' + structure model, which has no competing transitions; it performs + less well under the 'm' structure transition model, where two + transition hazards can co-occur; and has the worse relative + performances under the 'competing risks' model, where three + transition hazards co-occur. This trend is clearer for $n=100$ + (figure \@ref(fig:figestimator-performance-boxplots-100patients)) + but can also be detected in the relative hazard errors for $n=1000$ + (figure \@ref(fig:figestimator-performance-boxplots-1000patients)). + In any case, the empirical Bayes method seems to be far more robust + than the standard Cox model against increases in the number of + co-occurring transition hazards. + +- Having as target the regression coefficients or the state occupation + probabilities, instead of relative hazards, makes the empirical + Bayes method better in comparison to the null method. In fact, as + $p/n$ grows, the empirical Bayes method is never outperformed by the + null method except in the estimation of relative hazards. + +## Survival analysis workflow + +The features of `mstate` were illustrated in @Wreede2010 using a simple +workflow. The starting point of this workflow is a data set in 'long +format'. Such data set can be fed into `survival::coxph` to obtain +estimates of the regression coefficients of a multi-state Cox model. 
The +resulting model fit object can be passed on to `mstate::msfit`, along +with a vector of covariates of a particular patient, to get personalised +estimates of the cumulative hazard functions. Finally, state occupation +probabilities for the same patient can be estimated if the object +created by `mstate::msfit` is fed into `mstate::probtrans`. In this +section, we describe how +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) extends the +scope of this workflow, i.e., how it uses the packages +[**survival**](https://CRAN.R-project.org/package=survival) and +[**mstate**](https://CRAN.R-project.org/package=mstate) to generate +estimates under a multi-state empirical Bayes Cox model. A diagram +summarising the extension is shown in figure \@ref(fig:figworkflow). In +the [5.5](#sec:model_assessment) subsection, we give some +recommendations on how to assess and compare models, but for more +detailed tutorials on how to analyse multi-state data using models +defined by transition hazards, we refer the reader to +@Putter2007tutorial and @Putter2011tutorial. + +The main steps of the +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) workflow are +here illustrated using a data set of patients with myelodysplastic +syndromes (MDS) which has been described and studied in +@Papaemmanuil2013. A myelodysplastic syndrome is a form of leukemia in +which the bone marrow is not able to produce enough mature blood cells, +and which sometimes develops into a cancer of white blood cells with a +quick and aggressive progression, i.e., into acute myeloid leukemia +(AML). Figure \@ref(fig:figtrans-diagrams)a illustrates an illness-death +type model for MDS patients and also gives a breakdown of the number of +transition events. The conversion to a model with a transition structure +that has no cycles (i.e., that can be handled by our convolution-based +estimators) is shown in figure \@ref(fig:figtrans-diagrams)b. 
The data +set used for model estimation, obtained after a number of pre-processing +steps, contains the disease history of 576 patients, as well as +measurements on 30 covariates. Of these 30 covariates, 11 are mutation +covariates and the remaining are clinical or demographic (see figure +\@ref(fig:figtrans-diagrams)c). The running time for the estimation of +relative transition hazards does not exceed 10 seconds in a standard +laptop computer. The same holds for the estimation of cumulative +transition hazards or state occupation probabilities for a given +patient. The complete R code underlying the data analysis in the current +section can be found in the Supporting Scripts and Data (file +ESM_2.html). For running only the R snippets shown below and reproduce +their results, the best option is to use the R script in file ESM_3.R of +the Supporting Scripts and Data. + +### Input data + +Table \@ref(table:long_format_data) shows a fragment of the MDS data +set. The data is in 'long format', which means that each row refers to a +period of risk for a given transition and patient. For example, row $i$ +tells us that, at time `Tstart[i]`, patient `id[i]` entered state +`from[i]`, and thereby began to be at risk for transition `trans[i]`, +i.e., at risk of going from state `from[i]` to state `to[i]`. If the +first transition of patient `id[i]` after time `Tstart[i]` occurs before +the last follow-up time for this patient, `Tstop[i]` records the time of +this transition (regardless of whether the patient moved to state +`to[i]` or not). Otherwise, `Tstop[i]` is set to the last follow-up +time. The value of `status[i]` is set to 1 if and only if the first +transition of patient `id[i]` after `Tstart[i]` is to state `to[i]` and +occurs before the last follow-up (otherwise it is set to 0). 
The value +of `time[i]` is defined simply as `Tstop[i]`$-$`Tstart[i]`, and +`strata[i]` is the stratum of the baseline hazard for transition +`trans[i]` (more about this variable in the following section). For `x` +$\in \left\lbrace \right.$ `ASXL1`, `DNMT3A`, +$\dots \left. \right \rbrace$, `x[i]` denotes the level of covariate `x` +between `Tstart[i]` and `Tstop[i]` in patient `id[i]`. (In the MDS data +set, we assume that the relative hazard of a patient is determined by +her covariate vector at $t=0$, i.e., we assume all covariates to be +time-fixed.) If a patient enters a new state, and this state +communicates directly with $n$ other states, then, as long as the +patient actually spends time in the new state (i.e. the time of +transition is not the same as the last follow-up time), $n$ rows must be +added to the data set, with each row corresponding to a different +possible transition. + +From table \@ref(table:long_format_data), we know that patient 77 +entered state 1 ('MDS') at time 0 and remained in this state until time +2029, when she moved to state 3 ('death before AML'). There are no rows +to describe the evolution of patient 77 after entering state 3, as this +state is an absorbing state. As to patient 78, she remained in state 1 +until time 332, and moved from there to state 2 ('AML'). She lived with +AML for 1117 days and moved to state 4 ('death after AML') at time 1449. + +``` r +id from to trans Tstart Tstop time status strata ASXL1 DNMT3A [...] +77 1 2 1 0 2029 2029 0 1 0 0 . +77 1 3 2 0 2029 2029 1 2 0 0 . +78 1 2 1 0 332 332 1 1 1 0 . +78 1 3 2 0 332 332 0 2 1 0 . +78 2 4 3 332 1449 1117 1 3 1 0 . +``` + +### Fitting an empirical Bayes Cox model {#sec:fit_bayes_cox_model} + +Once the data is in 'long format', the estimation of an empirical Bayes +model can be carried out using the function `CoxRFX`. 
A simple example +of the first argument of `CoxRFX`, denoted '`Z`', is a data frame +gathering the `trans`, `strata` and covariate columns of the data in +long format: + +``` r +outcome_covs <- c("id","from","to","trans","Tstart","Tstop","time","status", + "strata") +Z <- mstate_data[!names(mstate_data) %in% outcome_covs] +#(`mstate_data' has the data in long format) +``` + +The `strata` column determines which baseline hazard functions are +assumed to be equal. In table \@ref(table:long_format_data), each +transition is assumed to have a (potentially) different baseline hazard. +The model's assumptions regarding how covariates affect the hazard are +reflected on the format of the covariate columns of `Z`. When the `Z` +argument is the one created in the previous block of code, `CoxRFX` +returns a single regression coefficient estimate for each covariate. In +other words, the impact of any covariate is assumed to be the same for +every transition. + +There are however ways of relaxing this assumption. One can replace the +`ASXL1` column in Z (or any other covariate column) by several +'type-specific' `ASXL1` columns: the `ASXL1` column specific for type +$i$ would show the mutation status of `ASXL1` in rows belonging to +transition of type $i$, and show zero in all other rows. This would +force `CoxRFX` to estimate a (potentially) different `ASXL1` coefficient +for each transition type. This process of covariate expansion by type +can be based on any partition of the set of transitions. When each type +corresponds to a single transition, we refer to it simply as 'covariate +expansion by transition'. The output shown below illustrates the effect +of expanding the covariates in 'mstate_data' by transition. + +``` r +# Columns `id' and `trans' from `mstate_data' together with the first +# two expanded covariates (patients 77 and 78): + id trans ASXL1.1 ASXL1.2 ASXL1.3 DNMT3A.1 DNMT3A.2 DNMT3A.3 [...] + 77 1 0 0 0 0 0 0 . + 77 2 0 0 0 0 0 0 . + 78 1 1 0 0 0 0 0 . 
+ 78 2 0 1 0 0 0 0 . + 78 3 0 0 1 0 0 0 . +``` + +The example code given below shows how to use +[**mstate**](https://CRAN.R-project.org/package=mstate) to expand +covariates by transition and how to create a `Z` argument that makes +`CoxRFX` estimate a regression coefficient for each covariate for +transitions 1 and 2, and assume a fully non-parametric hazard for +transition 3. + +``` r +# To expand covariates by transition using mstate::expand.covs, +# first set the class of `mstate_data' as +class(mstate_data) <- c("data.frame","msdata") + +# then add the transition matrix as attribute: +attr(mstate_data,"trans") <- tmat +#(`tmat' is the output of mstate::transMat) + +# Expand covariates by transition: +covariates_expanded_123 <- mstate::expand.covs( + mstate_data, + covs = names(mstate_data)[! names(mstate_data) %in% outcome_covs], + append = F +) + +# remove all covariates for transition 3 from `covariates_expanded_123' +# to fit a fully non-parametric model on this transition: +covariates_expanded_12 <- covariates_expanded_123[ + !grepl(".3",names(covariates_expanded_123),fixed = T) +] + +#argument `Z' of coxrfx +Z_12 <- data.frame(covariates_expanded_12,strata = mstate_data$trans, + trans = mstate_data$trans) +``` + +The second argument of `CoxRFX` ('`surv`') is a survival object that can +easily be built by feeding the outcome variable columns of the data to +the function `Surv` (from the package +[**survival**](https://CRAN.R-project.org/package=survival)). Whether +`CoxRFX` fits a clock-forward model or a clock-reset model depends on +the kind of survival object: + +``` r +#argument `surv' for a clock-forward model +surv <- Surv(mstate_data$Tstart,mstate_data$Tstop,mstate_data$status) + +#argument `surv' for a clock-reset model +surv <- Surv(mstate_data$time,mstate_data$status) +``` + +The argument `groups` of `CoxRFX` is a vector whose length equals the +number of covariates in the data. 
In other words, the length of `groups` +is `ncol(Z)-2`, since the argument `Z` must include both the covariate +data and the `strata` and `trans` columns. If, for $i \neq j$, +`groups[i]`=`groups[j]` $=\text{`foo'}$, this means that the regression +coefficients of the $i^{th}$ and $j^{th}$ covariates of `Z` both belong +to a group named 'foo' of coefficients with the same prior. For the `Z` +object built above, the `groups` argument created in the following block +of code embodies the assumption that all coefficients associated with a +given transition have the same prior distribution. The final line of +code fits the empirical Bayes model. + +``` r +#argument `groups' of coxrfx +groups_12 <- paste0(rep("group",ncol(Z_12)-2),c("_1","_2")) + +#fit random effects model +model_12 <- CoxRFX(Z_12,surv,groups_12,tmat) +``` + +Figure \@ref(fig:figcoef-plots) shows regression coefficient point +estimates for a clock-reset, empirical Bayes model fitted with the code +above. Also shown are 95% non-parametric bootstrap confidence intervals +computed using `ebmstate::boot_ebmstate`. The $x$-axis scale is +logarithmic to allow estimates to be read as relative hazards more +easily. For example, a mutation in *RUNX1* is associated with a twofold +increase in the hazard of progression from MDS to AML, and treatment +centre 4 is associated with a 3-fold increase in the hazard of dying +before progressing to AML, when compared to the baseline value of +'treatment centre' (treatment centre = 2 or 5). In covariates that have +been log-transformed (age, platelet count and neutrophil count) or +logit-transformed (proportions of myeloblasts and ring sideroblasts in +the bone marrow), the interpretation of estimates is different. 
For +example, an increase in age by a factor of $e$ ($\approx 2.72$) almost +triples the hazard of dying before AML; the same increase in the ratio +$bm\_blasts/(1-bm\_blasts)$ (where *bm_blasts* is the proportion of +myeloblasts in the bone marrow) is associated with an increment in the +hazard of dying before AML of approximately $16\%$. + +### Computing cumulative transition hazard estimates {#sec:computing_cumulative_hazards} + +The function `msfit_generic` is the generic function in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) that +computes cumulative transition hazards for a given set of covariate +values and an estimated Cox model. It calls a different method according +to the class of its `object` argument. The default method corresponds to +the original `msfit` function of the +[**mstate**](https://CRAN.R-project.org/package=mstate) package and is +appropriate for objects of class `coxph`, i.e., objects that contain the +fit of a Cox model with fixed effects. The other available method for +`msfit_generic`, `msfit_generic.coxrfx`, is just the original `msfit` +function, (slightly) adapted to deal with objects generated by `CoxRFX`. +Quite importantly, `msfit_generic.coxrfx` does not allow the variance of +the cumulative hazards to be computed, as this computation relies on +asymptotic results which may not be valid for an empirical Bayes model. +As a result, it only has two other arguments apart from the object of +class `coxrfx`: a data frame with the covariate values of the patient +whose cumulative hazards we want to compute; and a transition matrix +describing the states and transitions in the model (such as the one that +can be generated using `transMat` from the package +[**mstate**](https://CRAN.R-project.org/package=mstate)). The following +block of code exemplifies how these objects can be built and generates +the `msfit` object containing the cumulative transition hazard estimates +for a sample patient. 
Note that the object with the patient data must +include a row for each transition, as well as a column specifying the +transition stratum of each row of covariates. + +``` r +# Build `patient_data' data frame with the covariate values for which +# cumulative hazards are to be computed (covariate values of patient 78): +patient_data <- mstate_data[mstate_data$id == 78,,drop = F][rep(1,3),] +patient_data$strata <- patient_data$trans <- 1:3 +patient_data <- mstate::expand.covs( + patient_data, + covs = names(patient_data)[ ! names(patient_data) %in% outcome_covs], + append = T +) +patient_data <- patient_data[ ! grepl(".3",names(patient_data),fixed = T)] + +# The `patient_data' data frame has only 3 rows (one for each transition). +# The output below shows its `id' and `trans' columns +# and expanded covariates ASXL1 and DNMT3A: + id trans ASXL1.1 ASXL1.2 DNMT3A.1 DNMT3A.2 [...] + 78 1 1 0 0 0 . + 78 2 0 1 0 0 . + 78 3 0 0 0 0 . + +# compute cumulative hazards +msfit_object_12 <- msfit_generic(model_12,patient_data,tmat) +``` + +Figure \@ref(fig:figpatient78-cumhaz) shows three plots of estimated +cumulative transition hazards for the sampled patient, one for each +transition in the model, along with $95\%$ non-parametric bootstrap +confidence intervals (computed with `ebmstate::boot_ebmstate`). +Throughout the plotted period, the 'slope' of the cumulative hazard +(i.e., the hazard rate) for the MDS to AML transition is lower than the +one for the MDS to death transition, and this in turn is lower than the +one for the AML to death transition. It should be recalled that the +cumulative hazard estimate is strictly non-parametric for this last +transition, i.e., it is the same for all patients. The central plot of +figure \@ref(fig:figpatient78-cumhaz) suggests that, as time since +diagnosis goes by, the hazard of dying in MDS increases (possibly an +effect of age). On the other hand, the hazard of dying in AML seems to +decrease (slightly) with time (rightmost plot). 
Conclusions regarding +the evolution of the AML hazard are hard to draw, since the confidence +intervals for the corresponding cumulative hazard curve are very wide +(leftmost plot). + +If an object generated by `msfit_generic` is fed to `plot`, and the +package [**mstate**](https://CRAN.R-project.org/package=mstate) is +loaded, the method `mstate:::plot.msfit` will be called. This is an +efficient way of automatically plotting the cumulative hazard estimates +for all transitions, but confidence interval lines (separately +estimated) cannot be added. + +### Computing state occupation probability estimates {#sec:computing_transition_probs} + +The functions `probtrans_mstate`, `probtrans_ebmstate` and +`probtrans_fft` compute estimates of state occupation probabilities for +a given `msfit` object. All three functions generate objects of class +`probtrans` that can be fed to the `plot.probtrans` method from the +package [**mstate**](https://CRAN.R-project.org/package=mstate). The +first of these functions should only be used for clock-forward models, +as it relies on product-limit calculations. It calls the method +`probtrans_mstate.default`, if the `msfit` object was generated by +`msfit_generic.default`, or the method `probtrans_mstate.coxrfx`, if it +was generated by `msfit_generic.coxrfx`. Both methods are identical to +the function `probtrans` in the +[**mstate**](https://CRAN.R-project.org/package=mstate) package, with +the reserve that `probtrans_mstate.coxrfx` does not allow the +computation of the variances or covariances of the state occupation +probability estimator. + +The functions `probtrans_ebmstate` and `probtrans_fft` are the functions +in [**ebmstate**](https://CRAN.R-project.org/package=ebmstate) for the +computation of state occupation probability estimates under clock-reset +models with a transition structure that has no cycles. 
When using +`probtrans_fft` (the faster, but somewhat less stable, of these two +functions), three arguments must be supplied: the initial state of the +process whose state occupation probabilities one wishes to compute, the +`msfit` object, and the upper time limit for the generation of estimates +(`max_time`). Both functions are based on a discrete-time approximation +to a series of convolutions. The default argument `nr_steps` controls +the number of (equally spaced) time steps used in this approximation. +The arguments `max_time` and `nr_steps` should be increased until the +estimated curves become stable. + +The following line of code computes point estimates of state occupation +probabilities for the sample patient. + +``` r +probtrans_object_12 <- probtrans_fft("MDS",msfit_object_12, max_time = 4000) +``` + +Estimates are shown in figure \@ref(fig:figpatient78-transProbs), along +with $95\%$ non-parametric, bootstrap confidence intervals. For this +particular patient, the estimated probability of being dead after AML +remains below 0.4 throughout a period of 10 years from the MDS +diagnosis; if the patient does reach AML, death is expected to happen +quickly thereafter, as reflected in the very low estimates for the +probability of being in AML at any point in time. 
The following block of +code shows how to compute confidence intervals with `boot_ebmstate`: + +``` r +# Creating the object arguments for boot_ebmstate() + +# `groups' arguments was already created, but we need to add names to it +names(groups_12) <- names(covariates_expanded_12) + +# `mstate_data_expanded' argument (similar to `covariates_expanded' but +# including outcome variables) +mstate_data_expanded <- cbind( + mstate_data[names(mstate_data) %in% outcome_covs], + covariates_expanded_12 +) + +# create the non-parametric bootstrap confidence intervals +boot_ebmstate_object <- boot_ebmstate( + mstate_data = mstate_data_expanded, + which_group = groups_12, + min_nr_samples = 100, + patient_data = patient_data, + tmat = tmat, + initial_state = "MDS", + time_model = "clockreset", + input_file = NULL, + coxrfx_args = list(max.iter = 200), + probtrans_args = list(max_time = 4000) +) +``` + +### Model assessment {#sec:model_assessment} + +For any model fitted with +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), two +performance metrics can be easily computed: the *concordance* statistic +([@harrell1982evaluating]; see also the help page of +`survival::concordance` for the definition of concordance) and the +*Bayesian Information Criterion* (BIC) score [@schwarz1978estimating]. +As an example of how these two metrics can be obtained and used for +model comparison, suppose we wish to compare 'model_12' fitted above -- +which consists of a Cox regression including all covariates for +transitions 1 and 2 and a fully non-parametric model for transition 3 -- +with a model that combines Cox regressions of all covariates for each of +the three transitions (denoted 'model_123' below). The following code +snippet shows how to fit this second model. 
+ +``` r +# arguments `groups' and `Z' for fitting a Cox regression model on all transitions +Z_123 <- data.frame( + covariates_expanded_123, + strata = mstate_data$trans, + trans = mstate_data$trans +) +groups_123 <- paste0(rep("group", ncol(Z_123) - 2), c("_1", "_2", "_3")) + +# Fit a Cox regression model for all transitions +model_123 <- CoxRFX(Z = Z_123, surv = surv, groups = groups_123) +``` + +Running the `concordance` function in the +[**survival**](https://CRAN.R-project.org/package=survival) package for +each model yields the following output: + +``` r +> concordance(model_12) + Call: + concordance.coxph(object = model_12) + + n= 1210 + Concordance= 0.8131 se= 0.01314 + concordant discordant tied.x tied.y tied.xy + strata=1 18040 2783 0 1 0 + strata=2 37919 9678 0 7 0 + strata=3 0 0 1052 0 4 + +> concordance(model_123) + Call: + concordance.coxph(object = model_123) + + n= 1210 + Concordance= 0.8168 se= 0.01312 + concordant discordant tied.x tied.y tied.xy + strata=1 18041 2782 0 1 0 + strata=2 37920 9677 0 7 0 + strata=3 784 268 0 4 0 +``` + +The output shows that modelling transition 3 with a Cox model, instead +of a fully non-parametric one, has a negligible impact on the overall +concordance. However, this is due to the fact that there are far fewer +observations for this transition. The concordance for transition 3 only, +which corresponds to strata 3, is 0.5 under the fully non-parametric +model (i.e., all patients are assigned the same transition hazard) and +considerably higher under the Cox regression ($784/(784+268)=0.75$). +Ideally, the comparison of models of different complexity should be +carried out on a test sample rather than on the training data. For this +purpose, the test data can be input into the `concordance` function +(argument `newdata`). However, in the present case, only 61 patients +were ever at risk of dying with AML (i.e. 
of undergoing transition 3), +and of these only 41 actually died, so we might prefer to keep all +patients in the training data, rather than saving a fraction of them for +testing purposes. Such an option will yield more accurate coefficient +estimates, at the expense of not allowing the computation of unbiased +estimates of model performance. If the goal is only to compare models, +we can make do without test data, by using an information score that +penalises model complexity, such as the BIC. To facilitate model +comparison, the BIC score is one of the attributes of the model fit +object: + +``` r +> model_12$BIC + [1] 2508.37 +> model_123$BIC + [1] 2483.49 +``` + +The best model is the one with the lowest score, so the choice of +'model_123' is confirmed. + +## Discussion + +We have shown that +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is suitable +for higher-dimensional, multi-state survival analysis, and that it is +both efficient and easy-to-use. To a significant extent, the +user-friendliness of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) stems from +the fact that it was not built 'from the ground up'. Instead, we +produced a package that is more easily accessible to the many users of +[**mstate**](https://CRAN.R-project.org/package=mstate) by taking +advantage of whichever features of this package were useful to our +method and by eliminating redundancies. The connection between +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) and +[**mstate**](https://CRAN.R-project.org/package=mstate) is based on the +fact that the function `CoxRFX` takes the same type of input and +produces the same type of output as `coxph` from the package `survival`, +and the function `probtrans_fft` (or `probtrans_ebmstate`) has the same +type of input and output as `probtrans` from +[**mstate**](https://CRAN.R-project.org/package=mstate) (as shown in +figure \@ref(fig:figworkflow)). 
+ +We also sought to improve our package's user-friendliness by making it +as efficient as possible. The reduction of computational cost is based +on two features. First, our empirical Bayes method relies on an +expectation-maximisation algorithm that estimates both the parameters +and the hyper-parameters of the model, i.e., no further tuning of the +model is required. Second, in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +computation of state occupation probability estimates relies on +analytical results rather than on simulation: not only for clock-forward +models, where we import from +[**mstate**](https://CRAN.R-project.org/package=mstate) a product-limit +estimator, but also for clock-reset models, where we implement our own +estimator based on a convolution argument and the fast Fourier +transform. + +To our knowledge, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is the first +R package to put together a framework for multi-state model estimation +that is complete and suitable for higher-dimensional data. It does so by +implementing point and interval estimators of regression coefficients, +cumulative transition hazards and state occupation probabilities, under +regularised multi-state Cox models. In section +[4](#sec:estimator_performance), the results of the simulation study +suggest that for data sets with 100 patients or more and a ratio of $p$ +(coefficients per transition) to $n$ (patients) greater than 0.1, the +standard Cox model estimator is clearly outperformed by the empirical +Bayes one when it comes to the estimation of relative hazards and state +occupation probabilities of an out-of-sample patient, or the regression +coefficients of the model. However, the same study suggests that using +an empirical Bayes method instead of a fully non-parametric one is of +limited or no value in settings where $p/n \geq 1$. 
This loss of +usefulness can already happen for $p/n\leq 1/2$ when it comes to the +estimation of the relative hazards of an out-of-sample patient, +especially for transition structures with multiple competing +transitions. + +As mentioned in previous sections, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) imports a +product-limit estimator from +[**mstate**](https://CRAN.R-project.org/package=mstate) that targets the +state occupation probabilities of patients with *time-fixed* covariate +vectors. However, these estimators are extendible to models with +time-dependent covariates, as long as these are external and the +estimates are conditional on specific covariate paths [@Aalen2008 p. +142]. For piecewise constant covariates, it is likely that such an +adaptation could be obtained by combining transition probability +estimates obtained for each period in which the covariates are fixed. +While no significant theoretical obstacles are foreseen in this matter, +the computer implementation for more than a single piecewise constant +covariate is likely to be a laborious task. We have left it therefore +for future work. + +## Acknowledgements {#acknowledgements .unnumbered} + +The authors are supported by grant NNF17OC0027594 from the Novo Nordisk +Foundation. We thank an anonymous reviewer for their constructive +comments and helpful suggestions which led to a much-improved +manuscript. + +## Supporting Scripts and Data {#supporting-scripts-and-data .unnumbered} + +In the supporting Scripts and Data, the file `ESM_1.html` contains +additional simulation results and theoretical demonstrations. Additional +details on the analysis of the MDS data set are given in the file +`ESM_2.html`. The MDS data set is in files `MDS.TPD.20Nov2012.csv` and +`mds.paper.clin.txt`. The file `ESM_3.R` contains a simplified R script +to run the code snippets in the present paper. The +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) package is +available on CRAN. 
+ +## Conflict of interest + +The authors have declared no conflict of interest. + +**Figures** + +```{r figpackage-summary-figure, echo=FALSE , fig.cap="Summary of inputs and outputs of the package ebmstate. The input data set should be one that violates the assumption – commonly used in survival analysis – that the number of observations is much larger than the number of parameters to be estimated (a genomic-clinical data set is shown as a typical example). The input model is a multi-state Cox model defined by a transition structure and a prior distribution on the regression coefficients. This prior distribution is defined by partitioning the vector of regression coefficients into groups of regression coefficients, with each group having its own Gaussian prior with undetermined mean and variance. The outputs of ebmstate include estimates of the relative transition hazards associated with each covariate, as well as estimates of the probability that a specific patient (with specific covariate measurements) has of occupying each state of the model over some time period. Estimates of cumulative transition hazards are omitted from the figure.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/package_summary_figure.png")) +``` + +```{r figmssample, echo=FALSE , fig.cap="Comparison of running times and estimation accuracy of mssample and probtrans_fft. Each plot in the grid shows two estimated curves of state occupation probabilities. The black curves are based on a single run of mstate::mssample with n=100.000 observations (approximately 17 minutes of running time) and are the same across columns. They serve as benchmark for precision assessment. In columns 1 to 3 of the grid, the superimposed red curves are based on a run of mssample with respectively 100, 1000, and 10.000 observations. In the rightmost column, the red curves are based on a run of probtrans_fft. 
All functions have as input the same set of cumulative transition hazards. These were estimated using a non-parametric multi-state model and a data set of 1000 patients generated according to a clock-reset Cox model with a ‘linear’ transition structure (leftmost diagram of figure 3). Plots in the same row refer to the same state of the model, while those in the same column refer to the same run of a function. Running times and, where appropriate, number of simulations (n) are given on top of each column.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/mssample_and_probtrans_fft.png")) +``` + +```{r figtransition-structures, echo=FALSE , fig.cap="Model transition structures. We studied the performance of Cox model estimators, empirical Bayes Cox model estimators and fully non-parametric estimators with respect to these 3 transition structures.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/transition_structures.png")) +``` + +```{r figna-props-100patients-coxph, echo=FALSE , fig.cap="Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 6 (100 patients per simulated data set).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/na_props_100patients_coxph.png")) +``` + +```{r figna-props-1000patients-coxph, echo=FALSE , fig.cap="Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 7 (1000 patients per simulated data set).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/na_props_1000patients_coxph.png")) +``` + +```{r figestimator-performance-boxplots-100patients, echo=FALSE , fig.cap="Performance 
comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 100 observations each. In the figure grid there is a boxplot corresponding to every tuple (a,m, G, p) such that a\in \lbrace regression coefficients, relative hazards, state occupation probabilities\rbrace is the target of estimation, m\in \lbrace standard Cox, empirical Bayes Cox, null\rbrace is the hazard model, G \in \lbrace linear, competing risks, ‘m’ structure\rbrace is the transition structure of the model, and p \in \lbrace 10,40,70,100 \rbrace is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 4, together with figures 6.1 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, shows the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. ", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/estimator_performance_boxplots_100patients.png")) +``` + +```{r figestimator-performance-boxplots-1000patients, echo=FALSE , fig.cap=" Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 1000 observations each. 
In the figure grid there is a boxplot corresponding to every tuple (a,m, G, p) such that a\in \lbrace regression coefficients, relative hazards, state occupation probabilities\rbrace is the target of estimation, m\in \lbrace standard Cox, empirical Bayes Cox, null\rbrace is the hazard model, G \in \lbrace linear, competing risks, ‘m’ structure\rbrace is the transition structure of the model, and p \in \lbrace 10,100,200,300,400,500 \rbrace is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 5, together with figures 6.2 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, shows the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. ", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/estimator_performance_boxplots_1000patients.png")) +``` + +```{r figworkflow, echo=FALSE , fig.cap="Extension of the mstate analysis framework by ebmstate. Arrows correspond to functions. Boxes correspond to inputs or outputs of functions. Functions CoxRFX and probtrans_fft from ebmstate compute point estimates only. 
Interval estimates can be obtained using the non-parametric bootstrap algorithm implemented in the function ebmstate::boot_ebmstate.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/workflow0.png")) +``` + +```{r figtrans-diagrams, echo=FALSE , fig.cap="a: transition model implied by the data set of patients with myelodysplastic syndromes, together with transition event numbers; b: conversion to a transition structure without cycles; c: transformations applied to the MDS covariate data and summary statistics for the data before transformation. MDS stands for myelodysplastic syndromes; AML stands for acute myeloid leukemia.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/data_summary_figs2.png")) +``` + +```{r figcoef-plots, echo=FALSE , fig.cap="Point estimates of regression coefficients for the Cox model fitted to the MDS data, along with 95% non-parametric bootstrap confidence intervals. The x-axis scale is logarithmic so that coefficient estimates can be read as relative hazard estimates. If \gamma_{ij} is the element of \hat{\boldsymbol{\beta}}_{ij} associated with a given covariate, \exp\left(\gamma_{ij}\right) is the estimated relative hazard for this covariate in transition \left(i,j\right). In general, a relative hazard estimate r for a covariate z in transition \left(i,j\right) means that a one-unit increase in z is associated with an r-fold increase in the hazard of this transition. If z was obtained by log-transformation (as in age, platelet counts and neutrophil counts), a one-unit increase in z corresponds to scaling the original covariate by e\approx 2.72. 
In case z was obtained by logit-transformation (as in bone marrow blasts and sideroblasts proportions), the same one-unit increase corresponds to scaling the odds of the original covariate by e.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/coef_plots.png")) +``` + +```{r figpatient78-cumhaz, echo=FALSE , fig.cap="Point estimates of cumulative transition hazards for a sample patient with MDS (black curve), along with 95\% non-parametric confidence intervals (dashed red lines).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/patient78_cumhaz_final.png")) +``` + +```{r figpatient78-transProbs, echo=FALSE , fig.cap="Point estimates of state occupation probabilities for a sample patient with MDS (black curve), along with 95\% non-parametric confidence intervals (dashed red lines).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/patient78_transProbs_final.png")) +``` +::: diff --git a/_articles/RJ-2024-002/RJwrapper.tex b/_articles/RJ-2024-002/RJwrapper.tex new file mode 100644 index 0000000000..803f1eab19 --- /dev/null +++ b/_articles/RJ-2024-002/RJwrapper.tex @@ -0,0 +1,30 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + +%% load any required packages FOLLOWING this line +\usepackage{dsfont} +\usepackage{listings} +\lstset{basicstyle=\ttfamily} +\usepackage{natbib} + + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{15} + +%% replace RJtemplate with your article +\begin{article} + \input{costa-gerstung} +\end{article} + +\end{document} diff --git 
a/_articles/RJ-2024-002/Rlogo-5.png b/_articles/RJ-2024-002/Rlogo-5.png new file mode 100644 index 0000000000..077505788a Binary files /dev/null and b/_articles/RJ-2024-002/Rlogo-5.png differ diff --git a/_articles/RJ-2024-002/costa-gerstung.R b/_articles/RJ-2024-002/costa-gerstung.R new file mode 100644 index 0000000000..e69de29bb2 diff --git a/_articles/RJ-2024-002/costa-gerstung.bib b/_articles/RJ-2024-002/costa-gerstung.bib new file mode 100644 index 0000000000..31b9356dc1 --- /dev/null +++ b/_articles/RJ-2024-002/costa-gerstung.bib @@ -0,0 +1,453 @@ +% An example bibliography .bib file. + +%This is a bibliography of extremes papers Version 11 January 2002 +%@PREAMBLE{"\newcommand{\noopsort}[1]{} " } + +% Note that spaces are needed to get all authors initials, +% ie D. R. Cox or Cox, D. R. are both ok. The full stops are NOT +% needed ie D R Cox and Cox, D R also both work! + +@article{Aalen1989, + title={A linear regression model for the analysis of life times}, + author={Aalen, Odd O}, + journal={Statistics in Medicine}, + volume={8}, + number={8}, + pages={907--925}, + year={1989}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.4780080803} +} + +@BOOK{Aalen2008, + author="Aalen,O and Borgan, O and Gjessing, H ", + title="Survival and event history analysis", + year=2008, + publisher="Springer", + address="", + note="", + url={https://link.springer.com/book/10.1007/978-0-387-68560-1}} + +@BOOK{Andersen1993, + author="Andersen,PK and Borgan, O and Gill, RD and Keiding, N ", + title="Statistical Models Based On Counting Processes", + year=1993, + publisher="Springer", + address="", + note="", + url={https://link.springer.com/book/10.1007/978-1-4612-4348-9}} + +@article{Cortese2010, + title={Competing risks and time-dependent covariates}, + author={Cortese, Giuliana and Andersen, Per K}, + journal={Biometrical Journal}, + volume={52}, + number={1}, + pages={138--158}, + year={2010}, + publisher={Wiley Online Library}, + 
url={https://doi.org/10.1002/bimj.200900076} +} + +@BOOK{Carlin2009, + author={Carlin,BP and Louis, TA}, + title={Bayesian Methods for Data Analysis}, + year={2009}, + publisher={CRC Press}, + url={https://doi.org/10.1201/b14884} + } + +@Article{flexsurv_package, + title = {{flexsurv}: A Platform for Parametric Survival Modeling in + {R}}, + author = {Christopher Jackson}, + journal = {Journal of Statistical Software}, + year = {2016}, + volume = {70}, + number = {8}, + pages = {1--33}, + doi = {10.18637/jss.v070.i08}, + } + +@article{Karoui2018, + title={Can we trust the bootstrap in high-dimensions? {T}he case of linear models}, + author={El Karoui, Noureddine and Purdom, Elizabeth}, + journal={The Journal of Machine Learning Research}, + volume={19}, + number={1}, + pages={170--235}, + year={2018}, + publisher={JMLR. org}, + url={https://jmlr.org/papers/v19/17-006.html} +} + +@article{gamboostMSM_package, + title={gamboostMSM}, + author={Reulen, Holger}, + journal={R package version}, + pages={1.1.87}, + year={2014}, + url={https://CRAN.R-project.org/package=gamboostMSM} +} + +@BOOK{Gelman2014, + author="Gelman,A and Carlin, JB and Stern, HS and Dunson, DB and Vehtari,A and Rubin, DB ", + title="Bayesian Data Analysis", + year=2014, + publisher="CRC Press", + address="", + note="", + url={https://doi.org/10.1201/b16018}} + +@article{Gerds2014, + title={Calibration plots for risk prediction models in the presence of competing risks}, + author={Gerds, Thomas A and Andersen, Per K and Kattan, Michael W}, + journal={Statistics in medicine}, + volume={33}, + number={18}, + pages={3191--3203}, + year={2014}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.6152} +} + + +@article{Grinfeld2018, +title = {Personalized Prognostic Predictions for Patients with Myeloproliferative Neoplasms through Integration of Comprehensive Genomic and Clinical Information}, +journal = {Blood}, +volume = {130}, +pages = {491}, +year = {2017}, +issn = {0006-4971}, 
+doi = {https://doi.org/10.1182/blood.V130.Suppl_1.491.491}, +url = {https://www.sciencedirect.com/science/article/pii/S000649711981008X}, +author = {Jacob Grinfeld and Jyoti Nangalia and E Joanna Baxter and Anna L. Godfrey and Paola Guglielmelli and Rob Cantrill and David Wedge and Nicos Angelopoulos and Gunes Gundem and Charlie Massie and Elli Papaemmanuil and Cathy MacLean and Julia Cook and Francesca Lauren Nice and Christen Lykkegaard Andersen and Hans Carl Hasselbalch and Mary Frances McMullin and Alessandro M. Vannucchi and Claire N. Harrison and Moritz Gerstung and Peter J Campbell and Anthony R Green}, +} + +@article{harrell1982evaluating, + author = {Harrell, Frank E., Jr and Califf, Robert M. and Pryor, David B. and Lee, Kerry L. and Rosati, Robert A.}, + title = "{Evaluating the Yield of Medical Tests}", + journal = {JAMA}, + volume = {247}, + number = {18}, + pages = {2543-2546}, + year = {1982}, + month = {05}, + issn = {0098-7484}, + doi = {10.1001/jama.1982.03320430047030}, + url = {https://doi.org/10.1001/jama.1982.03320430047030}, + eprint = {https://jamanetwork.com/journals/jama/articlepdf/372568/jama\_247\_18\_030.pdf}, +} + + +@book{Hastie2009, + title={The elements of statistical learning: data mining, inference, and prediction}, + author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H and Friedman, Jerome H}, + volume={2}, + year={2009}, + publisher={Springer}, + url={https://link.springer.com/book/10.1007/978-0-387-84858-7} +} + +@article{Hoff2019, + Author = {Hoff, Rune and Putter, Hein and Mehlum, Ingrid Sivesind and Gran, Jon Michael}, + Da = {2019/10/01}, + Date-Added = {2021-02-19 11:01:01 +0000}, + Date-Modified = {2021-02-19 11:01:01 +0000}, + Doi = {10.1007/s10985-019-09474-0}, + Id = {Hoff2019}, + Isbn = {1572-9249}, + Journal = {Lifetime Data Analysis}, + Number = {4}, + Pages = {660--680}, + Title = {Landmark estimation of transition probabilities in non-Markov multi-state models with covariates}, + Ty = {JOUR}, + 
Url = {https://doi.org/10.1007/s10985-019-09474-0}, + Volume = {25}, + Year = {2019}, + Bdsk-Url-1 = {https://doi.org/10.1007/s10985-019-09474-0}} + + +@article{Hougaard1999, + title={Multi-state models: a review}, + author={Hougaard, Philip}, + journal={Lifetime data analysis}, + volume={5}, + number={3}, + pages={239--264}, + year={1999}, + publisher={Springer}, + url={https://doi.org/10.1023/A:1009672031531} +} + +@Article{Jackson2011, + title = {Multi-state models for panel data: the {msm} package for {R}}, + author = {Christopher H. Jackson}, + journal = {Journal of Statistical Software}, + year = {2011}, + volume = {38}, + number = {8}, + pages = {1--29}, + url = {http://www.jstatsoft.org/v38/i08/}, + } + + @book{Kalbfleisch2002, + title={The statistical analysis of failure time data}, + author={Kalbfleisch, John D and Prentice, Ross L}, + year={2002}, + publisher={John Wiley \& Sons}, + doi={10.1002/9781118032985}, + } + +@article{Listwon2015, + TITLE = {{SemiMarkov: An R Package for Parametric Estimation in Multi-State Semi-Markov Models}}, + AUTHOR = {Listwon, Agnieszka and Saint-Pierre, Philippe}, + URL = {https://hal.archives-ouvertes.fr/hal-00860244}, + JOURNAL = {{Journal of Statistical Software}}, + PUBLISHER = {{University of California, Los Angeles}}, + VOLUME = {66}, + NUMBER = {6}, + PAGES = {784}, + YEAR = {2015}, + DOI = {10.18637/jss.v066.i06}, + KEYWORDS = {exponentiated Weibull distribution ; multi-state semi-Markov models ; parametric estimation ; asthma ; R package}, + PDF = {https://hal.archives-ouvertes.fr/hal-00860244/file/Listwon_SaintPierre_HAL.pdf}, + HAL_ID = {hal-00860244}, + HAL_VERSION = {v1}, +} + +@article{mboost_package, + title={mboost: Model-Based Boosting}, + author={Hothorn, Torsten and Buehlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin}, + journal={R package version}, + pages={2.9-3}, + year={2020}, + url={https://CRAN.R-project.org/package=mboost} +} + +@article{Morris1983, + title={Parametric 
empirical Bayes inference: theory and applications}, + author={Morris, Carl N}, + journal={Journal of the American Statistical Association}, + volume={78}, + number={381}, + pages={47--55}, + year={1983}, + publisher={Taylor \& Francis Group}, + url={https://doi.org/10.1080/01621459.1983.10477920} +} + +@article{Papaemmanuil2013, + title={Clinical and biological implications of driver mutations in myelodysplastic syndromes}, + author={Papaemmanuil, Elli and Gerstung, Moritz and Malcovati, Luca and Tauro, Sudhir and Gundem, Gunes and Van Loo, Peter and Yoon, Chris J and Ellis, Peter and Wedge, David C and Pellagatti, Andrea and others}, + journal={Blood}, + volume={122}, + number={22}, + pages={3616--3627}, + year={2013}, + publisher={Am Soc Hematology}, + url={https://doi.org/10.1182/blood-2013-08-518886} +} + +@BOOK{Pawitan2001, + author="Pawitan, Y", + title="In All Likelihood", + year=2001, + publisher="Oxford University Press", + address="", + note="", + url={https://global.oup.com/academic/product/in-all-likelihood-9780199671229?cc=gb&lang=en&#} + } + +@article{penMSM_package, + title={penMSM}, + author={Reulen, Holger}, + journal={R package version}, + pages={0.99}, + year={2015}, + url={https://CRAN.R-project.org/package=penMSM} +} + +@article{Perperoglou2014, + title={Cox models with dynamic ridge penalties on time-varying effects of the covariates}, + author={Perperoglou, Aris}, + journal={Statistics in Medicine}, + volume={33}, + number={1}, + pages={170--180}, + year={2014}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.5921} +} + +@Article{Putter2011, + title = {{mstate}: An {R} Package for the Analysis of Competing Risks and Multi-State Models}, + author = {Liesbeth C. 
{de Wreede} and Marta Fiocco and Hein Putter}, + journal = {Journal of Statistical Software}, + year = {2011}, + volume = {38}, + number = {7}, + pages = {1--30}, + url = {http://www.jstatsoft.org/v38/i07/}, + } + +@article{Putter2007tutorial, + title={Tutorial in biostatistics: competing risks and multi-state models}, + author={Putter, Hein and Fiocco, Marta and Geskus, Ronald B}, + journal={Statistics in Medicine}, + volume={26}, + number={11}, + pages={2389--2430}, + year={2007}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.2712} +} + +@article{Putter2011tutorial, + title={Tutorial in biostatistics: Competing risks and multi-state models Analyses using the mstate package}, + author={Putter, Hein}, + journal={Companion file for the mstate package}, + year={2011}, + url = {https://mirror.las.iastate.edu/CRAN/web/packages/mstate/vignettes/Tutorial.pdf} +} + + @Manual{R_language, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2019}, + url = {https://www.R-project.org/}, + } + +@article{Rueda2019, + title={Dynamics of breast-cancer relapse reveal late-recurring ER-positive genomic subgroups}, + author={Rueda, Oscar M and Sammut, Stephen-John and Seoane, Jose A and Chin, Suet-Feung and Caswell-Jin, Jennifer L and Callari, Maurizio and Batra, Rajbir and Pereira, Bernard and Bruna, Alejandra and Ali, H Raza and others}, + journal={Nature}, + volume={567}, + number={7748}, + pages={399}, + year={2019}, + publisher={Nature Publishing Group}, + url={https://doi.org/10.1038/s41586-019-1007-8} +} + +@article{Samworth2012, + title={Stein's paradox}, + author={Samworth, Richard J}, + journal={Eureka}, + volume={62}, + pages={38--41}, + year={2012}, + publisher={The Archimedeans}, + url={https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=7eebd55f569395544f2b5d367d6aee614901d2c1} +} + 
+@article{Schall1991, +author = {Schall, Robert}, +title = {Estimation in generalized linear models with random effects}, +journal = {Biometrika}, +volume = {78}, +number = {4}, +pages = {719-727}, +year = {1991}, +doi = {10.1093/biomet/78.4.719}, +URL = {http://dx.doi.org/10.1093/biomet/78.4.719}, +eprint = {/oup/backfile/content_public/journal/biomet/78/4/10.1093/biomet/78.4.719/2/78-4-719.pdf} +} + +@article{schwarz1978estimating, + title={Estimating the dimension of a model}, + author={Schwarz, Gideon}, + journal={The annals of statistics}, + pages={461--464}, + year={1978}, + publisher={JSTOR}, + url={https://www.jstor.org/stable/2958889} +} + +@manual{shiny, + title={Easy web applications in R.}, + author={{RStudio, Inc}}, + year={2013}, + url={http://www.rstudio.com/shiny/} +} + + @Manual{survival_package, + title = {A Package for Survival Analysis in S}, + author = {Terry M Therneau}, + year = {2015}, + note = {version 2.38}, + url = {https://CRAN.R-project.org/package=survival}, + } + + +@article{Shu2007, + Author = {Shu, Youyi and Klein, John P. 
and Zhang, Mei-Jie}, + Da = {2007/03/01}, + Date-Added = {2021-02-09 10:55:17 +0000}, + Date-Modified = {2021-02-09 10:55:17 +0000}, + Doi = {10.1007/s10985-006-9018-9}, + Id = {Shu2007}, + Isbn = {1572-9249}, + Journal = {Lifetime Data Analysis}, + Number = {1}, + Pages = {91--117}, + Title = {Asymptotic theory for the Cox semi-Markov illness-death model}, + Ty = {JOUR}, + Url = {https://doi.org/10.1007/s10985-006-9018-9}, + Volume = {13}, + Year = {2007}, + Bdsk-Url-1 = {https://doi.org/10.1007/s10985-006-9018-9}} + +@article{Spitoni2012, +author = {Cristian Spitoni and Marion Verduijn and Hein Putter}, +doi = {doi:10.1515/1557-4679.1375}, +url = {https://doi.org/10.1515/1557-4679.1375}, +title = {Estimation and Asymptotic Theory for Transition Probabilities in Markov Renewal Multi-State Models}, +journal = {The International Journal of Biostatistics}, +number = {1}, +volume = {8}, +year = {2012} +} + + +@article{vanHouwelingen2007, + title={Dynamic prediction by landmarking in event history analysis}, + author={van Houwelingen, Hans C}, + journal={Scandinavian Journal of Statistics}, + volume={34}, + number={1}, + pages={70--85}, + year={2007}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1111/j.1467-9469.2006.00529.x} +} + +@article{Wreede2010, +title = "The mstate package for estimation and prediction in non- and semi-parametric multi-state and competing risks models", +journal = "Computer Methods and Programs in Biomedicine", +volume = "99", +number = "3", +pages = "261 - 274", +year = "2010", +issn = "0169-2607", +doi = "https://doi.org/10.1016/j.cmpb.2010.01.001", +url = "http://www.sciencedirect.com/science/article/pii/S0169260710000027", +author = "Liesbeth C. 
de Wreede and Marta Fiocco and Hein Putter", +keywords = "Survival analysis, Multi-state models, Competing risks models, Markov models, Cox models, Software" +} + +@article{Zou2005, +author = {Zou, Hui and Hastie, Trevor}, +title = {Regularization and variable selection via the elastic net}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +volume = {67}, +number = {2}, +pages = {301-320}, +keywords = {Grouping effect, LARS algorithm, Lasso, Penalization, p≫n problem, Variable selection}, +doi = {https://doi.org/10.1111/j.1467-9868.2005.00503.x}, +url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00503.x}, +eprint = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-9868.2005.00503.x}, +year = {2005} +} \ No newline at end of file diff --git a/_articles/RJ-2024-002/costa-gerstung.tex b/_articles/RJ-2024-002/costa-gerstung.tex new file mode 100644 index 0000000000..87871faa64 --- /dev/null +++ b/_articles/RJ-2024-002/costa-gerstung.tex @@ -0,0 +1,635 @@ +% !TeX root = RJwrapper.tex +\title{ebmstate: An R Package For Disease Progression Analysis Under Empirical Bayes Cox Models} +\author{by Rui J. Costa and Moritz Gerstung} + +\maketitle + +\abstract{ +The new R package ebmstate is a package for multi-state survival analysis. It is suitable for high-dimensional data and allows point and interval estimation of relative transition hazards, cumulative transition hazards and state occupation probabilities, under clock-forward and clock-reset Cox models. 
Our package extends the package mstate in a threefold manner: it transforms the Cox regression model into an empirical Bayes model that can handle high-dimensional data; it introduces an analytical, Fourier transform-based estimator of state occupation probabilities for clock-reset models that is much faster than the corresponding, simulation-based estimator in mstate; and it replaces asymptotic confidence intervals meant for the low-dimensional setting by non-parametric bootstrap confidence intervals. Our package supports multi-state models of arbitrary structure, but the estimators of state occupation probabilities are valid for transition structures without cycles only. Once the input data is in the required format, estimation is handled automatically. The present paper includes a tutorial on how to use ebmstate to estimate transition hazards and state occupation probabilities, as well as a simulation study showing how it outperforms mstate in higher-dimensional settings. +} + +\section{Introduction} + +Multi-state models based on transition hazard functions are often used in the statistical analysis of longitudinal data, in particular disease progression data \citep{Hougaard1999}. The multi-state model framework is particularly suitable to accommodate the growing level of detail of modern clinical data: as long as a clinical history can be framed as a random process which, at any moment in time, occupies one of a few states, a multi-state model is applicable. Another strong point of this framework is that it can incorporate a \textit{regression model}, i.e., a set of assumptions on how covariates, possibly time-dependent ones, affect the risk of transitioning between any two states of the disease. Once estimated, multi-state models with regression features allow the stratification of patients according to their transition hazards. In addition, it is possible, under some models, to generate disease outcome predictions. 
+These come in the form of \textit{state occupation probability} estimates, meaning estimates of the probability of being in each state of the disease over a given time frame. + + +The survival analysis `task view' of the Comprehensive R Archive Network lists seven +R packages that are able to fit \textit{general} multi-state models and, at the same time, feature some kind of regression model or algorithm: \code{flexsurv} \citep{flexsurv_package}, \CRANpkg{msm} \citep{Jackson2011}, \CRANpkg{SemiMarkov} \citep{Listwon2015}, \CRANpkg{survival} \citep{survival_package}, \CRANpkg{mstate} \citep{Wreede2010}, \CRANpkg{mboost} \citep{mboost_package} -- as extended by \CRANpkg{gamboostMSM} \citep{gamboostMSM_package} -- and \CRANpkg{penMSM} \citep{penMSM_package}. All of them implement relative risk regression models \citep[as defined in][p. 133]{Aalen2008}. The only exceptions are \CRANpkg{survival}, which also fits Aalen's additive regression model \citep{Aalen1989}, and \code{flexsurv}, which also implements accelerated failure time models \Citep[see, for example,][p. 443]{Aalen2008}. + +Recall that a Cox regression model is a semi-parametric model in which every transition hazard is assumed to be the product of a baseline hazard function of unspecified form (the non-parametric component) and an exponential relative risk function (the parametric component) \citep[][p. 133]{Aalen2008}. +Generally, the relative risk regression models implemented in these packages are Cox regression models. However, some models in \code{flexsurv}, as well as those in \CRANpkg{msm} and \CRANpkg{SemiMarkov}, also restrict the baseline hazards to specific parametric families, i.e. they are fully parametric. 
In \CRANpkg{msm} and \CRANpkg{SemiMarkov}, the stronger assumptions regarding the functional form of the hazard are leveraged to do away with other common assumptions: \CRANpkg{SemiMarkov} drops the usual Markov property to implement homogeneous semi-Markov models; \CRANpkg{msm} is suitable for \textit{panel data}, i.e., data in which the state of each individual is known only at a finite series of times. + +Packages \CRANpkg{penMSM} and \CRANpkg{gamboostMSM} are the best suited to deal with higher-dimensional covariate data. +The first of these packages relies on a structured fusion lasso method, while the second implements (jointly with \CRANpkg{mboost}) a boosting algorithm. Both methods induce sparsity in the number of non-zero covariate effects, as well as equality among the different transition effects of each covariate, and are thus especially useful to reduce complicated multi-state models to more interpretable ones. The remaining packages assume standard, fixed effects relative risk regression models and do not include regularisation or variable selection features. + + +It is also illustrative to order the seven packages mentioned according to how extensive their analysis workflow is. Packages \CRANpkg{SemiMarkov} and \CRANpkg{penMSM} are intended for the estimation of relative transition hazards only (i.e., for estimating the impact of covariates on each transition hazard). With the package \CRANpkg{mboost} (as extended by \CRANpkg{gamboostMSM}) it is also possible to estimate the baseline transition hazards. Finally, a more complete workflow including estimates of both relative and cumulative transition hazards, as well as state occupation probabilities, is implemented in \code{flexsurv}, \CRANpkg{msm} and \CRANpkg{mstate}, and has been under implementation in \CRANpkg{survival} (version 3.0 or later). 
+ + +The present paper provides an introduction to \CRANpkg{ebmstate}, a new R package for multi-state survival analysis available for download on the Comprehensive R Archive Network (CRAN). +The main goal of \CRANpkg{ebmstate} is to provide an analysis framework for the Cox model that performs better with higher-dimensional covariate data and is also complete, in the sense of being able to generate point and interval estimates of relative transition hazards, cumulative transition hazards and state occupation probabilities, both under clock-forward and clock-reset models. + A fundamental characteristic of \CRANpkg{ebmstate} is that it re-implements and extends the analysis framework of \CRANpkg{mstate}, which is complete in the sense just mentioned. In fact, to a large extent, our package was built by importing, adapting and replacing functions from the \CRANpkg{mstate} package. This not only eliminates redundancies, but also makes our package more accessible to the numerous users of \CRANpkg{mstate} (the three papers associated with \CRANpkg{mstate} have jointly over 2000 citations). + +To improve the performance of \CRANpkg{mstate}'s multi-state Cox model when dealing with higher-dimensional covariate data, a ridge-type regularisation feature was added. We allow the regression coefficients of the model to be partitioned into groups, with each group having its own Gaussian prior. A group can gather, for example, all the regression coefficients for a given transition. Or, within a given transition, coefficients can be grouped according to the covariate type they refer to (for example, demographic, clinical or genomic type). + The resulting hierarchical Bayes model is \textit{empirical} in that a full prior elicitation is not required (the mean and variance hyper-parameters of the Gaussian are estimated from the data). Model fitting relies on the iterative algorithm introduced by \citet{Schall1991}, which typically converges after a small number of steps. 
A simulation study showing that Schall's algorithm performance compares well with that of other algorithms for ridge penalty optimisation, including one based on cross-validation, can be found in \citet{Perperoglou2014}. + +The asymptotic confidence intervals generated by \CRANpkg{mstate} are applicable when the number of observations is much larger than the number of parameters to be estimated (see section \nameref{sec:interval_estimation} below). +To preserve the completeness of \CRANpkg{mstate}'s framework in higher-dimensional settings, we therefore implemented non-parametric bootstrap intervals of regression coefficients, cumulative transition hazards and state occupation probabilities. + +The high computational cost implied by the non-parametric bootstrap motivated a third extension to \CRANpkg{mstate}. We developed an estimator of state occupation probabilities under clock-reset Cox models that is based on a convolution argument \citep[as in][]{Spitoni2012} and the Fast Fourier transform (FFT). At present, the estimation of such probabilities for clock-forward Cox models can be carried out using the efficient, product-limit based algorithm available in \CRANpkg{mstate}. However, for clock-reset Cox models, only a simulation-based estimator is available in this package (see also the \code{flexsurv} package for a similar, simulation-based estimator). The FFT estimator in \CRANpkg{ebmstate} was conceived as a faster alternative to this simulation-based estimator, but its scope is currently restricted to multi-state models with transition structures that have no cycles, i.e. in which a transition between two states is either not possible or follows a unique sequence of states. 
+ Figure \ref{fig:package_summary_figure} provides a short graphical summary of \CRANpkg{ebmstate}, with the main inputs -- a genomic-clinical data set and an empirical Bayes multi-state Cox model -- and the main outputs -- the estimates of relative hazards and state occupation probabilities (cumulative transition hazards are omitted). + + As already mentioned, our empirical Bayes method improves estimator performance in models with larger numbers of covariates (see section \nameref{sec:estimator_performance} on estimator performance). +Also, as a ridge-type regression method, it can be used as an alternative to the lasso method of \CRANpkg{penMSM} in two particular cases: when the levels of correlation between covariates are high enough to compromise the stability of lasso-based covariate selection; or simply to improve prediction accuracy when interpretability is not essential and the number of covariates is not greater than the number of observations \citep{Zou2005}. +In addition, and perhaps more importantly, \CRANpkg{ebmstate} goes beyond the regularised estimation of transition hazards offered by \CRANpkg{penMSM} and \CRANpkg{gamboostMSM}: point and interval estimates of state occupation probabilities under the regularised Cox model can also be computed. + + +\section{Models} +A multi-state Cox model is a continuous-time stochastic process with a finite (and usually small) state space $\mathcal{S}$. + To better describe the models implemented in \CRANpkg{ebmstate}, we define the following notation. We let $t$ denote the time since some initiating event (usually diagnosis or disease onset). For $t \in \left[0, \infty\right)$, we define the following random variables: $X(t)$ represents the disease state of the patient, $S(t)$ the time spent in the current state, and $\vec{Z}\left(t\right)$ the value of a covariate vector. 
+The realisation of each component of the process $\lbrace\vec{Z}\left(t\right)\rbrace$ is a step function, possibly approximating the evolution in time of a continuous covariate. + In addition, $\lbrace\vec{Z}\left(t\right)\rbrace$ is assumed not-adapted to the filtration generated by $\lbrace X\left(t\right)\rbrace$ (an adapted covariate is one whose path until $t$ is known once $\lbrace X \left(u\right)\rbrace$, $u \leq t$, is known). + The transition hazard rate of a patient from state $i$ to state $j$ ($i\neq j$) at time $t$, conditional on the sojourn time and the covariate vector, is defined as + \begin{align*} + &\alpha_{ij}\left(t|\mathbf{z},s \right):=\lim_{h \downarrow 0}\frac{1}{h}\mathrm{P}\left[X(t+h)=j\,|\,X(t)=i,S(t)=s,\vec{Z}(t)=\mathbf{z} \right]\;, \;s\in \left[0,\infty\right)\;,\;t\in \left[s,\infty\right)\;. +\end{align*} +Independent right-censoring and left-truncation are assumed throughout \citep[][p. 57]{Aalen2008}. The purpose of the present section is to give a (not necessarily exhaustive) description of the scope of \CRANpkg{mstate} and \CRANpkg{ebmstate} with respect to the multi-state Cox model. Using the terminology in \citet{Putter2011}, a Cox model is termed a `clock-reset' model when +\begin{align} +\label{eq:clock_reset_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}, s\right)&=\lambda_{ij}^{(0)}\left(s\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad, +\end{align} +and it is termed a `clock-forward' model when +\begin{align} +\label{eq:clock_forward_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}\right)&=\alpha_{ij}^{(0)}\left(t\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad. 
+\end{align} +In both cases, $i,j \in \mathcal{S}$, with $i\neq j$; $\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is an unknown vector of regression coefficient parameters, and both $\lambda^{\scriptscriptstyle (0)}_{ij}(\cdot)$ and $\alpha^{\scriptscriptstyle (0)}_{ij}(\cdot)$ are unknown (baseline hazard) functions, non-negative on $\mathds{R}^{+}$. When, as in equation \ref{eq:clock_reset_Cox}, $\alpha_{ij}\left(t|\mathbf{z},s\right)$ is the same for all $t\geq s$, we simplify its notation to $\lambda_{ij}\left(s|\mathbf{z}\right)$. As can be seen from equations \ref{eq:clock_reset_Cox} and \ref{eq:clock_forward_Cox}, the `clock-reset' and `clock-forward' models are models for how the transition hazard rates are affected by time. In the former case, the only relevant time scale is the time $s$ spent in the current state, whereas in the latter only the time $t$ since the initiating event matters. +While the `clock-forward' model is arguably the default one in multi-state survival analysis \citep{Andersen1993,Aalen2008}, in some cases the `clock-reset' model is more appropriate. For example, in some forms of cancer, it can be sensible to assume that the transition hazards from the state of complete remission depend on the sojourn time, rather than on the time since the initial diagnosis. + +\subsection{Relative transition hazards} +\label{sec:models_relative_hazards} +The parametric component of the transition hazard from $i$ to $j$, written $\exp\left[\boldsymbol{\beta}^{\intercal}_{ij} \,\mathbf{z}\right]$, is termed the relative transition hazard. In \CRANpkg{mstate} and \CRANpkg{ebmstate}, estimating the relative transition hazard amounts to estimating the regression coefficient vector $\boldsymbol{\beta}_{ij}\,$. +In \CRANpkg{mstate}, these parameters are assumed to be non-random. With \CRANpkg{ebmstate}, the following prior distributions can be imposed. + +Define $\mathcal{P}$ as the set of all pairs of states between which a direct transition is possible. 
Let $\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ij} \rbrace $, for all $(i, j) \in \mathcal{P}$, be a partition of $\boldsymbol \beta$, a vector containing the regression coefficients for all direct transitions allowed. Each $\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is further partitioned into $\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ijk} \rbrace$, for $k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$. In \CRANpkg{ebmstate}, the most general model regarding the prior distribution of $\boldsymbol{\beta}$ makes two assumptions: a) the scalar components of $\boldsymbol{\beta}$ are independent and normally distributed; b) the scalar components of $\boldsymbol{\beta}_{\scriptscriptstyle i j k}$ have a common (and undetermined) mean $\mu_{\scriptscriptstyle ijk}$ and a common (and also undetermined) variance $\sigma^{2}_{\scriptscriptstyle ijk}\;$. + +The purpose of the framework just described is to allow the clustering of covariate effects according to their prior distribution. +If there is no prior knowledge about how this clustering should be done, a single Gaussian prior can be imposed on all regression coefficients at once. If prior knowledge allows the grouping of effects according to the transition they refer to, a different Gaussian prior can be assigned to the coefficients of each transition. Even within each transition, different groups of coefficients can be assigned different prior distributions. In the analysis of biomedical data, for example, there can be a split between genes which are known to affect the transition hazard, and other genes whose effect is unknown. 
+ +\subsection{Cumulative transition hazard functions} + +Our package imports from \CRANpkg{mstate} a Breslow estimator of two types of cumulative transition hazard: one on a global time scale, defined as +\begin{align*} +\mathrm{A}_{ij}\left(t\,|\,\mathbf{z}\right)&:=\int_{0}^{t}\alpha_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad, +\end{align*} +and another on a sojourn time scale, defined as +\begin{align*} +&\Lambda_{ij}(s\,|\,\mathbf{z}):=\int_{0}^{s}\lambda_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad. +\end{align*} +Note that, in either case, the covariate vector is assumed to remain constant. + + +\subsection{State occupation probabilities} +By state occupation probability, we mean the probability that a patient in state $i$ at time $0$ finds herself in state $j$ at time $t$. The estimates of these probabilities can be seen as functionals of the estimated cumulative transition hazard functions. For this reason, the restriction to models with time-fixed covariates, which was just seen to be applicable to the estimators of cumulative transition hazards, carries over to the estimation of state occupation probabilities. + +When conditioning on a given covariate path (time-fixed or not), state occupation probability estimates are not valid unless the covariates are \textit{external} \citep[][p. 142]{Cortese2010,Aalen2008}. +Note that a vector of covariates $\lbrace \vec{Z}(u)\rbrace_{u\geq 0}$ is said to be \textit{external} if, for all $t \in \left[0,\infty\right)$, each transition hazard at $t$, conditional on $ \vec{Z}(t)$, is independent of $\lbrace \vec{Z}(u)\rbrace_{u>t}$ (i.e. independent of the future path of the covariate). Otherwise, it is said to be \textit{internal} \citep[for more details on the distinction between internal and external covariates, see][chapter 6]{Kalbfleisch2002}. 
+When one does not wish (or when it is not possible, due to $\vec{Z}$ being \textit{internal}) to condition on a future path of the covariate process, the uncertainty introduced by this process needs to be accounted for. This can be done by extending the state space of the disease process, so that it includes information on the disease \textit{and} the covariate process \citep[][p. 170]{Andersen1993}. For example, to include a dichotomous transplant covariate (an internal covariate) in a simple survival model with two states, the state space is expanded from $\lbrace$alive, deceased$\rbrace$ to $\lbrace$alive without transplant, alive with transplant, deceased$\rbrace$. One can then either assume that transplanted patients have a different baseline death hazard or, more simply, that transplantation scales the death hazard by some constant $\exp \left( \gamma\right)$. A similar but more detailed example can be found in \citet[][section 2.3.2, `model 3' ]{Wreede2010}. + +\section{Estimation} +In the current section, we present the estimation methods underlying the extensions of \CRANpkg{mstate} implemented in \CRANpkg{ebmstate}. + \label{sec:estimation} + + +\subsection{Relative and cumulative hazard functions} +Let $\boldsymbol{\mu}_{\scriptscriptstyle ij}$, with $\left(i,j\right) \in \mathcal{P}$ (the set of direct transitions allowed), denote a vector whose scalar components are the parameters $\mu_{\scriptscriptstyle ijk}$, $k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$. Similarly, let $\boldsymbol{\sigma}^{2}_{\scriptscriptstyle ij}$ be composed of the parameters $\left\lbrace \sigma^{2}_{\scriptscriptstyle ijk}\right\rbrace_{k}$.
The estimation of $\boldsymbol{\beta}$, $\boldsymbol{\mu}:=\lbrace\boldsymbol{\mu}_{\scriptscriptstyle{ij}}\rbrace$ and $\boldsymbol{\sigma}^2:=\lbrace\boldsymbol{\sigma}^2_{\scriptscriptstyle ij }\rbrace$ relies on the restricted maximum-likelihood (REML) type algorithm described in \cite{Perperoglou2014}, and introduced by \cite{Schall1991}. The resulting estimate of $\boldsymbol{\beta}$ is a maximum \textit{a posteriori} estimate; the estimates of $\boldsymbol{\mu}$ and $\boldsymbol{\sigma}^{2}$ are empirical Bayes estimates. In \CRANpkg{ebmstate}, the estimator based on this algorithm is implemented in the function \code{CoxRFX}. The results of a simulation study showing its consistency are included in the Supporting Scripts and Data (file ESM\_1.html, section 1). + + +The computation of cumulative hazard rates for given covariate values and an estimated regression coefficient vector relies on the function \code{msfit\_generic}, which is essentially a wrapper for the function \code{mstate::msfit} (see section \nameref{sec:computing_cumulative_hazards}). For the mathematical details of this computation, we therefore refer the reader to \citet{Wreede2010}. + + + +\subsection{State occupation probabilities} +\label{sec:trans_probs} +The package \CRANpkg{mstate} includes a simulation-based estimator that can take as input either $\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ or $\hat{\Lambda}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ to generate estimates of state occupation probabilities under the clock-forward or the clock-reset model respectively. +Another available estimator, an Aalen-Johansen-type estimator based on product integration, is far more efficient computationally and takes as input $\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ only.
As the scope of this estimator has been restricted to clock-forward Cox models \citep{Andersen1993,Aalen2008}, in our package we implemented a convolution-based estimator as a computationally efficient alternative (for models with a transition structure that has no cycles). + +For convenience, let the sequence of states from $0$ to $n$ have the labels $0,1,2,...,n\,$, where $0$ is the initial state by definition, and $n$ is some state that might (eventually) be reached by the process. In addition, define $X_{0}:=X(0)$ and $T_{0}:=0$, and let $\left(X_{i},T_{i}\right)$, $i \in \left\lbrace 1,2,... \right\rbrace$, denote the marked point process associated with $\left\lbrace X(t)\right\rbrace$, so that $T_{i}$ is the time of the $i^{th}$ transition and $X_{i}$ is the state the process jumps to at time $T_{i}$. +The inter-transition times are denoted by $\tau_{ij}:=T_{j}-T_{i}$, for $j>i$. +We can write the probability that a patient in state $0$ at time $0$ finds herself in state $n$ at time $t$, conditional on $\vec{Z}(u)=\mathbf{z}$ for all $u \geq 0$, as +\begin{align*} + &\mathrm{P}\left[X(t)=n\,|\,X(0)=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right]\\ + &\,=\mathrm{P}\left[X_{n}=n,\tau_{0,n} < t,\tau_{n,n+1}\geq t- \tau_{0,n} |X_{0}=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right] \,.\nonumber +\end{align*} + +Recall that $\lambda_{i,i+1}\left(s\,|\, \mathbf{z}\right)$ denotes the hazard rate of a transition to state $i+1$ at time $s$ since arrival in state $i$, for a patient that has covariate vector $\mathbf{z}$. The cumulative hazard for the same transition between sojourn times $0$ and $s$, if the patient's covariate vector remains constant at $\mathbf{z}$, is represented by $\Lambda_{i,i+1}\left(s \,|\, \mathbf{z}\right):=\int_{0}^{s}\lambda_{i,i+1}\left(x\,|\, \mathbf{z}\right)\mathrm{d}x$. 
+ Similarly, we let $\lambda_{i}\left(s\,|\, \mathbf{z}\right)$ represent the hazard rate of going to any state that can be reached directly from $i$, at time $s$ since arrival in state $i$, for a patient with covariate vector $\mathbf{z}$. The cumulative hazard for the same event between sojourn times $0$ and $s$, if the patient's covariate vector remains constant at $\mathbf{z}$, is represented by $\Lambda_{i}\left(s \,|\, \mathbf{z}\right)$. + The expressions $\hat{\Lambda}_{i}\left(s \,|\, \mathbf{z}\right)$ and $\hat{\Lambda}_{i,i+1}\left(s \,|\, \mathbf{z}\right)$ denote the Breslow estimators of the cumulative hazards just defined. +% {\color{blue} In \CRANpkg{ebmstate}, the function $\hat{\Lambda}_{i,i+1}\left(\cdot \,|\, \mathbf{z}\right)$ is a spline interpolation of Breslow estimates of the cumulative hazard, and the function $\hat{\Lambda}_{i}\left(\cdot \,|\, \mathbf{z}\right)$ is defined as $\sum_{j}\hat{\Lambda}_{i,j}\left(\cdot \,|\, \mathbf{z}\right)$. } + In what follows, all references to probabilities, hazard rates and cumulative hazards are to be understood as conditional on $\vec{Z}(u)=\mathbf{z}\,$, for $u\geq 0$: this condition is omitted to simplify the notation. + + In \CRANpkg{ebmstate}, the function \code{probtrans\_ebmstate} generates a set of state occupation probability estimates at equally spaced time points: +\begin{align*} +&\left\lbrace \hat{p}_{0n}\left(k\right)\right\rbrace_{k} :=\left\lbrace \hat{\mathrm{P}}\left[X_{n}=n,\tau_{0,n} < t_{k},\tau_{n,n+1}\geq t_{k}- \tau_{0,n}\,|\, X_{0}=0 \right] \right\rbrace_{k}\;,\; k=0,1,2,...,K\,;\, t_{k}=k\times \Delta t \;. +\end{align*} +The number $K$ of time intervals is $10,000$ by default and $t_{K}$ is a parameter set by the user.
+ Defining the functions +\begin{align*} +q_{ij}\left(k\right):=\mathrm{P}\left[X_{j}=j, \tau_{ij}\in \left[t_{k},t_{k+1}\right)\,|\,X_{i}=i\right] +\end{align*} +and + \begin{align*} +r_{i}\left(k\right):=\mathrm{P}\left[\tau_{i,i+1} > t_{k} \,|\,X_{i}=i\right]\;, +\end{align*} +and the finite difference +\begin{align*} + \Delta \hat{\Lambda}_{i,i+1}\left(t_{k}\right):=\hat{\Lambda}_{i,i+1}\left(t_{k+1}\right)-\hat{\Lambda}_{i,i+1}\left(t_{k}\right)\;, +\end{align*} +the algorithm behind \code{probtrans\_ebmstate} can be described as follows: +\begin{enumerate} +\item For $j=1,2,...,n$, compute +\begin{flalign} +\label{eq:est1} + \hat{q}_{j-1,j}\left(k\right)&:=\exp \left[-\hat{\Lambda}_{j-1}\left(t_{k}\right)\right]\Delta \hat{\Lambda}_{j-1,j}\left(t_{k}\right)&& +\end{flalign} + for $k=0,1,...,K-1$. +\item For $j=2,3,...,n$, compute (iteratively) +\begin{flalign} +\label{eq:est2} + \hat{q}_{0j}\left(k\right):=&\sum_{l=0}^{k-1} \hat{q}_{j-1,j}\left(k-l-1\right) \hat{q}_{0,j-1} \left(l\right) && +\end{flalign} + for $k=0,1,...,K-1$. +\item Finally, use the estimates obtained in the last iteration of step 2 to compute +\begin{flalign} +\label{eq:est4} +\hat{p}_{0n}\left(k\right):=&\sum_{l=0}^{k-1} \hat{r}_{n}\left(k-l-1\right) \hat{q}_{0,n}\left(l\right)&& +\end{flalign} + for $k=0,1,...,K$, where $\hat{r}_{n}\left(\cdot\right):=\exp \left[-\hat{\Lambda}_{n}\left(t_{\scriptscriptstyle\left(\cdot\right)}\right)\right]\,$. +\end{enumerate} +Substituting $\approx$ for $:=$ and removing the `hats' in definitions \ref{eq:est1} to \ref{eq:est4}, we get the approximate equalities that justify the algorithm. These approximate equalities are derived in the Supporting Scripts and Data (file ESM\_1.html, section 2). + +Apart from \code{probtrans\_ebmstate}, the function \code{probtrans\_fft} is also based on the convolution argument just shown.
+However, this function makes use of the convolution theorem, i.e., of the fact that the convolution of two (vectorized) functions in the time domain is equivalent to a pointwise product of the same functions in the frequency domain. The estimation of state occupation probabilities is thus simplified to +\begin{align*} + \hat{p}_{0n}:=&\mathcal{F}^{\scriptscriptstyle -1}\left\lbrace \hat{\mathrm q}_{0,1} \boldsymbol{\cdot} \hat{\mathrm q}_{1,2}\boldsymbol{\cdot} \mathrm{...}\boldsymbol{\cdot}\hat{\mathrm q}_{n-1,n}\boldsymbol \cdot \hat{\mathrm r}_{n}\right\rbrace\;, +\end{align*} +where $\mathcal{F}$ denotes the discrete Fourier transform, $\hat{\mathrm{q}}_{j-1,j}:=\mathcal{F}(\hat{q}_{j-1,j})$ and $\hat{\mathrm{r}}_{n}:=\mathcal{F}(\hat{r}_{n})$. +Conversion to and from the frequency domain is carried out using the fast Fourier transform algorithm implemented in the \code{fft} function of the base package \texttt{stats}. +The Supporting Scripts and Data contain a short simulation study checking that state occupation probabilities can be accurately estimated with \code{probtrans\_ebmstate} and \code{probtrans\_fft} (see file ESM\_1.html, sections 3 and 4). + + +Figure \ref{fig:mssample} consists of a grid of plots with estimated curves of state occupation probabilities. It compares, in terms of speed and accuracy, the estimator in \code{probtrans\_fft} with an estimator in \code{mstate::mssample} that has the same target, but is simulation-based. Each plot contains a black curve and a superimposed red curve. The red curves in any given column of the grid are all based on the same run of a function: columns 1 to 3 are based on runs of \code{mssample} with the number of samples $n$ equal to $100$, $1000$ and $10,000$ respectively, while column 4 is based on a run of \code{probtrans\_fft}. Each column in the grid reproduces the same 4 black curves. These are based on a single run of \code{mssample} with $n=100,000$ and serve as benchmark.
All function runs are based on the same input: a set of cumulative transition hazard estimates for a multi-state model with the `linear' transition structure given in the leftmost diagram of figure \ref{fig:transition_structures}. Plots in a given row refer to the same state of the model. The running times on top of each column refer to the estimation of red curves. +The main conclusion suggested by this analysis of simulated data is that \code{probtrans\_fft} is as accurate as \code{mssample} with $n=10,000$, but it is almost 100 times faster (columns 3 and 4). With $n=1000$, \code{mssample} achieves a good approximation to the true state occupation probabilities, but is still roughly 9 times slower. The details on how figure \ref{fig:mssample} and its underlying data were generated are given in the Supporting Scripts and Data (file ESM\_1.html, section 5).
These reasons, along with the fact that regularised methods such as the one implemented in \CRANpkg{ebmstate} are typically used to fit models with more than a dozen covariates, led us to choose the non-parametric bootstrap as the interval estimation method in \CRANpkg{ebmstate}. Note that the non-parametric bootstrap can be given a Bayesian interpretation. Its interval estimates are approximately the same as those of a Bayesian model that assumes: a) a multinomial distribution for the data; and b) a non-informative Dirichlet prior distribution for the probability assigned to each category in the multinomial distribution. This is a specific case of the so-called Bayesian bootstrap \citep[][p. 272]{Hastie2009}. Further research is needed to determine the theoretical properties of the non-parametric bootstrap in the present setting, but this falls beyond the scope of the present paper. Interval estimates of regression coefficients, cumulative hazards and state occupation probabilities are implemented in the function \code{boot\_ebmstate}. + + +\section{Estimator performance} +\label{sec:estimator_performance} +It is a well-documented fact in the statistical literature that standard least-squares or maximum-likelihood estimators can often be improved by regularisation or shrinkage \citep[see, for example,][]{Samworth2012}. This improvement comes about when the model dimensionality is high enough that the bias introduced by regularisation is outweighed by the reduction in the estimator variance. In the current setting, one might therefore ask: what kind of dimensionality does a semi-parametric, multi-state Cox model need to have to be outperformed by its empirical Bayes counterpart? + A simulation study we carried out offers a tentative answer to this question, by comparing estimators under both Cox models for an increasing number of covariates. The study also features a third method, based on a fully non-parametric model, as a null model method.
This was included to give an idea of how many covariates the empirical Bayes +model can deal with before it becomes no better than a simple non-regressive model. + +\subsection{Simulation setup} +We assessed the performance of all estimators defined by the tuple $\left[a,m, G, n,p(n)\right]$, where $a\in \lbrace$regression coefficients, relative hazards, state occupation probabilities$\rbrace$ is the target of estimation, $m\in \lbrace$standard Cox, empirical Bayes Cox, null$\rbrace$ is the assumed hazard model, $G \in \lbrace$linear, competing risks, `m' structure$\rbrace$ is the transition structure of the model (illustrated in figure \ref{fig:transition_structures}) and $n\in \lbrace 100,1000\rbrace$ is the number of patients/disease histories in the training data set; +the variable $p$ denotes the number of coefficients/covariates per transition in the true model and its range depends on $n$: $p\left(100\right) \in \lbrace 10,40,70,100 \rbrace$ whereas $p\left(1000\right) \in \lbrace 10,100,200,300,400,500\rbrace$. By `relative hazards' and `state occupation probabilities', we mean here the relative transition hazards of an out-of-sample patient, and her state occupation probabilities at 7 chosen time points. +We generated a batch of 300 independent absolute error observations (`NA' estimates included) for each estimator, where each observation is recorded after training the estimator on a newly simulated data set. Each boxplot in figures \ref{fig:estimator_performance_boxplots_100patients} ($n=100$) and \ref{fig:estimator_performance_boxplots_1000patients} ($n=1000$) is based on one of these batches. As all estimators are \textit{vector} estimators, each absolute error is actually an \textit{average} absolute error, where the average is taken over the components of the vector. + +All training data sets were simulated from clock-reset Cox models.
Apart from $G$ (the model transition structure), $n$ and $p$, the true baseline hazards are also held fixed within each batch of 300 training data sets. +%However, the regression coefficient vector of the model is sampled at random before simulating a new data set, as we found that using the same vector in the simulation of all data sets in a batch would make the results highly dependent on the actual coefficient values used. +The coefficient vectors used in the simulation are always non-sparse and are scaled by $\sqrt{\frac{10}{p}}$ to keep the log-hazard variance constant when the dimensionality grows. +All covariates are dichotomous and mutually independent. +To compute the coefficient errors for the non-parametric (null) model method, we think of it as a degenerate Cox model in which all regression coefficient estimates are fixed at zero. +The estimation of regression coefficients under the standard Cox and the empirical Bayes Cox models was performed with \code{survival::coxph} and \code{ebmstate::CoxRFX} respectively; the estimation of state occupation probabilities is based on \code{mstate::probtrans} for the null model and on \code{ebmstate::probtrans\_fft} for both the standard Cox and the empirical Bayes Cox models. + + The reason we did not consider simulation scenarios with more than 500 covariates per transition, in data sets of 1000 patients, was simply computational cost. For example, generating the data and error observations for the scenario with $n=1000$, $p=100$ and $G=$`m' structure took less than one hour using 20 CPU cores in parallel; the same scenario but with $p=500$ took 6.5 days using 25 CPU cores. More details about the simulation setup can be found in the Supporting Scripts and Data (file ESM\_1.html, section 6, subsection `sample script').
+ + +\subsection{Missing values} +Whenever an estimator was able to compute a valid estimate of its target for each training data set, i.e., when it did not return any `NA' estimates, its boxplots are based on 300 valid error observations. This was always the case with non-parametric estimators: the estimates of regression coefficients and relative hazards of this type of estimators are trivial (fixed at zero and one respectively) and hence it is also straightforward to compute absolute errors. + It also happened that non-parametric estimators of state occupation probabilities had no `NA' estimates (see file ESM\_1.html, section 6, figure 6.3, in the Supporting Scripts and Data). The situation was similar for the empirical Bayes Cox model estimators, which showed no more than 5$\%$ missing estimates in any of the simulation scenarios studied (ibid., figures 6.1 and 6.2). However, for the standard Cox model ones, the number of `NA' estimates depends to a large extent on the number of patients in the data set, as well as on the dimensionality and transition structure of the model (figures \ref{fig:na_props_100patients_coxph} and \ref{fig:na_props_1000patients_coxph}). In data sets of 100 patients, it fares well in models with fewer than 10 covariates per transition, or in models with up to 40 covariates, if the transition structure is linear. Otherwise its failure rates range from roughly 25$\%$ to nearly 100$\%$. +In data sets of 1000 patients, the proportion of `NA' estimates is never above 10$\%$, if the transition structure is linear, but it can climb above 60$\%$ for other transition structures. 
+ + +\subsection{Comparison of estimators} +With respect to the performance of the three methods studied, the boxplots in figures \ref{fig:estimator_performance_boxplots_100patients} and \ref{fig:estimator_performance_boxplots_1000patients} suggest the following conclusions: +\begin{itemize} +\item As $p/n$ grows, the empirical Bayes estimators quickly outperform the standard Cox model ones. They already fare substantially better at $p/n=0.1$ for both $n=100$ and $n=1000$ and for all estimation targets. At the same time, the relative performance of the empirical Bayes method with respect to the null model one decreases. At $p/n=0.5$, the difference between these two methods is already rather small for all simulation scenarios. +\item The relative performance of the empirical Bayes method with respect to the null method decreases as the number of co-occurring transition hazards in the model grows. All other things equal, the empirical Bayes method has the best performance under the `linear' structure model, which has no competing transitions; it performs less well under the `m' structure transition model, where two transition hazards can co-occur; and has the worst relative performance under the `competing risks' model, where three transition hazards co-occur. This trend is clearer for $n=100$ (figure \ref{fig:estimator_performance_boxplots_100patients}) but can also be detected in the relative hazard errors for $n=1000$ (figure \ref{fig:estimator_performance_boxplots_1000patients}). In any case, the empirical Bayes method seems to be far more robust than the standard Cox model against increases in the number of co-occurring transition hazards. +\item Having as target the regression coefficients or the state occupation probabilities, instead of relative hazards, makes the empirical Bayes method better in comparison to the null method.
In fact, as $p/n$ grows, the empirical Bayes method is never outperformed by the null method except in the estimation of relative hazards. +\end{itemize} + + +\section{Survival analysis workflow} +The features of \lstinline!mstate! were illustrated in \citet{Wreede2010} using a simple workflow. The starting point of this workflow is a data set in `long format'. Such a data set can be fed into \lstinline!survival::coxph! to obtain estimates of the regression coefficients of a multi-state Cox model. The resulting model fit object can be passed on to \lstinline!mstate::msfit!, along with a vector of covariates of a particular patient, to get personalised estimates of the cumulative hazard functions. +Finally, state occupation probabilities for the same patient can be estimated if the object created by \lstinline!mstate::msfit! is fed into \lstinline!mstate::probtrans!. +In this section, we describe how \CRANpkg{ebmstate} extends the scope of this workflow, i.e., how it uses the packages \CRANpkg{survival} and \CRANpkg{mstate} to generate estimates under a multi-state empirical Bayes Cox model. A diagram summarising the extension is shown in figure \ref{fig:workflow}. In the \nameref{sec:model_assessment} subsection, we give some recommendations on how to assess and compare models, but for more detailed tutorials on how to analyse multi-state data using models defined by transition hazards, we refer the reader to \citet{Putter2007tutorial} and \citet{Putter2011tutorial}. + + The main steps of the \CRANpkg{ebmstate} workflow are here illustrated using a data set of patients with myelodysplastic syndromes (MDS) which has been described and studied in \citet{Papaemmanuil2013}. A myelodysplastic syndrome is a form of leukemia in which the bone marrow is not able to produce enough mature blood cells, and which sometimes develops into a cancer of white blood cells with a quick and aggressive progression, i.e., into acute myeloid leukemia (AML). 
Figure \ref{fig:trans_diagrams}a illustrates an illness-death type model for MDS patients and also gives a breakdown of the number of transition events. + The conversion to a model with a transition structure that has no cycles (i.e., that can be handled by our convolution-based estimators) is shown in figure \ref{fig:trans_diagrams}b. The data set used for model estimation, obtained after a number of pre-processing steps, contains the disease history of 576 patients, as well as measurements on 30 covariates. Of these 30 covariates, 11 are mutation covariates and the remaining 19 are clinical or demographic (see figure \ref{fig:trans_diagrams}c). + The running time for the estimation of relative transition hazards does not exceed 10 seconds on a standard laptop computer. The same holds for the estimation of cumulative transition hazards or state occupation probabilities for a given patient. The complete R code underlying the data analysis in the current section can be found in the Supporting Scripts and Data (file ESM\_2.html). To run only the R snippets shown below and reproduce their results, the best option is to use the R script in file ESM\_3.R of the Supporting Scripts and Data. + + +\subsection{Input data} +Table \ref{table:long_format_data} shows a fragment of the MDS data set. The data is in `long format', which means that each row refers to a period of risk for a given transition and patient. +For example, row $i$ tells us that, at time \code{Tstart[i]}, patient \code{id[i]} entered state \code{from[i]}, and thereby began to be at risk for transition \code{trans[i]}, i.e., at risk of going from state \code{from[i]} to state \code{to[i]}. If the first transition of patient \code{id[i]} after time \code{Tstart[i]} occurs before the last follow-up time for this patient, \code{Tstop[i]} records the time of this transition (regardless of whether the patient moved to state \code{to[i]} or not). Otherwise, \code{Tstop[i]} is set to the last follow-up time. 
The value of \code{status[i]} is set to 1 if and only if the first transition of patient \code{id[i]} after \code{Tstart[i]} is to state \code{to[i]} and occurs before the last follow-up (otherwise it is set to 0). +The value of \code{time[i]} is defined simply as \code{Tstop[i]}$ - $\code{Tstart[i]}, and \code{strata[i]} is the stratum of the baseline hazard for transition \code{trans[i]} (more about this variable in the following section). For \code{x} $\in \left\lbrace \right.$ \code{ASXL1}, \code{DNMT3A}, $\dots \left. \right \rbrace $, \code{x[i]} denotes the level of covariate \code{x} between \code{Tstart[i]} and \code{Tstop[i]} in patient \code{id[i]}. (In the MDS data set, we assume that the relative hazard of a patient is determined by her covariate vector at $t=0$, i.e., we assume all covariates to be time-fixed.) +If a patient enters a new state, and this state communicates directly with $n$ other states, then, as long as the patient actually spends time in the new state (i.e. the time of transition is not the same as the last follow-up time), $n$ rows must be added to the data set, with each row corresponding to a different possible transition. + +From table \ref{table:long_format_data}, we know that patient 77 entered state 1 (`MDS') at time 0 and remained in this state until time 2029, when she moved to state 3 (`death before AML'). There are no rows to describe the evolution of patient 77 after entering state 3, as this state is an absorbing state. As to patient 78, she remained in state 1 until time 332, and moved from there to state 2 (`AML'). She lived with AML for 1117 days and moved to state 4 (`death after AML') at time 1449. + + +\begin{table} +\begin{example} +id from to trans Tstart Tstop time status strata ASXL1 DNMT3A [...] +77 1 2 1 0 2029 2029 0 1 0 0 . +77 1 3 2 0 2029 2029 1 2 0 0 . +78 1 2 1 0 332 332 1 1 1 0 . +78 1 3 2 0 332 332 0 2 1 0 . +78 2 4 3 332 1449 1117 1 3 1 0 . 
+\end{example} +\caption{A 5-row fragment of the MDS data set (in long format)} +\label{table:long_format_data} +\end{table} + +\subsection{Fitting an empirical Bayes Cox model} +\label{sec:fit_bayes_cox_model} +Once the data is in `long format', the estimation of an empirical Bayes model can be carried out using the function \code{CoxRFX}. A simple example of the first argument of \code{CoxRFX}, denoted `\code{Z}', is a data frame gathering the \code{trans}, \code{strata} and covariate columns of the data in long format: +\begin{example} +outcome_covs <- c("id","from","to","trans","Tstart","Tstop","time","status", + "strata") +Z <- mstate_data[!names(mstate_data) %in% outcome_covs] +#(`mstate_data' has the data in long format) +\end{example} +The \code{strata} column determines which baseline hazard functions are assumed to be equal. + In table \ref{table:long_format_data}, each transition is assumed to have a (potentially) different baseline hazard. The model's assumptions regarding how covariates affect the hazard are reflected on the format of the covariate columns of \code{Z}. When the \code{Z} argument is the one created in the previous block of code, \code{CoxRFX} returns a single regression coefficient estimate for each covariate. In other words, the impact of any covariate is assumed to be the same for every transition. + + +There are however ways of relaxing this assumption. One can replace the \code{ASXL1} column in Z (or any other covariate column) by several `type-specific' \code{ASXL1} columns: the \code{ASXL1} column specific for type $i$ would show the mutation status of \code{ASXL1} in rows belonging to transition of type $i$, and show zero in all other rows. This would force \code{CoxRFX} to estimate a (potentially) different \code{ASXL1} coefficient for each transition type. This process of covariate expansion by type can be based on any partition of the set of transitions. 
When each type corresponds to a single transition, we refer to it simply as `covariate expansion by transition'. The output shown below illustrates the effect of expanding the covariates in `mstate\_data' by transition. +\begin{example} +# Columns `id' and `trans' from `mstate_data' together with the first +# two expanded covariates (patients 77 and 78): + id trans ASXL1.1 ASXL1.2 ASXL1.3 DNMT3A.1 DNMT3A.2 DNMT3A.3 [...] + 77 1 0 0 0 0 0 0 . + 77 2 0 0 0 0 0 0 . + 78 1 1 0 0 0 0 0 . + 78 2 0 1 0 0 0 0 . + 78 3 0 0 1 0 0 0 . +\end{example} +The example code given below shows how to use \CRANpkg{mstate} to expand covariates by transition and how to create a \code{Z} argument that makes \code{CoxRFX} estimate a regression coefficient for each covariate for transitions 1 and 2, and assume a fully non-parametric hazard for transition 3. +\begin{example} +# To expand covariates by transition using mstate::expand.covs, +# first set the class of `mstate_data' as +class(mstate_data) <- c("data.frame","msdata") + +# then add the transition matrix as attribute: +attr(mstate_data,"trans") <- tmat +#(`tmat' is the output of mstate::transMat) + +# Expand covariates by transition: +covariates_expanded_123 <- mstate::expand.covs( + mstate_data, + covs = names(mstate_data)[! names(mstate_data) %in% outcome_covs], + append = F +) + +# remove all covariates for transition 3 from `covariates_expanded_123' +# to fit a fully non-parametric model on this transition: +covariates_expanded_12 <- covariates_expanded_123[ + !grepl(".3",names(covariates_expanded_123),fixed = T) +] + +#argument `Z' of coxrfx +Z_12 <- data.frame(covariates_expanded_12,strata = mstate_data$trans, + trans = mstate_data$trans) +\end{example} + +The second argument of \code{CoxRFX} (`\code{surv}') is a survival object that can easily be built by feeding the outcome variable columns of the data to the function \code{Surv} (from the package \CRANpkg{survival}). 
+Whether \code{CoxRFX} fits a clock-forward model or a clock-reset model depends on the kind of survival object: +\begin{example} +#argument `surv' for a clock-forward model +surv <- Surv(mstate_data$Tstart,mstate_data$Tstop,mstate_data$status) + +#argument `surv' for a clock-reset model +surv <- Surv(mstate_data$time,mstate_data$status) +\end{example} + +The argument \code{groups} of \code{CoxRFX} is a vector whose length equals the number of covariates in the data. In other words, the length of \code{groups} is \code{ncol(Z)-2}, since the argument \code{Z} must include both the covariate data and the \code{strata} and \code{trans} columns. If, for $i \neq j $, \code{groups[i]}=\code{groups[j]} $=\text{`foo'}$, this means that the regression coefficients of the $i^{th}$ and $j^{th}$ covariates of \code{Z} both belong to a group named `foo' of coefficients with the same prior. For the \code{Z} object built above, the \code{groups} argument created in the following block of code embodies the assumption that all coefficients associated with a given transition have the same prior distribution. The final line of code fits the empirical Bayes model. + +\begin{example} +#argument `groups' of coxrfx +groups_12 <- paste0(rep("group",ncol(Z)-2),c("_1","_2")) + +#fit random effects model +model_12 <- CoxRFX(Z_12,surv,groups_12,tmat) +\end{example} + + +Figure \ref{fig:coef_plots} shows regression coefficient point estimates for a clock-reset, empirical Bayes model fitted with the code above. Also shown are 95\% non-parametric bootstrap confidence intervals computed using \code{ebmstate::boot\_ebmstate}. The $x$-axis scale is logarithmic to allow estimates to be read as relative hazards more easily. 
For example, a mutation in \textit{RUNX1} is associated with a twofold increase in the hazard of progression from MDS to AML, and treatment centre 4 is associated with a 3-fold increase in the hazard of dying before progressing to AML, when compared to the baseline value of `treatment centre' (treatment centre = 2 or 5). In covariates that have been log-transformed (age, platelet count and neutrophil count) or logit-transformed (proportions of myeloblasts and ring sideroblasts in the bone marrow), the interpretation of estimates is different. For example, an increase in age by a factor of $e$ ($\approx 2.72$) almost triples the hazard of dying before AML; the same increase in the ratio $bm\_blasts/(1-bm\_blasts)$ (where \textit{bm\_blasts} is the proportion of myeloblasts in the bone marrow) is associated with an increment in the hazard of dying before AML of approximately $16\%$. + + + + +\subsection{Computing cumulative transition hazard estimates} +\label{sec:computing_cumulative_hazards} +The function \code{msfit\_generic} is the generic function in \CRANpkg{ebmstate} that computes cumulative transition hazards for a given set of covariate values and an estimated Cox model. It calls a different method according to the class of its \code{object} argument. The default method corresponds to the original \code{msfit} function of the \CRANpkg{mstate} package and is appropriate for objects of class \code{coxph}, i.e., objects that contain the fit of a Cox model with fixed effects. The other available method for \code{msfit\_generic}, \code{msfit\_generic.coxrfx}, is just the original \code{msfit} function, (slightly) adapted to deal with objects generated by \code{CoxRFX}. +Quite importantly, \code{msfit\_generic.coxrfx} does not allow the variance of the cumulative hazards to be computed, as this computation relies on asymptotic results which may not be valid for an empirical Bayes model. 
As a result, it only has two other arguments apart from the object of class \code{coxrfx}: a data frame with the covariate values of the patient whose cumulative hazards we want to compute; and a transition matrix describing the states and transitions in the model (such as the one that can be generated using \code{transMat} from the package \CRANpkg{mstate}). +The following block of code exemplifies how these objects can be built and generates the \code{msfit} object containing the cumulative transition hazard estimates for a sample patient. Note that the object with the patient data must include a row for each transition, as well as a column specifying the transition stratum of each row of covariates. + +\begin{example} +# Build `patient_data' data frame with the covariate values for which +# cumulative hazards are to be computed (covariate values of patient 78): +patient_data <- mstate_data[mstate_data$id == 78,,drop = F][rep(1,3),] +patient_data$strata <- patient_data$trans <- 1:3 +patient_data <- mstate::expand.covs( + patient_data, + covs = names(patient_data)[ ! names(patient_data) %in% outcome_covs], + append = T +) +patient_data <- patient_data[ ! grepl(".3",names(patient_data),fixed = T)] + +# The `patient_data' data frame has only 3 rows (one for each transition). +# The output below shows its `id' and `trans' columns +# and expanded covariates ASXL1 and DNMT3A: + id trans ASXL1.1 ASXL1.2 DNMT3A.1 DNMT3A.2 [...] + 78 1 1 0 0 0 . + 78 2 0 1 0 0 . + 78 3 0 0 0 0 . + +# compute cumulative hazards +msfit_object_12 <- msfit_generic(model_12,patient_data,tmat) +\end{example} + +Figure \ref{fig:patient78_cumhaz} shows three plots of estimated cumulative transition hazards for the sampled patient, one for each transition in the model, along with $95\%$ non-parametric bootstrap confidence intervals (computed with \code{ebmstate::boot\_ebmstate}). 
Throughout the plotted period, the `slope' of the cumulative hazard (i.e., the hazard rate) for the MDS to AML transition is lower than the one for the MDS to death transition, and this in turn is lower than the one for the AML to death transition. It should be recalled that the cumulative hazard estimate is strictly non-parametric for this last transition, i.e., it is the same for all patients. The central plot of figure \ref{fig:patient78_cumhaz} suggests that, as time since diagnosis goes by, the hazard of dying in MDS increases (possibly an effect of age). On the other hand, the hazard of dying in AML seems to decrease (slightly) with time (rightmost plot). Conclusions regarding the evolution of the AML hazard are hard to draw, since the confidence intervals for the corresponding cumulative hazard curve are very wide (leftmost plot). + +If an object generated by \code{msfit\_generic} is fed to \code{plot}, and the package \CRANpkg{mstate} is loaded, the method \code{mstate:::plot.msfit} will be called. This is an efficient way of automatically plotting the cumulative hazard estimates for all transitions, but confidence interval lines (separately estimated) cannot be added. + + + +\subsection{Computing state occupation probability estimates} +\label{sec:computing_transition_probs} +The functions \code{probtrans\_mstate}, \code{probtrans\_ebmstate} and \code{probtrans\_fft} compute estimates of state occupation probabilities for a given \code{msfit} object. +All three functions generate objects of class \code{probtrans} that can be fed to the \code{plot.probtrans} method from the package \CRANpkg{mstate}. +The first of these functions should only be used for clock-forward models, as it relies on product-limit calculations. It calls the method \code{probtrans\_mstate.default}, if the \code{msfit} object was generated by \code{msfit\_generic.default}, or the method \code{probtrans\_mstate.coxrfx}, if it was generated by \code{msfit\_generic.coxrfx}. 
Both methods are identical to the function \code{probtrans} in the \CRANpkg{mstate} package, with the reserve that \code{probtrans\_mstate.coxrfx} does not allow the computation of the variances or covariances of the state occupation probability estimator. + +The functions \code{probtrans\_ebmstate} and \code{probtrans\_fft} are the functions in \CRANpkg{ebmstate} for the computation of state occupation probability estimates under clock-reset models with a transition structure that has no cycles. When using \code{probtrans\_fft} (the faster, but somewhat less stable, of these two functions), three arguments must be supplied: the initial state of the process whose state occupation probabilities one wishes to compute, the \code{msfit} object, and the upper time limit for the generation of estimates (\code{max\_time}). Both functions are based on a discrete-time approximation to a series of convolutions. The default argument \code{nr\_steps} controls the number of (equally spaced) time steps used in this approximation. The arguments \code{max\_time} and \code{nr\_steps} should be increased until the estimated curves become stable. + +The following line of code computes point estimates of state occupation probabilities for the sample patient. + +\begin{example} +probtrans_object_12 <- probtrans_fft("MDS",msfit_object_12, max_time = 4000) +\end{example} +Estimates are shown in figure \ref{fig:patient78_transProbs}, along with $95\%$ non-parametric, bootstrap confidence intervals. For this particular patient, the estimated probability of being dead after AML remains below 0.4 throughout a period of 10 years from the MDS diagnosis; if the patient does reach AML, death is expected to happen quickly thereafter, as reflected in the very low estimates for the probability of being in AML at any point in time. 
The following block of code shows how to compute confidence intervals with \code{boot\_ebmstate}: + +\begin{example} +# Creating the object arguments for boot_ebmstate() + +# The `groups' argument was already created, but we need to add names to it +names(groups_12) <- names(covariates_expanded_12) + +# `mstate_data_expanded' argument (similar to `covariates_expanded' but +# including outcome variables) +mstate_data_expanded <- cbind( + mstate_data[names(mstate_data) %in% outcome_covs], + covariates_expanded_12 +) + +# create the non-parametric bootstrap confidence intervals +boot_ebmstate_object <- boot_ebmstate( + mstate_data = mstate_data_expanded, + which_group = groups_12, + min_nr_samples = 100, + patient_data = patient_data, + tmat = tmat, + initial_state = "MDS", + time_model = "clockreset", + input_file = NULL, + coxrfx_args = list(max.iter = 200), + probtrans_args = list(max_time = 4000) +) +\end{example} + + +\subsection{Model assessment} +\label{sec:model_assessment} +For any model fitted with \CRANpkg{ebmstate}, two performance metrics can be easily computed: the \textit{concordance} statistic (\citealp{harrell1982evaluating}; see also the help page of \code{survival::concordance} for the definition of concordance) and the \textit{Bayesian Information Criterion} (BIC) score \citep{schwarz1978estimating}. As an example of how these two metrics can be obtained and used for model comparison, suppose we wish to compare `model\_12' fitted above -- which consists of a Cox regression including all covariates for transitions 1 and 2 and a fully non-parametric model for transition 3 -- with a model that combines Cox regressions of all covariates for each of the three transitions (denoted `model\_123' below). +The following code snippet shows how to fit this second model. 
+ +\begin{example} +# arguments `groups' and `Z' for fitting a Cox regression model on all transitions +Z_123 <- data.frame( + covariates_expanded_123, + strata = mstate_data$trans, + trans = mstate_data$trans +) +groups_123 <- paste0(rep("group", ncol(Z_123) - 2), c("_1", "_2", "_3")) + +# Fit a Cox regression model for all transitions +model_123 <- CoxRFX(Z = Z_123, surv = surv, groups = groups_123) +\end{example} +\noindent +Running the \code{concordance} function in the \CRANpkg{survival} package for each model yields the following output: +\begin{example} +> concordance(model_12) + Call: + concordance.coxph(object = model_12) + + n= 1210 + Concordance= 0.8131 se= 0.01314 + concordant discordant tied.x tied.y tied.xy + strata=1 18040 2783 0 1 0 + strata=2 37919 9678 0 7 0 + strata=3 0 0 1052 0 4 + +> concordance(model_123) + Call: + concordance.coxph(object = model_123) + + n= 1210 + Concordance= 0.8168 se= 0.01312 + concordant discordant tied.x tied.y tied.xy + strata=1 18041 2782 0 1 0 + strata=2 37920 9677 0 7 0 + strata=3 784 268 0 4 0 +\end{example} +\noindent +The output shows that modelling transition 3 with a Cox model, instead of a fully non-parametric one, has a negligible impact on the overall concordance. However, this is due to the fact that there are far fewer observations for this transition. The concordance for transition 3 only, which corresponds to strata 3, is 0.5 under the fully non-parametric model (i.e., all patients are assigned the same transition hazard) and considerably higher under the Cox regression ($784/(784+268)=0.75$). +Ideally, the comparison of models of different complexity should be carried out on a test sample rather than on the training data. For this purpose, the test data can be input into the \code{concordance} function (argument \code{newdata}). However, in the present case, only 61 patients were ever at risk of dying with AML (i.e. 
of undergoing transition 3), and of these only 41 actually died, so we might prefer to keep all patients in the training data, rather than saving a fraction of them for testing purposes. Such an option will yield more accurate coefficient estimates, at the expense of not allowing the computation of unbiased estimates of model performance. If the goal is only to compare models, we can make do without test data, by using an information score that penalises model complexity, such as the BIC. To facilitate model comparison, the BIC score is one of the attributes of the model fit object: + +\begin{example} +> model_12$BIC + [1] 2508.37 +> model_123$BIC + [1] 2483.49 +\end{example} +\noindent +The best model is the one with the lowest score, so the choice of `model\_123' is confirmed. + + +\section{Discussion} + +We have shown that \CRANpkg{ebmstate} is suitable for higher-dimensional, multi-state survival analysis, and that it is both efficient and easy-to-use. To a significant extent, the user-friendliness of \CRANpkg{ebmstate} stems from the fact that it was not built `from the ground up'. Instead, we produced a package that is more easily accessible to the many users of \CRANpkg{mstate} by taking advantage of whichever features of this package were useful to our method and by eliminating redundancies. +The connection between \CRANpkg{ebmstate} and \CRANpkg{mstate} is based on the fact that the function \code{CoxRFX} takes the same type of input and produces the same type of output as \code{coxph} from the package \code{survival}, and the function \code{probtrans\_fft} (or \code{probtrans\_ebmstate}) has the same type of input and output as \code{probtrans} from \CRANpkg{mstate} (as shown in figure \ref{fig:workflow}). + +We also sought to improve our package's user-friendliness by making it as efficient as possible. The reduction of computational cost is based on two features. 
First, our empirical Bayes method relies on an expectation-maximisation algorithm that estimates both the parameters and the hyper-parameters of the model, i.e., no further tuning of the model is required. Second, in \CRANpkg{ebmstate}, the computation of state occupation probability estimates relies on analytical results rather than on simulation: not only for clock-forward models, where we import from \CRANpkg{mstate} a product-limit estimator, but also for clock-reset models, where we implement our own estimator based on a convolution argument and the fast Fourier transform. + +To our knowledge, \CRANpkg{ebmstate} is the first R package to put together a framework for multi-state model estimation that is complete and suitable for higher-dimensional data. + It does so by implementing point and interval estimators of regression coefficients, cumulative transition hazards and state occupation probabilities, under regularised multi-state Cox models. + In section \nameref{sec:estimator_performance}, the results of the simulation study suggest that for data sets with 100 patients or more and a ratio of $p$ (patients) to $n$ (coefficients per transition) greater than 0.1, the standard Cox model estimator is clearly outperformed by the empirical Bayes one when it comes to the estimation of relative hazards and state occupation probabilities of an out-of-sample patient, or the regression coefficients of the model. However, the same study suggests that using an empirical Bayes method instead of a fully non-parametric one is of limited or no value in settings where $p/n \geq 1$. This loss of usefulness can already happen for $p/n\leq 1/2$ when it comes to the estimation of the relative hazards of an out-of-sample patient, especially for transition structures with multiple competing transitions. 
+ + As mentioned in previous sections, \CRANpkg{ebmstate} imports a product-limit estimator from \CRANpkg{mstate} that targets the state occupation probabilities of patients with \textit{time-fixed} covariate vectors. However, these estimators are extendible to models with time-dependent covariates, as long as these are external and the estimates are conditional on specific covariate paths \citep[][p. 142]{Aalen2008}. For piecewise constant covariates, it is likely that such an adaptation could be obtained by combining transition probability estimates obtained for each period in which the covariates are fixed. While no significant theoretical obstacles are foreseen in this matter, the computer implementation for more than a single piecewise constant covariate is likely to be a laborious task. We have left it therefore for future work. + + \section*{Acknowledgements} + The authors are supported by grant NNF17OC0027594 from the Novo Nordisk Foundation. We thank an anonymous reviewer for their constructive comments and helpful suggestions which led to a much-improved manuscript. + +\section*{Supporting Scripts and Data} +In the supporting Scripts and Data, the file \file{ESM\_1.html} contains additional simulation results and theoretical demonstrations. Additional details on the analysis of the MDS data set are given in the file \file{ESM\_2.html}. The MDS data set is in files \file{MDS.TPD.20Nov2012.csv} and \file{mds.paper.clin.txt}. The file \file{ESM\_3.R} contains a simplified R script to run the code snippets in the present paper. The \CRANpkg{ebmstate} package is available on CRAN. + +\section{Conflict of interest} +The authors have declared no conflict of interest. + +\bibliography{costa-gerstung} + + +\address{Rui J. Costa\\ + European Molecular Biology Laboratory\\ + European Bioinformatics Institute (EMBL-EBI)\\ + Hinxton, CB10 1SD\\ + United Kingdom\\ + \email{ruibarrigana@hotmail.com}} + +\address{Moritz Gerstung\\ + aff. 
1: European Molecular Biology Laboratory\\ European Bioinformatics Institute (EMBL-EBI)\\ + Hinxton, CB10 1SD\\ + United Kingdom\\ + aff. 2: German Cancer Research Center (DKFZ)\\ + Im Neuenheimer Feld 280\\ + 69120 Heidelberg\\ + Germany\\ + \email{moritz.gerstung@dkfz.de}} + +\clearpage + + {\LARGE \textbf{Figures}} +\vspace{3cm} + +\begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/package_summary_figure.pdf} + \caption{Summary of inputs and outputs of the package \texttt{ebmstate}. The input data set should be one that violates the assumption -- commonly used in survival analysis -- that the number of observations is much larger than the number of parameters to be estimated (a genomic-clinical data set is shown as a typical example). The input model is a multi-state Cox model defined by a transition structure and a prior distribution on the regression coefficients. This prior distribution is defined by partitioning the vector of regression coefficients into groups of regression coefficients, with each group having its own Gaussian prior with undetermined mean and variance. The outputs of \texttt{ebmstate} include estimates of the relative transition hazards associated with each covariate, as well as estimates of the probability that a specific patient (with specific covariate measurements) has of occupying each state of the model over some time period. Estimates of cumulative transition hazards are omitted from the figure.} + \label{fig:package_summary_figure} + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=13.5cm, angle=0]{figures/mssample_and_probtrans_fft.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Comparison of running times and estimation accuracy of \texttt{mssample} and \texttt{probtrans\_fft}. Each plot in the grid shows two estimated curves of state occupation probabilities. 
The black curves are based on a single run of \texttt{mstate::mssample} with $n=100{,}000$ observations (approximately 17 minutes of running time) and are the same across columns. They serve as a benchmark for precision assessment. In columns 1 to 3 of the grid, the superimposed red curves are based on a run of \texttt{mssample} with respectively 100, 1000, and 10,000 observations. In the rightmost column, the red curves are based on a run of \texttt{probtrans\_fft}. All functions have as input the same set of cumulative transition hazards. These were estimated using a non-parametric multi-state model and a data set of 1000 patients generated according to a clock-reset Cox model with a `linear' transition structure (leftmost diagram of figure \ref{fig:transition_structures}). Plots in the same row refer to the same state of the model, while those in the same column refer to the same run of a function. Running times and, where appropriate, number of simulations ($n$) are given on top of each column.} + \label{fig:mssample} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/transition_structures.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Model transition structures. 
We studied the performance of Cox model estimators, empirical Bayes Cox model estimators and fully non-parametric estimators with respect to these 3 transition structures.} + \label{fig:transition_structures} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/na_props_100patients_coxph.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Proportions of valid, infinite and missing (`NA') estimates for the standard Cox model estimators in the simulation study of figure \ref{fig:estimator_performance_boxplots_100patients} (100 patients per simulated data set).} + \label{fig:na_props_100patients_coxph} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/na_props_1000patients_coxph.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Proportions of valid, infinite and missing (`NA') estimates for the standard Cox model estimators in the simulation study of figure \ref{fig:estimator_performance_boxplots_1000patients} (1000 patients per simulated data set).} + \label{fig:na_props_1000patients_coxph} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/estimator_performance_boxplots_100patients.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with \textbf{100 observations} each. 
In the figure grid there is a boxplot corresponding to every tuple $(a,m, G, p)$ such that $a\in \lbrace$regression coefficients, relative hazards, state occupation probabilities$\rbrace$ is the target of estimation, $m\in \lbrace$standard Cox, empirical Bayes Cox, null$\rbrace$ is the hazard model, $G \in \lbrace$linear, competing risks, `m' structure$\rbrace$ is the transition structure of the model, and $p \in \lbrace 10,40,70,100 \rbrace$ is the number of coefficients/covariates per transition. + Each boxplot is based on at most 300 average absolute error observations. + Figure \ref{fig:na_props_100patients_coxph}, together with figures 6.1 and 6.3 in file ESM\_1.html of the Supporting Scripts and Data, shows the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot's y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. + } + \label{fig:estimator_performance_boxplots_100patients} % label for the figure + \end{figure} + + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/estimator_performance_boxplots_1000patients.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{ + Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with \textbf{1000 observations} each. 
In the figure grid there is a boxplot corresponding to every tuple $(a,m, G, p)$ such that $a\in \lbrace$regression coefficients, relative hazards, state occupation probabilities$\rbrace$ is the target of estimation, $m\in \lbrace$standard Cox, empirical Bayes Cox, null$\rbrace$ is the hazard model, $G \in \lbrace$linear, competing risks, `m' structure$\rbrace$ is the transition structure of the model, and $p \in \lbrace 10,100,200,300,400,500 \rbrace$ is the number of coefficients/covariates per transition. + Each boxplot is based on at most 300 average absolute error observations. + Figure \ref{fig:na_props_1000patients_coxph}, together with figures 6.2 and 6.3 in file ESM\_1.html of the Supporting Scripts and Data, shows the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot's y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. + } + \label{fig:estimator_performance_boxplots_1000patients} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=13.5cm, angle=0]{figures/workflow0.pdf} % width changes size + \vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Extension of the \texttt{mstate} analysis framework by \texttt{ebmstate}. Arrows correspond to functions. Boxes correspond to inputs or outputs of functions. Functions \texttt{CoxRFX} and \texttt{probtrans\_fft} from \texttt{ebmstate} compute point estimates only. 
Interval estimates can be obtained using the non-parametric bootstrap algorithm implemented in the function \texttt{ebmstate::boot\_ebmstate}.} + \label{fig:workflow} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=14.5cm, angle=0]{figures/data_summary_figs2.pdf} % width changes size + \vspace*{-0.25cm} % manual adjustment of vertical spacing + \caption{\textbf{a}: transition model implied by the data set of patients with myelodysplastic syndromes, together with transition event numbers; \textbf{b}: conversion to a transition structure without cycles; \textbf{c}: transformations applied to the MDS covariate data and summary statistics for the data before transformation. MDS stands for \textit{myelodysplastic syndromes}; AML stands for \textit{acute myeloid leukemia}.} + \label{fig:trans_diagrams} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=12.5cm, angle=0]{figures/coef_plots.pdf} % width changes size + \vspace*{-0.25cm} % manual adjustment of vertical spacing + \caption{Point estimates of regression coefficients for the Cox model fitted to the MDS data, along with 95\% non-parametric bootstrap confidence intervals. The $x$-axis scale is logarithmic so that coefficient estimates can be read as relative hazard estimates. If $\gamma_{ij}$ is the element of $\hat{\boldsymbol{\beta}}_{ij}$ associated with a given covariate, $\exp\left(\gamma_{ij}\right)$ is the estimated relative hazard for this covariate in transition $\left(i,j\right)$. In general, a relative hazard estimate $r$ for a covariate $z$ in transition $\left(i,j\right)$ means that a one-unit increase in $z$ is associated with an $r$-fold increase in the hazard of this transition. If $z$ was obtained by log-transformation (as in age, platelet counts and neutrophil counts), a one-unit increase in $z$ corresponds to scaling the original covariate by $e\approx 2.72$. 
In case $z$ was obtained by logit-transformation (as in bone marrow blasts and sideroblasts proportions), the same one-unit increase corresponds to scaling the odds of the original covariate by $e$.} + \label{fig:coef_plots} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=12.5cm, angle=0]{figures/patient78_cumhaz_final.png} % width changes size + %\vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Point estimates of cumulative transition hazards for a sample patient with MDS (black curve), along with $95\%$ non-parametric confidence intervals (dashed red lines).} + \label{fig:patient78_cumhaz} % label for the figure + \end{figure} + + \begin{figure}[h] + \centering + \includegraphics[width=10.5cm, angle=0]{figures/patient78_transProbs_final.png} % width changes size + %\vspace*{0.25cm} % manual adjustment of vertical spacing + \caption{Point estimates of state occupation probabilities for a sample patient with MDS (black curve), along with $95\%$ non-parametric confidence intervals (dashed red lines).} + \label{fig:patient78_transProbs} % label for the figure + \end{figure} + + % \begin{figure}[h] + % \centering + % \includegraphics[width=12.5cm, angle=0]{figures/trans_probs_mosaic.pdf} % width changes size + % \vspace*{-0.25cm} % manual adjustment of vertical spacing + % \caption{Leave-one-out estimates of state occupation probabilities for a random sample of 196 individuals with MDS. 
The $x$-axis measures time since diagnosis and runs from 0 to 10 years.} + % \label{fig:trans_probs_mosaic} % label for the figure + % \end{figure} + + \enlargethispage{3.0\baselineskip} + \ No newline at end of file diff --git a/_articles/RJ-2024-002/data/MDS.TPD.20Nov2012.csv b/_articles/RJ-2024-002/data/MDS.TPD.20Nov2012.csv new file mode 100644 index 0000000000..a8e39304ad --- /dev/null +++ b/_articles/RJ-2024-002/data/MDS.TPD.20Nov2012.csv @@ -0,0 +1 @@ +ID_VARIANT,SAMPLE NAME,CHR,POSITION,WT,MT,Type,%_MUT_IN_NORM,NORM_DEPTH,%_MUT_IN_TUM,TUM_DEPTH,Repeat Flag,Gene,CCDS,c.,p.,Decision 58190874,PD6919a,1,43804954,C,G,Sub,0,17,43.08,65,na,MPL,CCDS483.1,c.404C>G,p.P135R,UNKNOWN 58169904,PD6075a,1,43806103,C,G,Sub,0,64,47.8,500,na,MPL,CCDS483.1,c.899C>G,p.T300S,UNKNOWN 58196115,PD6527a,1,43806157,G,T,Sub,0,53,48.2,500,na,MPL,CCDS483.1,c.953G>T,p.S318I,UNKNOWN 58105139,PD6283a,1,43812480,A,G,Sub,0,34,46.52,460,na,MPL,CCDS483.1,c.1183A>G,p.N395D,UNKNOWN 66763344,PD8936a,1,43814979,G,A,Sub,0,79,14.52,124,na,MPL,CCDS483.1,c.1514G>A,p.S505N,ONCOGENIC 58072725,PD7119a,1,43815009,G,T,Sub,0,58,13.93,201,na,MPL,CCDS483.1,c.1544G>T,p.W515L,ONCOGENIC 58092084,PD6840a,1,43815009,G,T,Sub,0,58,51.76,170,na,MPL,CCDS483.1,c.1544G>T,p.W515L,ONCOGENIC 58141078,PD6987a,1,43815009,G,T,Sub,0,58,73.82,191,na,MPL,CCDS483.1,c.1544G>T,p.W515L,ONCOGENIC 58101094,PD6539a,1,43818229,C,T,Sub,0,44,38.57,293,na,MPL,CCDS483.1,c.1694C>T,p.P565L,UNKNOWN 58092610,PD7016a,1,43818294,G,A,Sub,0,71,16.59,211,na,MPL,CCDS483.1,c.1759G>A,p.A587T,UNKNOWN 58164529,PD6883a,1,43818306,T,G,Sub,0,84,34.95,309,na,MPL,CCDS483.1,c.1771T>G,p.Y591D,ONCOGENIC 208125438,PD6229a,1,43818427,a,-,D,0,85,7.511737089,212,1,MPL,CCDS483.1,c.1892delA,p.Y631fs*>5,UNKNOWN 58159356,PD6972a,1,52255328,A,G,Sub,0,32,50,500,na,NRD1,CCDS559.1,c.3374T>C,p.I1125T,UNKNOWN 58154383,PD6858a,1,52256599,G,C,Sub,0,24,49.29,424,na,NRD1,CCDS559.1,c.3228C>G,p.N1076K,UNKNOWN 
58191110,PD7082a,1,52260248,G,A,Sub,0,31,45.51,490,na,NRD1,CCDS559.1,c.2875C>T,p.P959S,UNKNOWN 66786136,PD7384a,1,52260502,C,T,Sub,0,56,5.81,155,na,NRD1,CCDS559.1,c.2833G>A,p.V945I,UNKNOWN 66763345,PD8936a,1,52260534,G,A,Sub,0,47,14.67,75,na,NRD1,CCDS559.1,c.2809-8C>T,p.?,UNKNOWN 58143714,PD6812a,1,52263959,A,G,Sub,0,50,45.21,146,na,NRD1,CCDS559.1,c.2770T>C,p.F924L,UNKNOWN 66763347,PD8936a,1,52271189,C,T,Sub,0,13,5.38,223,na,NRD1,CCDS559.1,c.2359G>A,p.A787T,UNKNOWN 58085575,PD6095a,1,52283787,G,T,Sub,0,137,51.29,232,na,NRD1,CCDS559.1,c.1516C>A,p.H506N,UNKNOWN 58094029,PD6779a,1,52283787,G,T,Sub,0,137,39.51,329,na,NRD1,CCDS559.1,c.1516C>A,p.H506N,UNKNOWN 208174677,PD6880a,1,52305973,atc,-,D,0,107,41.05011933,370,3,NRD1,CCDS559.1,c.553_555delGAT,p.D185delD,UNKNOWN 58079555,PD6098a,1,115252197,G,C,Sub,0,64,21,500,na,NRAS,CCDS877.1,c.443C>G,p.T148S,UNKNOWN 58115743,PD6278a,1,115256521,A,T,Sub,0,142,33,500,na,NRAS,CCDS877.1,c.190T>A,p.Y64N,ONCOGENIC 58189780,PD6147a,1,115256528,T,A,Sub,0,159,39.4,500,na,NRAS,CCDS877.1,c.183A>T,p.Q61H,ONCOGENIC 66902144,PD8937a,1,115256532,C,T,Sub,0.66,152,13.67,373,na,NRAS,CCDS877.1,c.179G>A,p.G60E,ONCOGENIC 58102260,PD6270a,1,115258744,C,A,Sub,0,109,29.8,500,na,NRAS,CCDS877.1,c.38G>T,p.G13V,ONCOGENIC 58126001,PD6242a,1,115258744,C,A,Sub,0,109,15.2,500,na,NRAS,CCDS877.1,c.38G>T,p.G13V,ONCOGENIC 58162692,PD6904a,1,115258744,C,A,Sub,0,109,21.8,500,na,NRAS,CCDS877.1,c.38G>T,p.G13V,ONCOGENIC 58203015,PD6946a,1,115258744,C,A,Sub,0,109,10,500,na,NRAS,CCDS877.1,c.38G>T,p.G13V,ONCOGENIC 66887156,PD7382a,1,115258745,C,A,Sub,0,111,8.26,351,na,NRAS,CCDS877.1,c.37G>T,p.G13C,ONCOGENIC 68195840,PD8730a,1,115258745,C,A,Sub,0,111,5.8,500,na,NRAS,CCDS877.1,c.37G>T,p.G13C,ONCOGENIC 58081540,PD6079a,1,115258747,C,T,Sub,0,112,15.2,500,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 58083097,PD6223a,1,115258747,C,T,Sub,0,112,28.8,500,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 
58090935,PD6261a,1,115258747,C,T,Sub,0,112,13.8,500,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 58093801,PD6313a,1,115258747,C,A,Sub,0,112,38.84,484,na,NRAS,CCDS877.1,c.35G>T,p.G12V,ONCOGENIC 58126002,PD6242a,1,115258747,C,T,Sub,0,112,18.4,500,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 58135523,PD6869a,1,115258747,C,T,Sub,0,112,43.4,500,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 58143414,PD5726a,1,115258747,C,A,Sub,0,112,47,500,na,NRAS,CCDS877.1,c.35G>T,p.G12V,ONCOGENIC 58190873,PD6919a,1,115258747,C,T,Sub,0,112,23.36,428,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 66960549,PD8734a,1,115258747,C,T,Sub,0,112,44.06,379,na,NRAS,CCDS877.1,c.35G>A,p.G12D,ONCOGENIC 68195635,PD8731a,1,115258747,C,A,Sub,0,112,17.75,293,na,NRAS,CCDS877.1,c.35G>T,p.G12V,ONCOGENIC 58115272,PD6887a,1,115258748,C,G,Sub,0,113,44.2,500,na,NRAS,CCDS877.1,c.34G>C,p.G12R,ONCOGENIC 58203016,PD6946a,1,115258748,C,A,Sub,0,113,24,500,na,NRAS,CCDS877.1,c.34G>T,p.G12C,ONCOGENIC 66951417,PD7391a,1,115258748,C,T,Sub,0,113,51.28,390,na,NRAS,CCDS877.1,c.34G>A,p.G12S,ONCOGENIC 58108696,PD7032a,1,117554247,C,T,Sub,0,172,53.13,288,na,CD101,CCDS891.1,c.500C>T,p.T167I,UNKNOWN 347174097,PD7375a,1,117554571,-,T,I,0,141,36.11111111,72,1,CD101,CCDS891.1,c.824_825insT,p.R276fs*16,UNKNOWN 58079556,PD6098a,1,117556304,G,A,Sub,0,50,11.76,102,na,CD101,CCDS891.1,c.1118G>A,p.G373D,UNKNOWN 58131532,PD6537a,1,117556312,A,T,Sub,0,40,46.45,155,na,CD101,CCDS891.1,c.1126A>T,p.R376*,UNKNOWN 67016618,PD7386a,1,117559847,C,T,Sub,0,97,12.82,117,na,CD101,CCDS891.1,c.1364C>T,p.T455I,UNKNOWN 58098413,PD6543a,1,117559886,T,C,Sub,0,79,48.85,305,na,CD101,CCDS891.1,c.1403T>C,p.I468T,UNKNOWN 58181027,PD6488a,1,117559886,T,C,Sub,0,79,54.55,187,na,CD101,CCDS891.1,c.1403T>C,p.I468T,UNKNOWN 58197379,PD6979a,1,117564368,G,A,Sub,0,145,18.07,83,na,CD101,CCDS891.1,c.2191G>A,p.D731N,UNKNOWN 66976704,PD8939a,1,117568193,G,A,Sub,0,226,10.34,87,na,CD101,CCDS891.1,c.2491G>A,p.A831T,UNKNOWN 
58157190,PD6279a,1,117568280,A,G,Sub,0,203,48.76,242,na,CD101,CCDS891.1,c.2578A>G,p.K860E,UNKNOWN 58198248,PD7001a,1,117568425,G,C,Sub,0,118,37,300,na,CD101,CCDS891.1,c.2723G>C,p.W908S,UNKNOWN 58095594,PD6100a,1,117568471,G,A,Sub,0,119,10.17,59,na,CD101,CCDS891.1,c.2769G>A,p.W923*,UNKNOWN 58174984,PD7030a,1,117576697,G,T,Sub,0,64,48.8,500,na,CD101,CCDS891.1,c.3040G>T,p.D1014Y,UNKNOWN 58105261,PD6944a,1,151785720,A,A,Sub,0,24,14.81,81,na,RORC,CCDS1004.1,c.1169C>T,p.A390V,UNKNOWN 58071091,PD6083a,1,151785996,C,T,Sub,0,60,32.6,273,na,RORC,CCDS1004.1,c.1034G>A,p.C345Y,UNKNOWN 58201825,PD6894a,1,151787445,C,T,Sub,0,47,12.5,40,na,RORC,CCDS1004.1,c.755G>A,p.G252D,UNKNOWN 66876090,PD8735a,1,151787539,C,T,Sub,0,94,13.45,171,na,RORC,CCDS1004.1,c.661G>A,p.G221S,UNKNOWN 58163139,PD7077a,1,151787610,C,T,Sub,0,47,10.73,410,na,RORC,CCDS1004.1,c.590G>A,p.G197E,UNKNOWN 58194992,PD6905a,1,151787890,C,T,Sub,0,91,19.15,47,na,RORC,CCDS1004.1,c.310G>A,p.G104S,UNKNOWN 66955100,PD7365a,1,151804212,C,T,Sub,0,14,47.06,17,na,RORC,CCDS1004.1,c.29G>A,p.R10Q,UNKNOWN 58098189,PD6256a,2,25457159,C,G,Sub,0,112,49.41,170,na,DNMT3A,CCDS33157.1,c.2728G>C,p.A910P,ONCOGENIC 58194581,PD6107a,2,25457176,G,A,Sub,0.91,110,34.33,201,na,DNMT3A,CCDS33157.1,c.2711C>T,p.P904L,ONCOGENIC 58096998,PD6939a,2,25457230,T,C,Sub,0,85,38.46,143,na,DNMT3A,CCDS33157.1,c.2657A>G,p.Q886R,ONCOGENIC 58074981,PD6849a,2,25457242,C,T,Sub,0,81,43.79,153,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58078371,PD5747a,2,25457242,C,T,Sub,0,81,46.2,184,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58079920,PD6520a,2,25457242,C,T,Sub,0,81,44.76,105,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58081156,PD5768a,2,25457242,C,G,Sub,0,81,23.78,185,na,DNMT3A,CCDS33157.1,c.2645G>C,p.R882P,ONCOGENIC 58107368,PD6149a,2,25457242,C,T,Sub,0,81,34.25,73,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58111619,PD6968a,2,25457242,C,T,Sub,0,81,26.67,210,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 
58129745,PD6974a,2,25457242,C,T,Sub,0,81,42.24,161,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58130630,PD6482a,2,25457242,C,T,Sub,0,81,30.61,147,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58135048,PD7079a,2,25457242,C,T,Sub,0,81,50.62,162,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58140092,PD6135a,2,25457242,C,T,Sub,0,81,49.4,166,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58166844,PD6861a,2,25457242,C,T,Sub,0,81,10.32,126,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58173037,PD6233a,2,25457242,C,T,Sub,0,81,32.93,167,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58200960,PD6288a,2,25457242,C,T,Sub,0,81,47.3,148,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 66786093,PD7384a,2,25457242,C,T,Sub,0,81,34.88,43,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 67027928,PD8732a,2,25457242,C,T,Sub,0,81,39.06,64,na,DNMT3A,CCDS33157.1,c.2645G>A,p.R882H,ONCOGENIC 58069352,PD7021a,2,25457243,G,A,Sub,0,77,10.43,115,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58072347,PD6926a,2,25457243,G,A,Sub,0,77,38.78,147,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58081830,PD7111a,2,25457243,G,A,Sub,0,77,17.09,199,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58088186,PD7088a,2,25457243,G,A,Sub,0,77,35.8,176,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58133714,PD6163a,2,25457243,G,A,Sub,0,77,50.53,95,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58152393,PD5746a,2,25457243,G,T,Sub,0,77,18.64,177,na,DNMT3A,CCDS33157.1,c.2644C>A,p.R882S,ONCOGENIC 58171941,PD7090a,2,25457243,G,A,Sub,0,77,10.14,138,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58177754,PD5732a,2,25457243,G,A,Sub,0,77,33.12,157,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58183163,PD6311a,2,25457243,G,A,Sub,0,77,14.69,177,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58202866,PD6909a,2,25457243,G,A,Sub,0,77,11.76,119,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 
58203755,PD6202a,2,25457243,G,A,Sub,0,77,37.11,97,na,DNMT3A,CCDS33157.1,c.2644C>T,p.R882C,ONCOGENIC 58189932,PD7084a,2,25457284,A,G,Sub,0,59,19.8,101,na,DNMT3A,CCDS33157.1,c.2603T>C,p.F868S,ONCOGENIC 58068318,PD6197a,2,25458595,A,G,Sub,0,127,41.18,255,na,DNMT3A,CCDS33157.1,c.2578T>C,p.W860R,ONCOGENIC 208348707,PD6186a,2,25458597,a,-,D,0,129,4.87804878,82,2,DNMT3A,CCDS33157.1,c.2576delT,p.L859fs*22,ONCOGENIC 58119597,PD6190a,2,25458604,C,T,Sub,0,125,10,190,na,DNMT3A,CCDS33157.1,c.2569G>A,p.D857N,ONCOGENIC 58171347,PD6862a,2,25458627,G,A,Sub,0,111,30.04,263,na,DNMT3A,CCDS33157.1,c.2546C>T,p.P849L,ONCOGENIC 58150716,PD5755a,2,25458649,G,C,Sub,0,118,17.1,269,na,DNMT3A,CCDS33157.1,c.2524C>G,p.Q842E,ONCOGENIC 66971083,PD7379a,2,25458690,C,T,Sub,0,115,12,50,na,DNMT3A,CCDS33157.1,c.2483G>A,p.S828N,ONCOGENIC 207917719,PD5762a,2,25459846,gctta<6>acagt,-,D,0,15,25.73529412,114,0,DNMT3A,CCDS33157.1,c.2422_2437del16,p.T808fs*12,ONCOGENIC 58160559,PD7045a,2,25463209,C,A,Sub,0,45,32.14,56,na,DNMT3A,CCDS33157.1,c.2284G>T,p.G762C,ONCOGENIC 66937067,PD7383a,2,25463235,C,T,Sub,0,42,36,50,na,DNMT3A,CCDS33157.1,c.2258G>A,p.W753*,ONCOGENIC 58068532,PD6540a,2,25463286,C,T,Sub,0,36,36.44,118,na,DNMT3A,CCDS33157.1,c.2207G>A,p.R736H,ONCOGENIC 58140465,PD6538a,2,25463287,G,A,Sub,0,36,42.31,78,na,DNMT3A,CCDS33157.1,c.2206C>T,p.R736C,ONCOGENIC 58126854,PD6151a,2,25463289,T,C,Sub,0,35,36.36,33,na,DNMT3A,CCDS33157.1,c.2204A>G,p.Y735C,ONCOGENIC 208021714,PD7119a,2,25463298,aag,-,D,0,33,11.39240506,79,na,DNMT3A,CCDS33157.1,c.2193_2195delCTT,p.F732delF,ONCOGENIC 66931073,PD7381a,2,25463298,A,G,Sub,0,33,48.15,27,na,DNMT3A,CCDS33157.1,c.2195T>C,p.F732S,ONCOGENIC 58089148,PD5742a,2,25463299,A,G,Sub,0,33,38.78,49,na,DNMT3A,CCDS33157.1,c.2194T>C,p.F732L,ONCOGENIC 208503932,PD6850a,2,25463304,a,-,D,0,30,8.571428571,70,1,DNMT3A,CCDS33157.1,c.2189delT,p.L730fs*49,ONCOGENIC 58198299,PD7001a,2,25463523,C,T,Sub,0,37,49.23,65,na,DNMT3A,CCDS33157.1,c.2159G>A,p.R720H,ONCOGENIC 
58198300,PD7001a,2,25463524,G,C,Sub,0,37,37.88,66,na,DNMT3A,CCDS33157.1,c.2158C>G,p.R720G,ONCOGENIC 58201795,PD6894a,2,25463572,C,T,Sub,0,41,11.9,42,na,DNMT3A,CCDS33157.1,c.2110G>A,p.V704M,ONCOGENIC 209612626,PD5777a,2,25463586,c,-,D,0,39,23.38709677,124,4,DNMT3A,CCDS33157.1,c.2096delG,p.G699fs*6,ONCOGENIC 66971084,PD7379a,2,25463587,C,G,Sub,0,37,55.56,9,na,DNMT3A,CCDS33157.1,c.2095G>C,p.G699R,ONCOGENIC 208305307,PD6295a,2,25464470,g,-,D,0,27,21.12676056,71,1,DNMT3A,CCDS33157.1,c.2043delC,p.M682fs*23,ONCOGENIC 58180116,PD5781a,2,25464576,C,T,Sub,0,17,20.9,134,na,DNMT3A,CCDS33157.1,c.1937G>A,p.G646E,ONCOGENIC 208325904,PD6266a,2,25467139,t,-,D,0,9,43.5483871,62,1,DNMT3A,CCDS33157.1,c.1736delA,p.D579fs*72,ONCOGENIC 58192148,PD6141a,2,25467436,A,C,Sub,0,38,52.68,112,na,DNMT3A,CCDS33157.1,c.1640T>G,p.L547R,POSSIBLE ONCOGENIC 58096999,PD6939a,2,25467473,A,G,Sub,0,37,37.74,106,na,DNMT3A,CCDS33157.1,c.1603T>C,p.S535P,POSSIBLE ONCOGENIC 58182686,PD6171a,2,25467498,G,T,Sub,0,38,27.27,44,na,DNMT3A,CCDS33157.1,c.1578C>A,p.Y526*,ONCOGENIC 208290510,PD6300a,2,25468151,ag,-,D,0,11,30.23255814,85,2,DNMT3A,CCDS33157.1,c.1524_1525delCT,p.F509fs*36,ONCOGENIC 208059672,PD6523a,2,25469043,t,-,D,0,49,49.27536232,138,1,DNMT3A,CCDS33157.1,c.1415delA,p.D472fs*179,ONCOGENIC 209684134,PD6920a,2,25469090,ct,-,D,0,35,12.06896552,116,1,DNMT3A,CCDS33157.1,c.1367_1368delAG,p.K456fs*16,ONCOGENIC 58140339,PD7112a,2,25469114,G,T,Sub,0,27,47,217,na,DNMT3A,CCDS33157.1,c.1344C>A,p.Y448*,ONCOGENIC 209719732,PD6869a,2,25469149,t,-,D,0,24,41.30434783,91,1,DNMT3A,CCDS33157.1,c.1309delA,p.T437fs*214,ONCOGENIC 58112570,PD7087a,2,25469526,G,C,Sub,0,13,55.32,47,na,DNMT3A,CCDS33157.1,c.1242C>G,p.F414L,ONCOGENIC 208324702,PD5749a,2,25469529,g,-,D,0,13,30.76923077,39,1,DNMT3A,CCDS33157.1,c.1239delC,p.F414fs*237,ONCOGENIC 58086871,PD6125a,2,25469603,C,T,Sub,0,16,55.36,56,na,DNMT3A,CCDS33157.1,c.1165G>A,p.D389N,POSSIBLE ONCOGENIC 
58068171,PD6142a,2,25469940,C,T,Sub,0,64,45.54,101,na,DNMT3A,CCDS33157.1,c.1102G>A,p.A368T,ONCOGENIC 58143237,PD6841a,2,25469945,C,G,Sub,0,68,21.95,41,na,DNMT3A,CCDS33157.1,c.1097G>C,p.R366P,ONCOGENIC 58118464,PD6081a,2,25469975,T,C,Sub,0,80,16.38,116,na,DNMT3A,CCDS33157.1,c.1067A>G,p.Q356R,POSSIBLE ONCOGENIC 58109997,PD6175a,2,25470011,A,T,Sub,0,59,23.81,105,na,DNMT3A,CCDS33157.1,c.1031T>A,p.L344Q,POSSIBLE ONCOGENIC 58086352,PD7017a,2,25470027,C,T,Sub,0,54,61.61,112,na,DNMT3A,CCDS33157.1,c.1015G>A,p.V339M,POSSIBLE ONCOGENIC 346913640,PD8647a,2,25470461,act,-,D,0,76,24.52830189,53,1,DNMT3A,CCDS33157.1,c.1011_1013delAGT,p.V339delV,POSSIBLE ONCOGENIC 208503342,PD6896a,2,25470467,aatttgcc,-,D,0,82,37.24137931,111,1,DNMT3A,CCDS33157.1,c.1000_1007delGGCAAATT,p.G334fs*6,ONCOGENIC 209687793,PD6169a,2,25470469,t,-,D,0,76,12.25806452,155,3,DNMT3A,CCDS33157.1,c.1005delA,p.K335fs*10,ONCOGENIC 58092027,PD6840a,2,25470498,G,A,Sub,0,61,50,152,na,DNMT3A,CCDS33157.1,c.976C>T,p.R326C,POSSIBLE ONCOGENIC 208305308,PD6295a,2,25470519,t,-,D,0,42,5.769230769,156,1,DNMT3A,CCDS33157.1,c.955delA,p.S319fs*26,ONCOGENIC 208497703,PD6259a,2,25470533,-,A,I,0,31,13.15789474,152,1,DNMT3A,CCDS33157.1,c.940_941insT,p.W314fs*10,ONCOGENIC 58097965,PD6231a,2,25470535,C,T,Sub,0,30,28.91,128,na,DNMT3A,CCDS33157.1,c.939G>A,p.W313*,ONCOGENIC 58112572,PD7087a,2,25470559,C,T,Sub,0,27,25.53,141,na,DNMT3A,CCDS33157.1,c.915G>A,p.W305*,ONCOGENIC 58082991,PD6193a,2,25470588,C,A,Sub,0,19,10.98,82,na,DNMT3A,CCDS33157.1,c.886G>T,p.V296L,POSSIBLE ONCOGENIC 68196182,PD8645a,2,25470908,C,A,Sub,0,48,41.67,24,na,DNMT3A,CCDS33157.1,c.853G>T,p.E285*,ONCOGENIC 208121852,PD6495a,2,25470955,-,C,I,0,70,40.22988506,87,1,DNMT3A,CCDS33157.1,c.805_806insG,p.A269fs*12,ONCOGENIC 58179967,PD6799a,2,25470968,C,A,Sub,0,56,57.14,49,na,DNMT3A,CCDS33157.1,c.793G>T,p.V265L,POSSIBLE ONCOGENIC 208506775,PD7073a,2,25471029,agggc<4>ctcct,-,D,0,53,24.76190476,87,0,DNMT3A,CCDS33157.1,c.719_732delAGGAGGCCAGCCCT,p.E240fs*8,ONCOGENIC 
66858117,PD8935a,2,118575018,A,G,Sub,0,22,32.22,90,na,DDX18,CCDS2120.1,c.86-2A>G,p.?,UNKNOWN 209687387,PD6256a,2,118575158,aag,-,D,0,38,37.45583039,241,2,DDX18,CCDS2120.1,c.224_226delAAG,p.E76delE,UNKNOWN 68092935,PD8728a,2,118575301,C,T,Sub,0,75,10.69,159,na,DDX18,CCDS2120.1,c.367C>T,p.P123S,UNKNOWN 58144849,PD6310a,2,118577228,C,T,Sub,0,54,49.52,416,na,DDX18,CCDS2120.1,c.374C>T,p.T125M,UNKNOWN 58162079,PD7026a,2,118577323,G,A,Sub,0,151,46.4,500,na,DDX18,CCDS2120.1,c.469G>A,p.D157N,UNKNOWN 58189792,PD6147a,2,118582184,G,A,Sub,0,221,53,500,na,DDX18,CCDS2120.1,c.1106G>A,p.R369Q,UNKNOWN 66931077,PD7381a,2,118583050,A,G,Sub,0,189,39.8,500,na,DDX18,CCDS2120.1,c.1396A>G,p.T466A,UNKNOWN 58082458,PD6977a,2,118588230,A,G,Sub,0,80,42.86,28,na,DDX18,CCDS2120.1,c.1943A>G,p.K648R,UNKNOWN 58118005,PD7038a,2,145155899,C,T,Sub,0,222,46.15,195,na,ZEB2,CCDS2186.1,c.2855G>A,p.R952K,UNKNOWN 58163810,PD5722a,2,145156499,G,A,Sub,0,500,46.2,500,na,ZEB2,CCDS2186.1,c.2255C>T,p.T752M,UNKNOWN 58079573,PD6098a,2,145156838,A,G,Sub,0,230,11.6,500,na,ZEB2,CCDS2186.1,c.1916T>C,p.V639A,UNKNOWN 58105293,PD6944a,2,145157263,G,A,Sub,0,243,30,10,na,ZEB2,CCDS2186.1,c.1491A>T,p.Q497H,UNKNOWN 208048410,PD6881a,2,145157834,-,T,I,0,96,16.59192825,669,5,ZEB2,CCDS2186.1,c.919_920insA,p.P309fs*47,UNKNOWN 58159702,PD5769a,2,145162544,T,C,Sub,0,98,50.45,331,na,ZEB2,CCDS2186.1,c.451A>G,p.R151G,UNKNOWN 58130866,PD6824a,2,145187437,G,A,Sub,0,60,10.31,485,na,ZEB2,CCDS2186.1,c.230C>T,p.A77V,UNKNOWN 208311585,PD6538a,2,197002207,gg,-,D,0,127,38.96848138,348,3,STK17B,CCDS2315.1,c.1082_1083delCC,p.P361fs*4,UNKNOWN 58200967,PD6288a,2,197002219,A,C,Sub,0,133,52.46,345,na,STK17B,CCDS2315.1,c.1071T>G,p.D357E,UNKNOWN 208149714,PD7089a,2,197002245,tgc,-,D,0,156,38.66995074,352,2,STK17B,CCDS2315.1,c.1043_1045delGCA,p.S348delS,UNKNOWN 58197969,PD6117a,2,197005998,G,A,Sub,0,15,46.45,155,na,STK17B,CCDS2315.1,c.631C>T,p.P211S,UNKNOWN 58179126,PD6784a,2,197010769,C,A,Sub,0,61,21,500,na,STK17B,CCDS2315.1,c.346G>T,p.G116*,UNKNOWN 
58105294,PD6944a,2,197010774,T,A,Sub,0,60,32.2,500,na,STK17B,CCDS2315.1,c.341C>T,p.A114V,UNKNOWN 58194433,PD6881a,2,197028105,C,T,Sub,0,44,10.27,224,na,STK17B,CCDS2315.1,c.3G>A,p.M1I,UNKNOWN 68081882,PD8647a,2,198257193,A,G,Sub,0,111,13.01,123,na,SF3B1,CCDS33356.1,c.3757-8T>C,p.?,UNKNOWN 58095548,PD6100a,2,198257766,G,A,Sub,0,85,10.58,189,na,SF3B1,CCDS33356.1,c.3686C>T,p.A1229V,POSSIBLE ONCOGENIC 58099856,PD6888a,2,198257782,G,T,Sub,0,76,13.81,239,na,SF3B1,CCDS33356.1,c.3670C>A,p.P1224T,POSSIBLE ONCOGENIC 58116904,PD6239a,2,198260883,C,T,Sub,0,302,34.25,438,na,SF3B1,CCDS33356.1,c.3436G>A,p.G1146R,POSSIBLE ONCOGENIC 58080289,PD7002a,2,198266494,T,C,Sub,0,220,18.3,306,na,SF3B1,CCDS33356.1,c.2342A>G,p.D781G,ONCOGENIC 58131353,PD6126a,2,198266494,T,C,Sub,0,220,46.78,295,na,SF3B1,CCDS33356.1,c.2342A>G,p.D781G,ONCOGENIC 58110221,PD6203a,2,198266606,C,G,Sub,0,144,41.07,168,na,SF3B1,CCDS33356.1,c.2230G>C,p.A744P,ONCOGENIC 58174785,PD6145a,2,198266606,C,G,Sub,0,144,33.47,242,na,SF3B1,CCDS33356.1,c.2230G>C,p.A744P,ONCOGENIC 66774910,PD7378a,2,198266713,C,T,Sub,0,78,46.15,130,na,SF3B1,CCDS33356.1,c.2219G>A,p.G740E,ONCOGENIC 58114039,PD6990a,2,198266714,C,T,Sub,1.27,79,42.48,226,na,SF3B1,CCDS33356.1,c.2218G>A,p.G740R,ONCOGENIC 68092946,PD8728a,2,198266810,C,T,Sub,0,70,11.05,190,na,SF3B1,CCDS33356.1,c.2122G>A,p.A708T,ONCOGENIC 58141828,PD6500a,2,198266831,C,A,Sub,0,71,35.14,333,na,SF3B1,CCDS33356.1,c.2101G>T,p.V701F,ONCOGENIC 58067469,PD6948a,2,198266834,T,C,Sub,0,69,26.97,356,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58068174,PD6142a,2,198266834,T,C,Sub,0,69,38.55,415,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58068531,PD6540a,2,198266834,T,C,Sub,0,69,31.19,436,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58069360,PD7021a,2,198266834,T,C,Sub,0,69,14.73,353,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58069575,PD6839a,2,198266834,T,C,Sub,0,69,19,442,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
58070164,PD6153a,2,198266834,T,C,Sub,0,69,20.58,277,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58070916,PD7092a,2,198266834,T,C,Sub,0,69,40.77,444,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58071915,PD6476a,2,198266834,T,C,Sub,0,69,36.41,401,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58073964,PD6891a,2,198266834,T,C,Sub,0,69,14.81,351,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58081504,PD6079a,2,198266834,T,C,Sub,0,69,18.84,467,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58081837,PD7111a,2,198266834,T,C,Sub,0,69,21.56,450,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58084695,PD6304a,2,198266834,T,C,Sub,0,69,30.15,262,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58086068,PD6264a,2,198266834,T,C,Sub,0,69,36.31,336,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58088192,PD7088a,2,198266834,T,C,Sub,0,69,34.93,418,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58088319,PD6483a,2,198266834,T,C,Sub,0,69,39.66,358,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58091330,PD6315a,2,198266834,T,C,Sub,0,69,17.87,470,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58092035,PD6840a,2,198266834,T,C,Sub,0,69,48.7,499,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58094038,PD6779a,2,198266834,T,C,Sub,0,69,49.84,311,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58097007,PD6939a,2,198266834,T,C,Sub,0,69,44.12,485,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58097160,PD7104a,2,198266834,T,C,Sub,0,69,35.47,344,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58097310,PD7108a,2,198266834,T,C,Sub,0,69,19.05,378,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58097906,PD6196a,2,198266834,T,C,Sub,0,69,41.84,380,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58100624,PD6829a,2,198266834,T,C,Sub,0,69,44.89,479,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58101327,PD6133a,2,198266834,T,C,Sub,0,69,53.1,452,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
58101861,PD6295a,2,198266834,T,C,Sub,0,69,28.15,373,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58104230,PD7085a,2,198266834,T,C,Sub,0,69,42.95,461,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58104592,PD6065a,2,198266834,T,C,Sub,0,69,34.35,425,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58108159,PD6298a,2,198266834,T,C,Sub,0,69,11.11,342,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58108501,PD7000a,2,198266834,T,C,Sub,0,69,45.91,416,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58111942,PD6991a,2,198266834,T,C,Sub,0,69,12.25,408,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58113216,PD6262a,2,198266834,T,C,Sub,0,69,40.62,325,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58114547,PD6287a,2,198266834,T,C,Sub,0,69,37.24,384,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58116072,PD7072a,2,198266834,T,C,Sub,0,69,41.15,469,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58117457,PD6120a,2,198266834,T,C,Sub,0,69,47.65,426,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58117814,PD6522a,2,198266834,T,C,Sub,0,69,13.07,375,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58118623,PD7098a,2,198266834,T,C,Sub,0,69,46.59,410,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58120690,PD6085a,2,198266834,T,C,Sub,0,69,40.34,414,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58121040,PD6265a,2,198266834,T,C,Sub,0,69,13.78,421,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58126853,PD6151a,2,198266834,T,C,Sub,0,69,28.57,238,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58127969,PD6074a,2,198266834,T,C,Sub,0,69,47.27,476,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58128851,PD7011a,2,198266834,T,C,Sub,0,69,15.85,410,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58131053,PD6993a,2,198266834,T,C,Sub,0,69,10.84,452,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58133511,PD6837a,2,198266834,T,C,Sub,0,69,44.6,426,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
58133718,PD6163a,2,198266834,T,C,Sub,0,69,48.48,330,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58134035,PD6094a,2,198266834,T,C,Sub,0,69,22.88,437,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58135384,PD6247a,2,198266834,T,C,Sub,0,69,35.76,316,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58137215,PD6092a,2,198266834,T,C,Sub,0,69,47.02,436,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58137852,PD6796a,2,198266834,T,C,Sub,0,69,44.02,393,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58140473,PD6538a,2,198266834,T,C,Sub,0,69,37.31,394,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58141616,PD6989a,2,198266834,T,C,Sub,0,69,33.02,421,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58143730,PD6812a,2,198266834,T,C,Sub,0,69,46.59,440,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58143921,PD6835a,2,198266834,T,C,Sub,0,69,42.86,469,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58144727,PD6497a,2,198266834,T,C,Sub,0,69,34.85,396,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58145157,PD6225a,2,198266834,T,C,Sub,0,69,42.21,398,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58145347,PD5725a,2,198266834,T,C,Sub,0,69,43.97,398,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58145917,PD6144a,2,198266834,T,C,Sub,0,69,39.86,419,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58147102,PD6070a,2,198266834,T,C,Sub,0,69,39.86,434,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58147815,PD6850a,2,198266834,T,C,Sub,0,69,12.59,413,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58148200,PD7105a,2,198266834,T,C,Sub,0,69,38.16,359,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58150357,PD6338a,2,198266834,T,C,Sub,0,69,17.52,234,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58154491,PD6802a,2,198266834,T,C,Sub,0,69,43.86,399,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58154836,PD6518a,2,198266834,T,C,Sub,0,69,38.15,401,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
58158096,PD6536a,2,198266834,T,C,Sub,0,69,34.94,435,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58160160,PD7024a,2,198266834,T,C,Sub,0,69,18.8,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58160381,PD6140a,2,198266834,T,C,Sub,0,69,39.78,465,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58163088,PD7077a,2,198266834,T,C,Sub,0,69,29.6,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58166657,PD6828a,2,198266834,T,C,Sub,0,69,43.6,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58167789,PD6875a,2,198266834,T,C,Sub,0,69,43.68,380,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58169091,PD7097a,2,198266834,T,C,Sub,0,69,38.68,424,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58169734,PD7073a,2,198266834,T,C,Sub,0,69,40,420,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58170592,PD7022a,2,198266834,T,C,Sub,0,69,18.92,370,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58171950,PD7090a,2,198266834,T,C,Sub,0,69,10,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58172172,PD6967a,2,198266834,T,C,Sub,0,69,43.62,470,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58172858,PD6831a,2,198266834,T,C,Sub,0,69,17.42,465,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58175602,PD6333a,2,198266834,T,C,Sub,0,69,9.27,464,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58176612,PD7075a,2,198266834,T,C,Sub,0,69,30.86,405,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58176800,PD6961a,2,198266834,T,C,Sub,0,69,35.29,357,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58180122,PD5781a,2,198266834,T,C,Sub,0,69,25.36,489,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58181986,PD6931a,2,198266834,T,C,Sub,0,69,38,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58183168,PD6311a,2,198266834,T,C,Sub,0,69,15.47,362,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58188944,PD5780a,2,198266834,T,C,Sub,0,69,21.73,451,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
58189944,PD7084a,2,198266834,T,C,Sub,0,69,38.61,417,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58192155,PD6141a,2,198266834,T,C,Sub,0,69,43.75,480,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58193153,PD6478a,2,198266834,T,C,Sub,0,69,38.04,397,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58194188,PD6826a,2,198266834,T,C,Sub,0,69,50.4,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58195034,PD6905a,2,198266834,T,C,Sub,0,69,21.6,500,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58195662,PD7029a,2,198266834,T,C,Sub,0,69,33.33,426,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58195920,PD6999a,2,198266834,T,C,Sub,0,69,43.93,412,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58197567,PD6873a,2,198266834,T,C,Sub,0,69,45.81,310,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58198072,PD6938a,2,198266834,T,C,Sub,0,69,14.56,412,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58198883,PD7076a,2,198266834,T,C,Sub,0,69,34.52,423,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58202638,PD6300a,2,198266834,T,C,Sub,0,69,40.16,366,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58203615,PD7074a,2,198266834,T,C,Sub,0,69,42.02,445,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58205370,PD6259a,2,198266834,T,C,Sub,0,69,12.14,313,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66822140,PD8733a,2,198266834,T,C,Sub,0,69,17.14,280,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66858121,PD8935a,2,198266834,T,C,Sub,0,69,37.88,198,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66915546,PD9659a,2,198266834,T,C,Sub,0,69,36.46,277,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66931087,PD7381a,2,198266834,T,C,Sub,0,69,37.91,182,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66937310,PD9711a,2,198266834,T,C,Sub,0,69,39.59,245,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 66962453,PD9663a,2,198266834,T,C,Sub,0,69,51.3,230,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 
67012401,PD7390a,2,198266834,T,C,Sub,0,69,38.34,193,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 67027931,PD8732a,2,198266834,T,C,Sub,0,69,52.07,242,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 68081883,PD8647a,2,198266834,T,C,Sub,0,69,42.05,176,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 68196177,PD8645a,2,198266834,T,C,Sub,0,69,36.16,224,na,SF3B1,CCDS33356.1,c.2098A>G,p.K700E,ONCOGENIC 58079005,PD6504a,2,198267359,C,G,Sub,0,176,50,206,na,SF3B1,CCDS33356.1,c.1998G>C,p.K666N,ONCOGENIC 58144551,PD6884a,2,198267359,C,G,Sub,0,176,41.05,190,na,SF3B1,CCDS33356.1,c.1998G>C,p.K666N,ONCOGENIC 58152399,PD5746a,2,198267359,C,G,Sub,0,176,12.5,216,na,SF3B1,CCDS33356.1,c.1998G>C,p.K666N,ONCOGENIC 58178858,PD6481a,2,198267359,C,G,Sub,0,176,46.22,238,na,SF3B1,CCDS33356.1,c.1998G>C,p.K666N,ONCOGENIC 58071274,PD6122a,2,198267360,T,G,Sub,0,180,47.37,266,na,SF3B1,CCDS33356.1,c.1997A>C,p.K666T,ONCOGENIC 58071478,PD6927a,2,198267360,T,C,Sub,0,180,14.46,242,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58093628,PD6158a,2,198267360,T,C,Sub,0,180,40.56,180,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58104084,PD6498a,2,198267360,T,C,Sub,0,180,49.62,260,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58138339,PD6918a,2,198267360,T,C,Sub,0,180,45.5,200,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58147692,PD6059a,2,198267360,T,C,Sub,0,180,43.12,218,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58147981,PD7080a,2,198267360,T,G,Sub,0,180,37.37,198,na,SF3B1,CCDS33356.1,c.1997A>C,p.K666T,ONCOGENIC 58148376,PD7106a,2,198267360,T,C,Sub,0,180,35.41,209,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58148745,PD7102a,2,198267360,T,C,Sub,0,180,33.61,244,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58156891,PD6086a,2,198267360,T,C,Sub,0,180,45.87,218,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58170696,PD6994a,2,198267360,T,C,Sub,0,180,48.51,235,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 
58191156,PD7082a,2,198267360,T,A,Sub,0,180,43.22,236,na,SF3B1,CCDS33356.1,c.1997A>T,p.K666M,ONCOGENIC 58204507,PD6507a,2,198267360,T,G,Sub,0,180,28.42,190,na,SF3B1,CCDS33356.1,c.1997A>C,p.K666T,ONCOGENIC 66971433,PD9661a,2,198267360,T,C,Sub,0,180,41.1,73,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 66997238,PD9660a,2,198267360,T,C,Sub,0,180,26.67,105,na,SF3B1,CCDS33356.1,c.1997A>G,p.K666R,ONCOGENIC 58068331,PD6197a,2,198267361,T,G,Sub,0,180,48.67,226,na,SF3B1,CCDS33356.1,c.1996A>C,p.K666Q,ONCOGENIC 58166193,PD6830a,2,198267361,T,G,Sub,0,180,44.35,230,na,SF3B1,CCDS33356.1,c.1996A>C,p.K666Q,ONCOGENIC 58198313,PD7001a,2,198267361,T,G,Sub,0,180,45.57,237,na,SF3B1,CCDS33356.1,c.1996A>C,p.K666Q,ONCOGENIC 58070341,PD6501a,2,198267371,G,C,Sub,0,183,41.15,192,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58082316,PD7081a,2,198267371,G,T,Sub,0,183,39.15,212,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58089888,PD6934a,2,198267371,G,C,Sub,0,183,33.17,199,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58096112,PD6249a,2,198267371,G,C,Sub,0,183,21.4,215,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58098901,PD6054a,2,198267371,G,C,Sub,0,183,45.92,233,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58135056,PD7079a,2,198267371,G,T,Sub,0,183,50.6,251,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58139977,PD7083a,2,198267371,G,C,Sub,0,183,40.69,204,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58165389,PD6221a,2,198267371,G,C,Sub,0,183,39.36,249,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58168938,PD5779a,2,198267371,G,C,Sub,0,183,21.14,246,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58179403,PD7093a,2,198267371,G,T,Sub,0,183,30.54,298,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58186017,PD6224a,2,198267371,G,C,Sub,0,183,37.91,182,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58186497,PD6062a,2,198267371,G,T,Sub,0,183,42.22,225,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 
58187186,PD6296a,2,198267371,G,T,Sub,0,183,16.41,195,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58194580,PD6107a,2,198267371,G,C,Sub,0,183,34.84,244,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 58202384,PD7078a,2,198267371,G,T,Sub,0,183,37.94,282,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58203905,PD6250a,2,198267371,G,C,Sub,0,183,35.6,191,na,SF3B1,CCDS33356.1,c.1986C>G,p.H662Q,ONCOGENIC 66763376,PD8936a,2,198267371,G,T,Sub,0,183,72.92,48,na,SF3B1,CCDS33356.1,c.1986C>A,p.H662Q,ONCOGENIC 58112574,PD7087a,2,198267373,G,C,Sub,0,185,28.04,214,na,SF3B1,CCDS33356.1,c.1984C>G,p.H662D,ONCOGENIC 58118135,PD7109a,2,198267481,T,C,Sub,0,158,27.04,233,na,SF3B1,CCDS33356.1,c.1876A>G,p.N626D,ONCOGENIC 58077804,PD6521a,2,198267483,C,A,Sub,0,159,18.56,194,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58083651,PD6131a,2,198267483,C,A,Sub,0,159,40,260,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58086730,PD6091a,2,198267483,C,A,Sub,0,159,44.76,248,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58107372,PD6149a,2,198267483,C,A,Sub,0,159,41.29,155,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58150029,PD7100a,2,198267483,C,A,Sub,0,159,28.9,218,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58150951,PD6959a,2,198267483,C,A,Sub,0,159,39.57,230,na,SF3B1,CCDS33356.1,c.1874G>T,p.R625L,ONCOGENIC 58075792,PD6289a,2,198267484,G,C,Sub,0,156,44.1,195,na,SF3B1,CCDS33356.1,c.1873C>G,p.R625G,ONCOGENIC 58082141,PD6148a,2,198267484,G,A,Sub,0,156,41.92,167,na,SF3B1,CCDS33356.1,c.1873C>T,p.R625C,ONCOGENIC 58106801,PD6511a,2,198267484,G,A,Sub,0,156,27.86,201,na,SF3B1,CCDS33356.1,c.1873C>T,p.R625C,ONCOGENIC 58172625,PD6155a,2,198267484,G,A,Sub,0,156,11.49,235,na,SF3B1,CCDS33356.1,c.1873C>T,p.R625C,ONCOGENIC 58084174,PD7086a,2,198267491,C,G,Sub,0,144,39.62,265,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 58125422,PD6490a,2,198267491,C,A,Sub,0.69,144,37.7,244,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 
58126405,PD7095a,2,198267491,C,A,Sub,0.69,144,38.03,284,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58127452,PD7099a,2,198267491,C,A,Sub,0.69,144,35.16,219,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58130629,PD6482a,2,198267491,C,G,Sub,0,144,32.33,232,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 58140336,PD7112a,2,198267491,C,A,Sub,0.69,144,40.32,248,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58144197,PD7091a,2,198267491,C,G,Sub,0,144,41.45,193,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 58155325,PD6165a,2,198267491,C,A,Sub,0.69,144,48.33,180,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58161070,PD6878a,2,198267491,C,G,Sub,0,144,21.17,222,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 58187538,PD6492a,2,198267491,C,A,Sub,0.69,144,37.8,246,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58194071,PD7110a,2,198267491,C,A,Sub,0.69,144,30.09,216,na,SF3B1,CCDS33356.1,c.1866G>T,p.E622D,ONCOGENIC 58195118,PD7071a,2,198267491,C,G,Sub,0,144,44.92,256,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 68092602,PD8648a,2,198267491,C,G,Sub,0,144,28.04,107,na,SF3B1,CCDS33356.1,c.1866G>C,p.E622D,ONCOGENIC 58079775,PD6514a,2,198267705,C,T,Sub,0,41,34.59,185,na,SF3B1,CCDS33356.1,c.1774G>A,p.E592K,ONCOGENIC 58101980,PD6329a,2,198267705,C,T,Sub,0,41,29.97,287,na,SF3B1,CCDS33356.1,c.1774G>A,p.E592K,ONCOGENIC 67016569,PD7386a,2,198273119,G,A,Sub,0,22,17.39,46,na,SF3B1,CCDS33356.1,c.1091C>T,p.A364V,UNKNOWN 58082320,PD7081a,2,209103928,C,A,Sub,0,86,46.56,189,na,IDH1,CCDS2381.1,c.1021G>T,p.A341S,UNKNOWN 58138030,PD6803a,2,209106777,C,T,Sub,0,65,12.69,323,na,IDH1,CCDS2381.1,c.791G>A,p.G264D,POSSIBLE ONCOGENIC 66876014,PD8735a,2,209106777,C,T,Sub,0,65,36.36,110,na,IDH1,CCDS2381.1,c.791G>A,p.G264D,POSSIBLE ONCOGENIC 58143244,PD6841a,2,209108202,A,T,Sub,0,128,46.74,184,na,IDH1,CCDS2381.1,c.647T>A,p.L216Q,UNKNOWN 208443236,PD6098a,2,209108215,-,T,I,0,118,16.0130719,306,5,IDH1,CCDS2381.1,c.633_634insA,p.N213fs*8,UNKNOWN 
58070927,PD7092a,2,209113112,C,T,Sub,0,122,16.4,500,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58078133,PD6517a,2,209113112,C,T,Sub,0,122,11.4,500,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58079780,PD6514a,2,209113112,C,T,Sub,0,122,42.76,456,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58079927,PD6520a,2,209113112,C,T,Sub,0,122,37.79,344,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58138286,PD6823a,2,209113112,C,T,Sub,0,122,12.6,500,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58157390,PD7009a,2,209113112,C,T,Sub,0,122,29.01,393,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 67000092,PD9662a,2,209113112,C,T,Sub,0,122,39.69,320,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 67008685,PD7388a,2,209113112,C,T,Sub,0,122,31.5,254,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 68081886,PD8647a,2,209113112,C,T,Sub,0,122,18.1,232,na,IDH1,CCDS2381.1,c.395G>A,p.R132H,ONCOGENIC 58073095,PD6985a,2,209113113,A,A,Sub,0,122,12.26,155,na,IDH1,CCDS2381.1,c.394C>T,p.R132C,ONCOGENIC 58115297,PD6887a,2,209113113,G,A,Sub,0,122,41.73,393,na,IDH1,CCDS2381.1,c.394C>T,p.R132C,ONCOGENIC 58129758,PD6974a,2,209113113,G,A,Sub,0,122,16.55,417,na,IDH1,CCDS2381.1,c.394C>T,p.R132C,ONCOGENIC 58197402,PD6979a,2,209113113,G,A,Sub,0,122,11.85,211,na,IDH1,CCDS2381.1,c.394C>T,p.R132C,ONCOGENIC 58198809,PD6958a,2,209113113,G,A,Sub,0,122,31.2,500,na,IDH1,CCDS2381.1,c.394C>T,p.R132C,ONCOGENIC 58139984,PD7083a,2,215797431,T,C,Sub,0,77,46.76,417,na,ABCA12,CCDS33372.1,c.7715A>G,p.Y2572C,UNKNOWN 58147093,PD6070a,2,215797431,T,C,Sub,0,77,55.03,358,na,ABCA12,CCDS33372.1,c.7715A>G,p.Y2572C,UNKNOWN 58095557,PD6100a,2,215809813,C,T,Sub,0,81,28.57,119,na,ABCA12,CCDS33372.1,c.7255G>A,p.G2419S,UNKNOWN 58110229,PD6203a,2,215813368,C,A,Sub,0,144,45.4,500,na,ABCA12,CCDS33372.1,c.7056G>T,p.L2352F,UNKNOWN 58128858,PD7011a,2,215818600,C,G,Sub,0,130,36.72,177,na,ABCA12,CCDS33372.1,c.6625G>C,p.E2209Q,UNKNOWN 58128859,PD7011a,2,215818632,G,C,Sub,0,129,37.66,231,na,ABCA12,CCDS33372.1,c.6593C>G,p.T2198S,UNKNOWN 
58104232,PD7085a,2,215818660,T,C,Sub,0,147,50,334,na,ABCA12,CCDS33372.1,c.6565A>G,p.M2189V,UNKNOWN 66937321,PD9711a,2,215818801,G,A,Sub,0,124,47.13,157,na,ABCA12,CCDS33372.1,c.6424C>T,p.R2142C,UNKNOWN 58131741,PD6842a,2,215819972,G,A,Sub,0,100,47.7,457,na,ABCA12,CCDS33372.1,c.6347C>T,p.S2116F,UNKNOWN 58068327,PD6197a,2,215821471,A,G,Sub,0,190,45.37,313,na,ABCA12,CCDS33372.1,c.6149T>C,p.I2050T,UNKNOWN 58105298,PD6944a,2,215821481,G,T,Sub,0,173,72.97,37,na,ABCA12,CCDS33372.1,c.6139G>A,p.A2047T,UNKNOWN 66876015,PD8735a,2,215823020,G,A,Sub,0,160,24.71,174,na,ABCA12,CCDS33372.1,c.6098C>T,p.T2033I,UNKNOWN 58082461,PD6977a,2,215823174,T,C,Sub,0,107,33.33,9,na,ABCA12,CCDS33372.1,c.5944A>G,p.S1982G,UNKNOWN 67016573,PD7386a,2,215838692,G,A,Sub,0,158,10,240,na,ABCA12,CCDS33372.1,c.5543C>T,p.S1848F,UNKNOWN 58071671,PD6110a,2,215840606,T,C,Sub,0,289,56.16,479,na,ABCA12,CCDS33372.1,c.5284A>G,p.M1762V,UNKNOWN 58129408,PD6102a,2,215840606,T,C,Sub,0,289,45.4,500,na,ABCA12,CCDS33372.1,c.5284A>G,p.M1762V,UNKNOWN 58082462,PD6977a,2,215840615,T,C,Sub,0,288,16.81,113,na,ABCA12,CCDS33372.1,c.5275A>G,p.T1759A,UNKNOWN 58093630,PD6158a,2,215848585,T,C,Sub,0,22,49.54,323,na,ABCA12,CCDS33372.1,c.4168A>G,p.M1390V,UNKNOWN 58150366,PD6338a,2,215848589,A,C,Sub,0,21,44.02,234,na,ABCA12,CCDS33372.1,c.4164T>G,p.I1388M,UNKNOWN 58095558,PD6100a,2,215852496,G,A,Sub,0,35,10.57,265,na,ABCA12,CCDS33372.1,c.3851C>T,p.P1284L,UNKNOWN 58154983,PD6491a,2,215854113,T,C,Sub,0,264,46.99,481,na,ABCA12,CCDS33372.1,c.3769A>G,p.I1257V,UNKNOWN 58092312,PD6890a,2,215855454,T,C,Sub,0,96,44.31,492,na,ABCA12,CCDS33372.1,c.3596A>G,p.E1199G,UNKNOWN 58203625,PD7074a,2,215855598,A,T,Sub,0,89,43.6,500,na,ABCA12,CCDS33372.1,c.3452T>A,p.F1151Y,UNKNOWN 58091864,PD6782a,2,215855653,G,A,Sub,0,71,27.6,500,na,ABCA12,CCDS33372.1,c.3397C>T,p.L1133F,UNKNOWN 58173047,PD6233a,2,215862429,C,T,Sub,0,113,50.99,353,na,ABCA12,CCDS33372.1,c.3284G>A,p.R1095Q,UNKNOWN 
58179132,PD6784a,2,215882793,C,T,Sub,0,108,11.8,500,na,ABCA12,CCDS33372.1,c.1721G>A,p.G574E,UNKNOWN 58071121,PD6083a,2,215884074,G,A,Sub,0,97,17.27,417,na,ABCA12,CCDS33372.1,c.1643C>T,p.A548V,UNKNOWN 58092811,PD7028a,2,215884457,T,C,Sub,0,173,52.37,422,na,ABCA12,CCDS33372.1,c.1351A>G,p.K451E,UNKNOWN 58174463,PD5716a,2,215891598,G,T,Sub,0,50,40.45,309,na,ABCA12,CCDS33372.1,c.1126C>A,p.P376T,UNKNOWN 58090033,PD6940a,2,215896600,G,A,Sub,0,88,51.69,474,na,ABCA12,CCDS33372.1,c.1006C>T,p.H336Y,UNKNOWN 58191970,PD6965a,2,215901713,T,C,Sub,0,56,16.88,314,na,ABCA12,CCDS33372.1,c.949A>G,p.K317E,UNKNOWN 58077034,PD6983a,2,215914447,C,G,Sub,0,54,37.5,8,na,ABCA12,CCDS33372.1,c.596G>C,p.W199S,UNKNOWN 67016545,PD7386a,3,105397371,G,A,Sub,0,117,16.67,222,na,CBLB,CCDS2948.1,c.2473C>T,p.P825S,UNKNOWN 66858059,PD8935a,3,105400642,C,T,Sub,0,57,5.13,273,na,CBLB,CCDS2948.1,c.2222G>A,p.C741Y,UNKNOWN 58193968,PD7110a,3,105404254,T,C,Sub,0,102,42.16,472,na,CBLB,CCDS2948.1,c.2111A>G,p.E704G,UNKNOWN 58164788,PD6786a,3,105421300,A,T,Sub,0,53,45.65,46,na,CBLB,CCDS2948.1,c.1597T>A,p.S533T,UNKNOWN 67016546,PD7386a,3,105422878,T,A,Sub,0,79,11.93,109,na,CBLB,CCDS2948.1,c.1547A>T,p.K516I,UNKNOWN 58151971,PD6156a,3,105452950,G,A,Sub,0,142,10.2,500,na,CBLB,CCDS2948.1,c.1106C>T,p.T369I,UNKNOWN 58115034,PD6982a,3,105586379,C,T,Sub,0,247,12.67,300,na,CBLB,CCDS2948.1,c.43G>A,p.G15R,UNKNOWN 207924305,PD6779a,3,128200135,ctt,-,D,0,85,47.36842105,34,2,GATA2,CCDS3049.1,c.1168_1170delAAG,p.K390delK,ONCOGENIC 208263576,PD6492a,3,128200135,ctt,-,D,0,85,16.31578947,179,2,GATA2,CCDS3049.1,c.1168_1170delAAG,p.K390delK,ONCOGENIC 344722935,PD8939a,3,128200135,ctt,-,D,0,82,37.03703704,27,2,GATA2,CCDS3049.1,c.1168_1170delAAG,p.K390delK,ONCOGENIC 58166714,PD6861a,3,128200154,C,T,Sub,0,70,12.88,132,na,GATA2,CCDS3049.1,c.1151G>A,p.R384K,ONCOGENIC 58089759,PD6934a,3,128200155,T,C,Sub,0,70,21.28,47,na,GATA2,CCDS3049.1,c.1150A>G,p.R384G,ONCOGENIC 
58135510,PD6869a,3,128200691,C,T,Sub,0,17,45.71,35,na,GATA2,CCDS3049.1,c.1114G>A,p.A372T,ONCOGENIC 58144455,PD6884a,3,128202770,T,C,Sub,0,47,22.22,27,na,GATA2,CCDS3049.1,c.950A>G,p.N317S,ONCOGENIC 208392909,PD6139a,3,128202804,ag,-,D,0,23,29.03225806,31,3,GATA2,CCDS3049.1,c.915_916delCT,p.W306fs*77,POSSIBLE ONCOGENIC 58144456,PD6884a,3,128202830,T,C,Sub,0,18,20,20,na,GATA2,CCDS3049.1,c.890A>G,p.N297S,POSSIBLE ONCOGENIC 58101063,PD6539a,3,168802822,T,C,Sub,0,214,46.54,434,na,MECOM,CCDS3205.1,c.3031A>G,p.M1011V,UNKNOWN 58096690,PD6889a,3,168806813,G,A,Sub,0,140,43.94,462,na,MECOM,CCDS3205.1,c.2996C>T,p.P999L,UNKNOWN 58090835,PD6261a,3,168806877,G,T,Sub,0,141,44.65,430,na,MECOM,CCDS3205.1,c.2932C>A,p.Q978K,UNKNOWN 66807897,PD7370a,3,168808013,C,T,Sub,0,253,12.86,140,na,MECOM,CCDS3205.1,c.2612G>A,p.S871N,UNKNOWN 58079236,PD6822a,3,168810822,C,T,Sub,0,215,15.2,500,na,MECOM,CCDS3205.1,c.2524G>A,p.A842T,UNKNOWN 66999994,PD9662a,3,168819968,G,A,Sub,0,73,48.76,121,na,MECOM,CCDS3205.1,c.2087C>T,p.A696V,UNKNOWN 66875926,PD8735a,3,168819993,C,T,Sub,0,72,12.16,74,na,MECOM,CCDS3205.1,c.2062G>A,p.A688T,UNKNOWN 67016540,PD7386a,3,168833418,G,A,Sub,0,113,11.89,185,na,MECOM,CCDS3205.1,c.1678C>T,p.R560*,UNKNOWN 58166910,PD6152a,3,168833541,T,C,Sub,0,194,48.59,389,na,MECOM,CCDS3205.1,c.1555A>G,p.M519V,UNKNOWN 66858058,PD8935a,3,168838981,C,A,Sub,0,53,7,100,na,MECOM,CCDS3205.1,c.431G>T,p.S144I,UNKNOWN 58108334,PD6900a,3,168845744,A,C,Sub,0,255,54.6,500,na,MECOM,CCDS3205.1,c.154T>G,p.S52A,UNKNOWN 58115471,PD5738a,4,17963535,G,C,Sub,0,91,18.2,500,na,LCORL,CCDS3425.1,c.421C>G,p.R141G,UNKNOWN 58139717,PD5782a,4,55124952,C,T,Sub,0,216,44.75,362,na,PDGFRA,CCDS3495.1,c.17C>T,p.P6L,UNKNOWN 58082418,PD6977a,4,55127388,A,G,Sub,0,120,45,500,na,PDGFRA,CCDS3495.1,c.176A>G,p.Y59C,UNKNOWN 58105217,PD6944a,4,55130094,C,A,Sub,0,92,77.78,9,na,PDGFRA,CCDS3495.1,c.628G>A,p.A210T,UNKNOWN 58195824,PD6999a,4,55133495,C,T,Sub,0,94,54.19,203,na,PDGFRA,CCDS3495.1,c.799C>T,p.P267S,UNKNOWN 
58191639,PD6781a,4,55133534,G,A,Sub,0,120,38.05,113,na,PDGFRA,CCDS3495.1,c.838G>A,p.A280T,UNKNOWN 67016454,PD7386a,4,55141053,C,T,Sub,0,49,10.34,58,na,PDGFRA,CCDS3495.1,c.1699C>T,p.P567S,UNKNOWN 58070095,PD6153a,4,55146541,A,G,Sub,0,134,46.22,225,na,PDGFRA,CCDS3495.1,c.2215A>G,p.T739A,UNKNOWN 58092395,PD7016a,4,55153679,G,A,Sub,0,34,17.97,423,na,PDGFRA,CCDS3495.1,c.2645G>A,p.G882D,UNKNOWN 58115181,PD6887a,4,55156637,G,T,Sub,0,242,50,352,na,PDGFRA,CCDS3495.1,c.3038G>T,p.S1013I,UNKNOWN 58201067,PD6324a,4,55156657,A,G,Sub,0,208,50.65,308,na,PDGFRA,CCDS3495.1,c.3058A>G,p.I1020V,UNKNOWN 58118264,PD6081a,4,55161385,G,A,Sub,0,223,13,500,na,PDGFRA,CCDS3495.1,c.3216G>A,p.M1072I,UNKNOWN 58110763,PD6870a,4,55161392,G,A,Sub,0.46,219,49,500,na,PDGFRA,CCDS3495.1,c.3223G>A,p.D1075N,UNKNOWN 58164443,PD6883a,4,55524231,T,C,Sub,0,30,35.63,87,na,KIT,CCDS3496.1,c.50T>C,p.L17P,UNKNOWN 58098448,PD6783a,4,55561701,C,T,Sub,0,116,43.45,313,na,KIT,CCDS3496.1,c.91C>T,p.P31S,POSSIBLE ONCOGENIC 58072949,PD6985a,4,55565874,G,A,Sub,0,129,10.56,464,na,KIT,CCDS3496.1,c.698G>A,p.C233Y,UNKNOWN 68195605,PD8731a,4,55592103,G,A,Sub,0,112,30.87,149,na,KIT,CCDS3496.1,c.1427G>A,p.S476N,UNKNOWN 67027850,PD8732a,4,55594094,G,A,Sub,0,65,5.08,354,na,KIT,CCDS3496.1,c.1879+1G>A,p.?,UNKNOWN 58194280,PD6881a,4,55594177,C,T,Sub,0,53,10.6,500,na,KIT,CCDS3496.1,c.1880C>T,p.P627L,ONCOGENIC 68092815,PD8728a,4,55594273,C,T,Sub,0,20,6.16,146,na,KIT,CCDS3496.1,c.1976C>T,p.A659V,POSSIBLE ONCOGENIC 58164943,PD6228a,4,55595599,C,T,Sub,0,113,53.6,500,na,KIT,CCDS3496.1,c.2089C>T,p.H697Y,ONCOGENIC 58099404,PD6928a,4,55598082,A,T,Sub,0,212,49.24,262,na,KIT,CCDS3496.1,c.2279A>T,p.D760V,POSSIBLE ONCOGENIC 58171711,PD6130a,4,55598084,G,A,Sub,0,216,49.42,257,na,KIT,CCDS3496.1,c.2281G>A,p.E761K,ONCOGENIC 68195830,PD8730a,4,55599295,G,T,Sub,0,244,5.33,150,na,KIT,CCDS3496.1,c.2421G>T,p.K807N,ONCOGENIC 68195607,PD8731a,4,55599321,A,T,Sub,0,267,6.08,181,na,KIT,CCDS3496.1,c.2447A>T,p.D816V,ONCOGENIC 
58149488,PD6818a,4,55603436,G,T,Sub,0,76,45.98,498,na,KIT,CCDS3496.1,c.2792G>T,p.S931I,UNKNOWN 67016456,PD7386a,4,55604658,C,T,Sub,0,108,5.11,274,na,KIT,CCDS3496.1,c.2866C>T,p.R956W,POSSIBLE ONCOGENIC 58105220,PD6944a,4,55604716,G,T,Sub,0,143,66.67,3,na,KIT,CCDS3496.1,c.2924A>T,p.D975V,UNKNOWN 58105223,PD6944a,4,106155124,A,A,Sub,0,141,76.92,13,na,TET2,CCDS47120.1,c.25G>A,p.V9I,UNKNOWN 208504257,PD6850a,4,106155319,g,-,D,0,142,15.64853556,1193,1,TET2,CCDS47120.1,c.220delG,p.V74fs*0,ONCOGENIC 58189362,PD7018a,4,106155406,C,G,Sub,0,111,52.8,500,na,TET2,CCDS47120.1,c.307C>G,p.L103V,UNKNOWN 209602626,PD6787a,4,106155410,c,-,D,0,113,44.11764706,1392,1,TET2,CCDS47120.1,c.311delC,p.S104fs*9,ONCOGENIC 58088436,PD6920a,4,106155428,A,G,Sub,0,114,17.8,500,na,TET2,CCDS47120.1,c.329A>G,p.K110R,UNKNOWN 208478041,PD6080a,4,106155468,-,T,I,,107,31.41762452,1827,1,TET2,CCDS47120.1,c.369_370insT,p.N124fs*0,ONCOGENIC 208395012,PD7072a,4,106155472,t,-,D,,106,35.63791875,1403,2,TET2,CCDS47120.1,c.373delT,p.F125fs*3,ONCOGENIC 208316647,PD6957a,4,106155496,c,-,D,,95,42.55874674,1532,2,TET2,CCDS47120.1,c.397delC,p.P133fs*12,ONCOGENIC 208520142,PD6252a,4,106155605,at,-,D,0,89,30.09307135,961,1,TET2,CCDS47120.1,c.506_507delAT,p.H169fs*6,ONCOGENIC 208409548,PD6960a,4,106155609,c,-,D,,86,39.29393707,1303,1,TET2,CCDS47120.1,c.510delC,p.C171fs*12,ONCOGENIC 207943105,PD6877a,4,106155749,c,-,D,0,122,33.81088825,1392,2,TET2,CCDS47120.1,c.650delC,p.V218fs*32,ONCOGENIC 208111219,PD6812a,4,106155774,c,-,D,,117,37.94212219,1866,2,TET2,CCDS47120.1,c.675delC,p.L226fs*24,ONCOGENIC 343876938,PD7371a,4,106155792,t,-,D,0,115,37.15498938,942,1,TET2,CCDS47120.1,c.693delT,p.Q232fs*18,ONCOGENIC 208520143,PD6252a,4,106155861,t,-,D,0,117,18.68344627,1027,1,TET2,CCDS47120.1,c.762delT,p.Q255fs*38,ONCOGENIC 208348463,PD6203a,4,106155921,c,-,D,,89,35.05798394,1121,1,TET2,CCDS47120.1,c.822delC,p.N275fs*18,ONCOGENIC 
208275847,PD6224a,4,106155921,c,-,D,,89,10.98130841,1284,1,TET2,CCDS47120.1,c.822delC,p.N275fs*18,ONCOGENIC 208221352,PD6225a,4,106155944,-,T,I,,81,86.53653654,1998,1,TET2,CCDS47120.1,c.845_846insT,p.E283fs*0,ONCOGENIC 58078091,PD6517a,4,106155946,G,T,Sub,0,73,43.8,500,na,TET2,CCDS47120.1,c.847G>T,p.E283*,ONCOGENIC 343483775,PD7366a,4,106156043,c,-,D,0,87,65.39634146,655,2,TET2,CCDS47120.1,c.944delC,p.Q317fs*30,ONCOGENIC 207940330,PD6534a,4,106156043,c,-,D,,90,25.17702596,1271,2,TET2,CCDS47120.1,c.944delC,p.Q317fs*30,ONCOGENIC 68195835,PD8730a,4,106156072,C,T,Sub,0,80,72.29,397,na,TET2,CCDS47120.1,c.973C>T,p.Q325*,ONCOGENIC 208212580,PD6310a,4,106156073,aa,-,D,0,80,54.06218656,982,5,TET2,CCDS47120.1,c.974_975delAA,p.K326fs*4,ONCOGENIC 58143360,PD5726a,4,106156120,C,T,Sub,0,65,43.2,500,na,TET2,CCDS47120.1,c.1021C>T,p.Q341*,ONCOGENIC 58135999,PD5743a,4,106156158,T,A,Sub,0,74,25.8,500,na,TET2,CCDS47120.1,c.1059T>A,p.C353*,ONCOGENIC 208302780,PD6245a,4,106156169,g,-,D,0,81,36.0724234,713,1,TET2,CCDS47120.1,c.1070delG,p.S357fs*15,ONCOGENIC 207995407,PD6872a,4,106156202,a,-,D,,89,21.31350682,1614,2,TET2,CCDS47120.1,c.1103delA,p.E368fs*4,ONCOGENIC 58165610,PD7116a,4,106156246,C,T,Sub,0,70,36.6,500,na,TET2,CCDS47120.1,c.1147C>T,p.Q383*,ONCOGENIC 208196670,PD6504a,4,106156263,t,-,D,,75,35.48840478,1423,1,TET2,CCDS47120.1,c.1164delT,p.K389fs*38,ONCOGENIC 207939035,PD6950a,4,106156334,t,-,D,,61,24.12060302,1194,2,TET2,CCDS47120.1,c.1235delT,p.P413fs*14,ONCOGENIC 58096724,PD6889a,4,106156348,C,T,Sub,0,59,87.6,500,na,TET2,CCDS47120.1,c.1249C>T,p.Q417*,ONCOGENIC 208124489,PD6261a,4,106156351,cttc,-,D,0,69,39.34837093,645,2,TET2,CCDS47120.1,c.1252_1255delCTTC,p.P419fs*7,ONCOGENIC 208390766,PD5711a,4,106156354,c,-,D,,65,26.12393682,823,2,TET2,CCDS47120.1,c.1255delC,p.P419fs*8,ONCOGENIC 58119049,PD6080a,4,106156358,C,G,Sub,0,58,23.8,500,na,TET2,CCDS47120.1,c.1259C>G,p.S420*,ONCOGENIC 58140198,PD7112a,4,106156360,G,T,Sub,0,59,90.8,500,na,TET2,CCDS47120.1,c.1261G>T,p.E421*,ONCOGENIC 
209689028,PD6254a,4,106156361,a,-,D,0,62,32.08955224,536,2,TET2,CCDS47120.1,c.1262delA,p.G422fs*5,ONCOGENIC 58099573,PD7008a,4,106156366,A,T,Sub,0,58,35.2,500,na,TET2,CCDS47120.1,c.1267A>T,p.K423*,ONCOGENIC 58078907,PD6504a,4,106156399,C,T,Sub,0,56,45,500,na,TET2,CCDS47120.1,c.1300C>T,p.H434Y,ONCOGENIC 208168737,PD6272a,4,106156454,a,-,D,,86,25.18518519,1080,1,TET2,CCDS47120.1,c.1355delA,p.E452fs*34,ONCOGENIC 58200197,PD5758a,4,106156478,C,T,Sub,0,99,45,500,na,TET2,CCDS47120.1,c.1379C>T,p.S460F,ONCOGENIC 208007431,PD6275a,4,106156498,acac,-,D,0,114,12.92576419,1076,2,TET2,CCDS47120.1,c.1399_1402delACAC,p.T467fs*18,ONCOGENIC 208166576,PD5773a,4,106156579,g,-,D,0,113,78.76106195,1017,3,TET2,CCDS47120.1,c.1480delG,p.T495fs*2,ONCOGENIC 58179454,PD5773a,4,106156579,G,A,Sub,0,113,13.59,206,na,TET2,CCDS47120.1,c.1480G>A,p.G494R,POSSIBLE ONCOGENIC 58147581,PD6059a,4,106156627,G,T,Sub,0,114,42.6,500,na,TET2,CCDS47120.1,c.1528G>T,p.E510*,ONCOGENIC 58185104,PD6485a,4,106156657,G,A,Sub,0,134,49.8,500,na,TET2,CCDS47120.1,c.1558G>A,p.G520S,UNKNOWN 58105224,PD6944a,4,106156661,G,A,Sub,0.76,131,15.49,71,na,TET2,CCDS47120.1,c.1562G>A,p.S521N,UNKNOWN 58077918,PD6877a,4,106156687,C,T,Sub,0,128,12.6,500,na,TET2,CCDS47120.1,c.1588C>T,p.Q530*,ONCOGENIC 58110165,PD6203a,4,106156687,C,T,Sub,0,128,48.2,500,na,TET2,CCDS47120.1,c.1588C>T,p.Q530*,ONCOGENIC 208372079,PD6165a,4,106156692,-,A,I,0,136,44.16553596,1473,0,TET2,CCDS47120.1,c.1593_1594insA,p.L532fs*35,ONCOGENIC 58090719,PD6206a,4,106156729,C,T,Sub,0,143,34.4,500,na,TET2,CCDS47120.1,c.1630C>T,p.R544*,ONCOGENIC 58125190,PD6194a,4,106156729,C,T,Sub,0,143,41.4,500,na,TET2,CCDS47120.1,c.1630C>T,p.R544*,ONCOGENIC 58204184,PD6484a,4,106156729,C,T,Sub,0,143,45.6,500,na,TET2,CCDS47120.1,c.1630C>T,p.R544*,ONCOGENIC 208223995,PD7099a,4,106156745,c,-,D,0,170,31.5647482,1110,1,TET2,CCDS47120.1,c.1646delC,p.T549fs*12,ONCOGENIC 58080397,PD6545a,4,106156747,C,T,Sub,0,163,44.6,500,na,TET2,CCDS47120.1,c.1648C>T,p.R550*,ONCOGENIC 
66791644,PD7364a,4,106156747,C,T,Sub,0,163,46,500,na,TET2,CCDS47120.1,c.1648C>T,p.R550*,ONCOGENIC 67000178,PD9662a,4,106156747,C,T,Sub,0,163,32.6,500,na,TET2,CCDS47120.1,c.1648C>T,p.R550*,ONCOGENIC 58135092,PD7079a,4,106156776,T,A,Sub,0,166,47,500,na,TET2,CCDS47120.1,c.1677T>A,p.Y559*,ONCOGENIC 208526363,PD6296a,4,106156777,-,T,I,0,175,26.3,1000,1,TET2,CCDS47120.1,c.1678_1679insT,p.K561fs*6,ONCOGENIC 208053152,PD6530a,4,106156782,-,AA,I,0,172,41.09090909,820,3,TET2,CCDS47120.1,c.1683_1684insAA,p.P562fs*7,ONCOGENIC 208559172,PD6510a,4,106156841,-,A,I,0,137,52.55775578,1210,0,TET2,CCDS47120.1,c.1742_1743insA,p.N582fs*0,ONCOGENIC 58120897,PD6265a,4,106156875,T,G,Sub,0,152,25,500,na,TET2,CCDS47120.1,c.1776T>G,p.Y592*,ONCOGENIC 343478085,PD7390a,4,106156913,-,A,I,0,152,40.73275862,463,1,TET2,CCDS47120.1,c.1814_1815insA,p.Y605fs*1,ONCOGENIC 216795505,PD5747a,4,106156914,c,-,D,0,149,33.17422434,837,1,TET2,CCDS47120.1,c.1815delC,p.Y605fs*0,ONCOGENIC 207910054,PD6925a,4,106156936,g,-,D,0,131,32.09580838,831,6,TET2,CCDS47120.1,c.1837delG,p.L615fs*24,ONCOGENIC 208245113,PD6281a,4,106156941,-,G,I,0,130,43.22660099,810,6,TET2,CCDS47120.1,c.1842_1843insG,p.L615fs*23,ONCOGENIC 345250233,PD8735a,4,106156941,-,G,I,0,143,13.40388007,567,6,TET2,CCDS47120.1,c.1842_1843insG,p.L615fs*23,ONCOGENIC 58183306,PD6124a,4,106156963,C,T,Sub,0,118,31,500,na,TET2,CCDS47120.1,c.1864C>T,p.Q622*,ONCOGENIC 58139897,PD7083a,4,106157053,C,T,Sub,0,120,36,500,na,TET2,CCDS47120.1,c.1954C>T,p.Q652*,ONCOGENIC 58121069,PD6791a,4,106157119,C,T,Sub,0,151,51.4,500,na,TET2,CCDS47120.1,c.2020C>T,p.Q674*,ONCOGENIC 58073186,PD6941a,4,106157155,A,T,Sub,0,173,22.8,500,na,TET2,CCDS47120.1,c.2056A>T,p.R686*,ONCOGENIC 58148285,PD7106a,4,106157167,C,T,Sub,0,175,31.8,500,na,TET2,CCDS47120.1,c.2068C>T,p.Q690*,ONCOGENIC 58164446,PD6883a,4,106157167,C,T,Sub,0,175,26.6,500,na,TET2,CCDS47120.1,c.2068C>T,p.Q690*,ONCOGENIC 58165411,PD6787a,4,106157167,C,T,Sub,0,175,41.4,500,na,TET2,CCDS47120.1,c.2068C>T,p.Q690*,ONCOGENIC 
216795506,PD5747a,4,106157252,t,-,D,0,114,30.81481481,672,4,TET2,CCDS47120.1,c.2153delT,p.L719fs*32,ONCOGENIC 208452010,PD6819a,4,106157264,-,A,I,0,109,44.20654912,792,2,TET2,CCDS47120.1,c.2165_2166insA,p.P723fs*3,ONCOGENIC 66964071,PD7367a,4,106157306,C,G,Sub,0,94,40.47,341,na,TET2,CCDS47120.1,c.2207C>G,p.S736*,ONCOGENIC 58140870,PD6301a,4,106157329,C,T,Sub,0,82,20.8,500,na,TET2,CCDS47120.1,c.2230C>T,p.Q744*,ONCOGENIC 208163446,PD5738a,4,106157351,-,A,I,0,87,13.00970874,515,3,TET2,CCDS47120.1,c.2252_2253insA,p.N752fs*2,ONCOGENIC 208069636,PD5750a,4,106157368,c,-,D,0,105,65.83850932,481,1,TET2,CCDS47120.1,c.2269delC,p.L757fs*56,ONCOGENIC 58165611,PD7116a,4,106157371,C,T,Sub,0,107,34,500,na,TET2,CCDS47120.1,c.2272C>T,p.Q758*,ONCOGENIC 208217109,PD7031a,4,106157375,-,T,I,0,107,49.58563536,724,4,TET2,CCDS47120.1,c.2276_2277insT,p.P761fs*8,ONCOGENIC 207923738,PD6779a,4,106157376,tt,-,D,0,106,24.24242424,455,4,TET2,CCDS47120.1,c.2277_2278delTT,p.F760fs*8,ONCOGENIC 207987212,PD6258a,4,106157380,c,-,D,0,101,37.03703704,617,2,TET2,CCDS47120.1,c.2281delC,p.P761fs*52,ONCOGENIC 209594106,PD6793a,4,106157408,aa,-,D,0,139,41.05011933,414,3,TET2,CCDS47120.1,c.2309_2310delAA,p.E772fs*8,ONCOGENIC 58204365,PD6507a,4,106157419,T,A,Sub,0,135,50,500,na,TET2,CCDS47120.1,c.2320T>A,p.S774T,POSSIBLE ONCOGENIC 208275865,PD6224a,4,106157450,-,A,I,0,129,54.03930131,916,0,TET2,CCDS47120.1,c.2351_2352insA,p.C784fs*0,ONCOGENIC 208398228,PD6270a,4,106157451,t,-,D,0,129,46.75052411,952,4,TET2,CCDS47120.1,c.2352delT,p.H786fs*27,ONCOGENIC 208275866,PD6224a,4,106157454,-,T,I,0,137,64.77644493,916,4,TET2,CCDS47120.1,c.2355_2356insT,p.H786fs*3,ONCOGENIC 58202366,PD7078a,4,106157467,C,T,Sub,0,135,40.8,500,na,TET2,CCDS47120.1,c.2368C>T,p.Q790*,ONCOGENIC 208076164,PD6521a,4,106157573,c,-,D,0,153,7.142857143,658,1,TET2,CCDS47120.1,c.2474delC,p.S825fs*0,ONCOGENIC 58102141,PD6270a,4,106157573,C,G,Sub,0,148,46.4,500,na,TET2,CCDS47120.1,c.2474C>G,p.S825*,ONCOGENIC 
58190994,PD7082a,4,106157578,G,A,Sub,0,147,45.2,500,na,TET2,CCDS47120.1,c.2479G>A,p.A827T,UNKNOWN 209592174,PD6947a,4,106157589,-,AATA,I,0,155,26.34146341,741,1,TET2,CCDS47120.1,c.2490_2491insAATA,p.Q831fs*16,ONCOGENIC 58141416,PD6192a,4,106157603,C,A,Sub,0,134,42.2,500,na,TET2,CCDS47120.1,c.2504C>A,p.S835*,ONCOGENIC 208077123,PD6539a,4,106157608,a,-,D,0,143,8.131241084,701,2,TET2,CCDS47120.1,c.2509delA,p.N837fs*4,ONCOGENIC 208077124,PD6539a,4,106157632,a,-,D,0,144,17.24137931,725,3,TET2,CCDS47120.1,c.2533delA,p.E846fs*27,ONCOGENIC 208146295,PD6244a,4,106157650,c,-,D,0,125,44.28857715,499,2,TET2,CCDS47120.1,c.2551delC,p.P851fs*22,ONCOGENIC 209563001,PD6124a,4,106157731,c,-,D,0,105,27.25988701,707,1,TET2,CCDS47120.1,c.2632delC,p.L878fs*43,ONCOGENIC 58142141,PD6201a,4,106157745,C,A,Sub,0,100,34.64,407,na,TET2,CCDS47120.1,c.2646C>A,p.C882*,ONCOGENIC 208038954,PD6189a,4,106157764,a,-,D,0,102,36.02150538,925,2,TET2,CCDS47120.1,c.2665delA,p.K889fs*32,ONCOGENIC 344037866,PD7367a,4,106157788,C,-,D,0,104,28.77,511,1,TET2,CCDS47120.1,c.2689delC,p.Q897fs*24,ONCOGENIC 58103809,PD6970a,4,106157827,C,T,Sub,0,112,39,500,na,TET2,CCDS47120.1,c.2728C>T,p.Q910*,ONCOGENIC 66930937,PD7381a,4,106157836,C,T,Sub,0,122,34.89,427,na,TET2,CCDS47120.1,c.2737C>T,p.Q913*,ONCOGENIC 58087469,PD6480a,4,106157845,C,T,Sub,0,127,83.8,500,na,TET2,CCDS47120.1,c.2746C>T,p.Q916*,ONCOGENIC 58121070,PD6791a,4,106157845,C,T,Sub,0,127,29,500,na,TET2,CCDS47120.1,c.2746C>T,p.Q916*,ONCOGENIC 58169598,PD7073a,4,106157845,C,T,Sub,0,127,44.2,500,na,TET2,CCDS47120.1,c.2746C>T,p.Q916*,ONCOGENIC 58184143,PD5776a,4,106157845,C,T,Sub,0,127,38,500,na,TET2,CCDS47120.1,c.2746C>T,p.Q916*,ONCOGENIC 208265120,PD7010a,4,106157893,g,-,D,0,179,6.666666667,420,1,TET2,CCDS47120.1,c.2794delG,p.D932fs*21,ONCOGENIC 66902151,PD8937a,4,106157902,G,T,Sub,0,177,42.77,318,na,TET2,CCDS47120.1,c.2803G>T,p.G935*,ONCOGENIC 58111824,PD6991a,4,106157908,C,G,Sub,0,175,51.2,500,na,TET2,CCDS47120.1,c.2809C>G,p.H937D,UNKNOWN 
208065207,PD6822a,4,106157909,a,-,D,,188,4.109589041,949,1,TET2,CCDS47120.1,c.2810delA,p.H937fs*16,ONCOGENIC 58162786,PD6950a,4,106157995,C,T,Sub,0,177,26.8,500,na,TET2,CCDS47120.1,c.2896C>T,p.Q966*,ONCOGENIC 58184490,PD6876a,4,106157995,C,T,Sub,0,177,23.8,500,na,TET2,CCDS47120.1,c.2896C>T,p.Q966*,ONCOGENIC 66915657,PD9659a,4,106157995,C,T,Sub,0,177,45,500,na,TET2,CCDS47120.1,c.2896C>T,p.Q966*,ONCOGENIC 58108547,PD7000a,4,106158049,G,T,Sub,0,133,39.4,500,na,TET2,CCDS47120.1,c.2950G>T,p.E984*,ONCOGENIC 208409549,PD6960a,4,106158052,c,-,D,0,137,30.20344288,638,2,TET2,CCDS47120.1,c.2953delC,p.P985fs*22,ONCOGENIC 208293943,PD6228a,4,106158064,-,AAGC,I,0,138,25.30933633,795,0,TET2,CCDS47120.1,c.2965_2966insAAGC,p.P989fs*21,ONCOGENIC 208302781,PD6245a,4,106158109,a,-,D,0,130,40.94117647,425,5,TET2,CCDS47120.1,c.3010delA,p.K1005fs*2,ONCOGENIC 207949310,PD6303a,4,106158201,-,T,I,0,116,55.08196721,1218,3,TET2,CCDS47120.1,c.3102_3103insT,p.H1036fs*7,ONCOGENIC 207949311,PD6303a,4,106158204,-,G,I,0,121,13.05418719,1218,0,TET2,CCDS47120.1,c.3105_3106insG,p.H1036fs*7,ONCOGENIC 208548680,PD6339a,4,106158218,-,A,I,0,112,44.36090226,399,1,TET2,CCDS47120.1,c.3119_3120insA,p.F1041fs*2,ONCOGENIC 208526350,PD6296a,4,106158238,a,-,D,0,98,5.979073244,669,1,TET2,CCDS47120.1,c.3139delA,p.T1047fs*8,ONCOGENIC 58166466,PD7006a,4,106158253,A,T,Sub,0,104,26.4,500,na,TET2,CCDS47120.1,c.3154A>T,p.K1052*,ONCOGENIC 58171044,PD6499a,4,106158256,C,T,Sub,0,103,44.4,500,na,TET2,CCDS47120.1,c.3157C>T,p.Q1053*,ONCOGENIC 208077125,PD6539a,4,106158259,g,-,D,0,103,29.30354796,760,1,TET2,CCDS47120.1,c.3160delG,p.V1054fs*0,ONCOGENIC 343679321,PD7377a,4,106158259,-,T,I,0,103,52.31023102,606,1,TET2,CCDS47120.1,c.3160_3161insT,p.V1056fs*2,ONCOGENIC 58171965,PD7090a,4,106158268,G,T,Sub,0,106,10.2,500,na,TET2,CCDS47120.1,c.3169G>T,p.E1057*,ONCOGENIC 58086949,PD6125a,4,106158301,C,T,Sub,0,100,15.4,500,na,TET2,CCDS47120.1,c.3202C>T,p.Q1068*,ONCOGENIC 
58147188,PD6807a,4,106158362,C,A,Sub,0,114,43,500,na,TET2,CCDS47120.1,c.3263C>A,p.S1088*,ONCOGENIC 207979057,PD6311a,4,106158408,t,-,D,0,119,15.24390244,654,4,TET2,CCDS47120.1,c.3309delT,p.F1104fs*2,ONCOGENIC 343753316,PD8737a,4,106158414,ag,-,D,0,119,10.625,320,2,TET2,CCDS47120.1,c.3315_3316delAG,p.E1106fs*23,ONCOGENIC 208523559,PD6264a,4,106158442,c,-,D,,126,4.814814815,540,2,TET2,CCDS47120.1,c.3343delC,p.P1115fs*2,ONCOGENIC 66763401,PD8936a,4,106158443,C,A,Sub,0,124,10,380,na,TET2,CCDS47120.1,c.3344C>A,p.P1115H,UNKNOWN 208363318,PD6503a,4,106158478,c,-,D,0,148,25.5952381,503,1,TET2,CCDS47120.1,c.3379delC,p.Q1127fs*10,ONCOGENIC 58137915,PD6803a,4,106164004,G,A,Sub,0,113,11.03,263,na,TET2,CCDS47120.1,c.3514G>A,p.G1172S,ONCOGENIC 58079062,PD6523a,4,106164013,A,T,Sub,0,124,55.26,333,na,TET2,CCDS47120.1,c.3523A>T,p.I1175F,ONCOGENIC 58194267,PD6881a,4,106164025,A,G,Sub,0,133,12.68,276,na,TET2,CCDS47120.1,c.3535A>G,p.R1179G,ONCOGENIC 209590703,PD6232a,4,106164035,-,GAA,I,0,152,38.58381503,608,0,TET2,CCDS47120.1,c.3545_3546insGAA,p.Y1182>*,ONCOGENIC 58184491,PD6876a,4,106164059,C,A,Sub,0,142,19.4,500,na,TET2,CCDS47120.1,c.3569C>A,p.S1190Y,POSSIBLE ONCOGENIC 66801924,PD7376a,4,106164071,C,T,Sub,0,144,41.54,390,na,TET2,CCDS47120.1,c.3581C>T,p.P1194L,ONCOGENIC 66848019,PD7377a,4,106164085,G,T,Sub,0,125,25.17,433,na,TET2,CCDS47120.1,c.3594+1G>T,p.?,ONCOGENIC 58087291,PD6282a,4,106164741,C,G,Sub,0,13,51.32,189,na,TET2,CCDS47120.1,c.3609C>G,p.S1203R,POSSIBLE ONCOGENIC 208534335,PD6059a,4,106164758,t,-,D,,17,16.54411765,272,1,TET2,CCDS47120.1,c.3626delT,p.L1209fs*17,ONCOGENIC 207964492,PD5726a,4,106164761,tg,-,D,,17,39.29961089,257,2,TET2,CCDS47120.1,c.3629_3630delTG,p.C1211fs*11,ONCOGENIC 58097795,PD6196a,4,106164763,T,C,Sub,0,17,10.67,253,na,TET2,CCDS47120.1,c.3631T>C,p.C1211R,ONCOGENIC 58103088,PD6834a,4,106164764,G,A,Sub,0,16,43.2,250,na,TET2,CCDS47120.1,c.3632G>A,p.C1211Y,ONCOGENIC 
58090846,PD6261a,4,106164778,C,T,Sub,0,16,43.51,239,na,TET2,CCDS47120.1,c.3646C>T,p.R1216*,ONCOGENIC 58143224,PD6841a,4,106164794,G,A,Sub,0,20,22.96,331,na,TET2,CCDS47120.1,c.3662G>A,p.C1221Y,ONCOGENIC 66783064,PD7371a,4,106164824,T,C,Sub,0,24,46.41,209,na,TET2,CCDS47120.1,c.3692T>C,p.L1231P,POSSIBLE ONCOGENIC 58192029,PD6141a,4,106164829,T,G,Sub,0,24,48.84,389,1,TET2,CCDS47120.1,c.3697T>G,p.W1233G,ONCOGENIC 58130447,PD6314a,4,106164835,G,A,Sub,0,26,41.51,212,na,TET2,CCDS47120.1,c.3703G>A,p.G1235R,POSSIBLE ONCOGENIC 208251876,PD5776a,4,106164840,ccc,-,D,0,37,35.46099291,379,3,TET2,CCDS47120.1,c.3708_3710delCCC,p.I1236_P1237>M,ONCOGENIC 208086941,PD6799a,4,106164847,tc,-,D,0,39,8.873720137,292,2,TET2,CCDS47120.1,c.3715_3716delTC,p.L1240fs*2,ONCOGENIC 58130448,PD6314a,4,106164854,C,A,Sub,0,39,36.53,219,na,TET2,CCDS47120.1,c.3722C>A,p.A1241D,ONCOGENIC 208002287,PD6255a,4,106164862,ct,-,D,0,40,41.69491525,294,2,TET2,CCDS47120.1,c.3730_3731delCT,p.Y1245fs*22,ONCOGENIC 208342599,PD6807a,4,106164862,ct,-,D,0,40,34.46808511,234,2,TET2,CCDS47120.1,c.3730_3731delCT,p.Y1245fs*22,ONCOGENIC 66937340,PD9711a,4,106164875,T,A,Sub,0,38,57.14,224,na,TET2,CCDS47120.1,c.3743T>A,p.L1248H,ONCOGENIC 58141417,PD6192a,4,106164880,G,T,Sub,0,47,22.56,390,na,TET2,CCDS47120.1,c.3748G>T,p.E1250*,ONCOGENIC 208456799,PD7086a,4,106164887,T,-,D,0,52,4.891304348,368,1,TET2,CCDS47120.1,c.3755delT,p.L1252fs*14,ONCOGENIC 58142142,PD6201a,4,106164913,C,T,Sub,0,46,48.65,185,na,TET2,CCDS47120.1,c.3781C>T,p.R1261C,ONCOGENIC 58160613,PD5715a,4,106164914,G,A,Sub,0,42,36.55,249,na,TET2,CCDS47120.1,c.3782G>A,p.R1261H,ONCOGENIC 66866710,PD7387a,4,106164914,G,A,Sub,0,42,41.57,166,na,TET2,CCDS47120.1,c.3782G>A,p.R1261H,ONCOGENIC 343478082,PD7390a,4,106164920,g,-,D,0,40,40.76923077,130,1,TET2,CCDS47120.1,c.3788delG,p.C1263fs*3,ONCOGENIC 58083402,PD6957a,4,106164920,G,A,Sub,0,41,46.24,279,na,TET2,CCDS47120.1,c.3788G>A,p.C1263Y,ONCOGENIC 
58183925,PD6925a,4,106180784,G,A,Sub,0,254,38.63,497,na,TET2,CCDS47120.1,c.3812G>A,p.C1271Y,ONCOGENIC 208083619,PD6334a,4,106180793,ag,-,D,0,279,33.65384615,309,1,TET2,CCDS47120.1,c.3821_3822delAG,p.Q1274fs*25,ONCOGENIC 347424955,PD7387a,4,106180793,ag,-,D,0,277,37.93103448,202,1,TET2,CCDS47120.1,c.3821_3822delAG,p.Q1274fs*25,ONCOGENIC 66866711,PD7387a,4,106180793,A,G,Sub,0,267,7.2,125,na,TET2,CCDS47120.1,c.3821A>G,p.Q1274R,ONCOGENIC 58080398,PD6545a,4,106180816,G,C,Sub,0,258,38.76,485,na,TET2,CCDS47120.1,c.3844G>C,p.G1282R,ONCOGENIC 58120420,PD5740a,4,106180817,G,A,Sub,0,254,44,500,na,TET2,CCDS47120.1,c.3845G>A,p.G1282D,ONCOGENIC 208091500,PD5751a,4,106180824,ctt,-,D,0,246,31.04265403,367,1,TET2,CCDS47120.1,c.3852_3854delCTT,p.F1285delF,ONCOGENIC 345250225,PD8735a,4,106180830,t,-,D,0,216,24.28571429,279,4,TET2,CCDS47120.1,c.3858delT,p.F1287fs*76,ONCOGENIC 58160614,PD5715a,4,106180832,T,C,Sub,0,206,10.2,500,na,TET2,CCDS47120.1,c.3860T>C,p.F1287S,ONCOGENIC 58138640,PD6479a,4,106180835,G,A,Sub,0,203,90.08,252,na,TET2,CCDS47120.1,c.3863G>A,p.G1288D,ONCOGENIC 58154672,PD6808a,4,106180835,G,T,Sub,0,203,34,403,na,TET2,CCDS47120.1,c.3863G>T,p.G1288V,ONCOGENIC 58166467,PD7006a,4,106180835,G,A,Sub,0,203,33.2,500,na,TET2,CCDS47120.1,c.3863G>A,p.G1288D,ONCOGENIC 58110252,PD6266a,4,106180840,T,C,Sub,0,202,46,500,na,TET2,CCDS47120.1,c.3868T>C,p.S1290P,ONCOGENIC 58182126,PD6244a,4,106180845,G,T,Sub,0,190,45.27,338,na,TET2,CCDS47120.1,c.3873G>T,p.W1291C,ONCOGENIC 66951340,PD7391a,4,106180865,G,A,Sub,0,181,16.97,218,na,TET2,CCDS47120.1,c.3893G>A,p.C1298Y,ONCOGENIC 58131486,PD6537a,4,106180870,T,A,Sub,0,184,88.65,414,na,TET2,CCDS47120.1,c.3898T>A,p.F1300I,ONCOGENIC 58103946,PD6498a,4,106180901,A,C,Sub,0,192,45.8,500,na,TET2,CCDS47120.1,c.3929A>C,p.K1310T,POSSIBLE ONCOGENIC 208353948,PD7090a,4,106180909,g,-,D,0,186,24.24778761,563,4,TET2,CCDS47120.1,c.3937delG,p.D1314fs*49,ONCOGENIC 58084121,PD7086a,4,106180926,G,C,Sub,0,171,46.54,462,na,TET2,CCDS47120.1,c.3954G>C,p.E1318D,ONCOGENIC 
66937341,PD9711a,4,106180927,G,A,Sub,0,170,48.25,257,na,TET2,CCDS47120.1,c.3954+1G>A,p.?,ONCOGENIC 58079063,PD6523a,4,106182926,T,A,Sub,0,108,11.19,277,na,TET2,CCDS47120.1,c.3965T>A,p.L1322Q,ONCOGENIC 58104180,PD7085a,4,106182926,T,A,Sub,0,108,24.32,292,na,TET2,CCDS47120.1,c.3965T>A,p.L1322Q,ONCOGENIC 58154673,PD6808a,4,106182926,T,G,Sub,0,108,33.09,275,na,TET2,CCDS47120.1,c.3965T>G,p.L1322R,ONCOGENIC 66875910,PD8735a,4,106182940,C,T,Sub,0,121,23.61,144,na,TET2,CCDS47120.1,c.3979C>T,p.Q1327*,ONCOGENIC 58131278,PD6126a,4,106182956,T,C,Sub,0,123,26.48,253,na,TET2,CCDS47120.1,c.3995T>C,p.L1332P,POSSIBLE ONCOGENIC 58103240,PD6211a,4,106182973,A,T,Sub,0,112,44.33,194,na,TET2,CCDS47120.1,c.4012A>T,p.K1338*,ONCOGENIC 209726586,PD6898a,4,106182979,-,T,I,0,109,31.86440678,295,2,TET2,CCDS47120.1,c.4018_4019insT,p.A1341fs*3,ONCOGENIC 66859596,PD7366a,4,106190765,A,G,Sub,0,298,5.03,298,na,TET2,CCDS47120.1,c.4045-2A>G,p.?,ONCOGENIC 208236949,PD6054a,4,106190782,ag,-,D,0,330,13.40388007,567,2,TET2,CCDS47120.1,c.4060_4061delAG,p.R1354fs*46,ONCOGENIC 58184807,PD6276a,4,106190786,C,T,Sub,0,318,42.34,333,na,TET2,CCDS47120.1,c.4064C>T,p.A1355V,UNKNOWN 58072472,PD7118a,4,106190794,T,G,Sub,0,315,37.2,500,na,TET2,CCDS47120.1,c.4072T>G,p.C1358G,ONCOGENIC 58080242,PD7002a,4,106190794,T,C,Sub,0,315,13.4,500,na,TET2,CCDS47120.1,c.4072T>C,p.C1358R,ONCOGENIC 66964073,PD7367a,4,106190797,C,T,Sub,0,319,17.41,316,na,TET2,CCDS47120.1,c.4075C>T,p.R1359C,ONCOGENIC 58068048,PD6142a,4,106190798,G,A,Sub,0,312,40.6,500,na,TET2,CCDS47120.1,c.4076G>A,p.R1359H,ONCOGENIC 58067870,PD5744a,4,106190830,G,A,Sub,0,253,90.29,453,na,TET2,CCDS47120.1,c.4108G>A,p.G1370R,ONCOGENIC 58200866,PD6288a,4,106190830,G,T,Sub,0,253,42,500,na,TET2,CCDS47120.1,c.4108G>T,p.G1370W,ONCOGENIC 67027847,PD8732a,4,106190848,G,A,Sub,0,217,46.4,500,na,TET2,CCDS47120.1,c.4126G>A,p.D1376N,ONCOGENIC 58128208,PD6325a,4,106190850,C,A,Sub,0,221,17.71,367,na,TET2,CCDS47120.1,c.4128C>A,p.D1376E,ONCOGENIC 
58092839,PD7034a,4,106190852,T,C,Sub,0,215,14.6,500,na,TET2,CCDS47120.1,c.4130T>C,p.F1377S,POSSIBLE ONCOGENIC 58113355,PD6516a,4,106190855,G,A,Sub,0,209,86.2,500,na,TET2,CCDS47120.1,c.4133G>A,p.C1378Y,ONCOGENIC 58143225,PD6841a,4,106190855,G,A,Sub,0,209,12.8,500,na,TET2,CCDS47120.1,c.4133G>A,p.C1378Y,ONCOGENIC 58157814,PD6513a,4,106190858,C,T,Sub,0,202,49.89,461,na,TET2,CCDS47120.1,c.4136C>T,p.A1379V,POSSIBLE ONCOGENIC 58073690,PD6320a,4,106190860,C,T,Sub,0,206,46.13,401,na,TET2,CCDS47120.1,c.4138C>T,p.H1380Y,ONCOGENIC 58099918,PD6272a,4,106190860,C,T,Sub,0,206,32,500,na,TET2,CCDS47120.1,c.4138C>T,p.H1380Y,ONCOGENIC 58128576,PD5724a,4,106190860,C,T,Sub,0,206,44.33,406,na,TET2,CCDS47120.1,c.4138C>T,p.H1380Y,ONCOGENIC 209616798,PD6278a,4,106190876,-,T,I,0,197,41.83673469,392,2,TET2,CCDS47120.1,c.4154_4155insT,p.L1385fs*16,ONCOGENIC 58178209,PD6334a,4,106190882,A,T,Sub,0,170,30.38,293,na,TET2,CCDS47120.1,c.4160A>T,p.N1387I,POSSIBLE ONCOGENIC 58092840,PD7034a,4,106190887,C,T,Sub,0,165,10.09,456,na,TET2,CCDS47120.1,c.4165C>T,p.Q1389*,ONCOGENIC 58128761,PD7011a,4,106190898,C,G,Sub,0,158,27.17,368,na,TET2,CCDS47120.1,c.4176C>G,p.S1392R,POSSIBLE ONCOGENIC 58132699,PD6160a,4,106190899,A,G,Sub,0,156,52.29,480,na,TET2,CCDS47120.1,c.4177A>G,p.T1393A,ONCOGENIC 58187770,PD6845a,4,106190900,C,T,Sub,0,152,11.33,353,na,TET2,CCDS47120.1,c.4178C>T,p.T1393I,ONCOGENIC 208018005,PD6527a,4,106193729,t,-,D,0,196,43.11377246,166,1,TET2,CCDS47120.1,c.4191delT,p.L1398fs*50,ONCOGENIC 58076406,PD6789a,4,106193748,C,T,Sub,0.51,196,26.9,197,na,TET2,CCDS47120.1,c.4210C>T,p.R1404*,ONCOGENIC 58148286,PD7106a,4,106193748,C,T,Sub,0.51,196,29.71,239,na,TET2,CCDS47120.1,c.4210C>T,p.R1404*,ONCOGENIC 66791645,PD7364a,4,106193778,C,T,Sub,0,185,41.89,222,na,TET2,CCDS47120.1,c.4240C>T,p.Q1414*,ONCOGENIC 58078092,PD6517a,4,106193787,G,T,Sub,0,184,39.86,296,na,TET2,CCDS47120.1,c.4249G>T,p.V1417F,ONCOGENIC 
208294593,PD6062a,4,106193788,-,TCTG,I,0,193,38.3908046,389,1,TET2,CCDS47120.1,c.4250_4251insTCTG,p.P1419fs*8,ONCOGENIC 347362171,PD7383a,4,106193849,-,A,I,0,166,25.14619883,342,6,TET2,CCDS47120.1,c.4311_4312insA,p.R1440fs*38,ONCOGENIC 58175178,PD6273a,4,106193850,A,T,Sub,0,149,91.8,500,na,TET2,CCDS47120.1,c.4312A>T,p.K1438*,ONCOGENIC 58076103,PD6281a,4,106193892,C,T,Sub,0,129,45.2,500,na,TET2,CCDS47120.1,c.4354C>T,p.R1452*,ONCOGENIC 58078603,PD6255a,4,106193931,C,T,Sub,0,111,47.6,500,na,TET2,CCDS47120.1,c.4393C>T,p.R1465*,ONCOGENIC 58113962,PD6990a,4,106193931,C,T,Sub,0,111,36.2,500,na,TET2,CCDS47120.1,c.4393C>T,p.R1465*,ONCOGENIC 58119860,PD6848a,4,106193931,C,T,Sub,0,111,23.2,500,na,TET2,CCDS47120.1,c.4393C>T,p.R1465*,ONCOGENIC 58184808,PD6276a,4,106193931,C,T,Sub,0,111,48.12,478,na,TET2,CCDS47120.1,c.4393C>T,p.R1465*,ONCOGENIC 58200867,PD6288a,4,106193931,C,T,Sub,0,111,43.2,500,na,TET2,CCDS47120.1,c.4393C>T,p.R1465*,ONCOGENIC 208424498,PD6332a,4,106193940,aa,-,D,0,110,26.65764547,732,3,TET2,CCDS47120.1,c.4402_4403delAA,p.K1468fs*9,ONCOGENIC 208149016,PD6849a,4,106194011,g,-,D,0,66,37.66891892,592,2,TET2,CCDS47120.1,c.4473delG,p.E1492fs*79,ONCOGENIC 58184145,PD5776a,4,106194013,A,G,Sub,0,63,47,500,na,TET2,CCDS47120.1,c.4475A>G,p.E1492G,POSSIBLE ONCOGENIC 58103811,PD6970a,4,106194019,C,G,Sub,0,65,39.2,500,na,TET2,CCDS47120.1,c.4481C>G,p.S1494*,ONCOGENIC 347173964,PD7375a,4,106194035,aaaa,-,D,0,66,27.57352941,266,4,TET2,CCDS47120.1,c.4497_4500delAAAA,p.Q1501fs*69,ONCOGENIC 58127194,PD5757a,4,106194051,G,A,Sub,0,70,43.24,333,na,TET2,CCDS47120.1,c.4513G>A,p.A1505T,UNKNOWN 58128577,PD5724a,4,106194057,C,T,Sub,0,74,32.43,333,na,TET2,CCDS47120.1,c.4519C>T,p.Q1507*,ONCOGENIC 208071949,PD5740a,4,106194059,g,-,D,0,76,43.43434343,394,2,TET2,CCDS47120.1,c.4521delG,p.A1508fs*63,ONCOGENIC 208463187,PD6522a,4,106196208,t,-,D,0,256,15.59633028,218,4,TET2,CCDS47120.1,c.4541delT,p.L1515fs*56,ONCOGENIC 
58076407,PD6789a,4,106196213,C,T,Sub,0,233,20.57,141,na,TET2,CCDS47120.1,c.4546C>T,p.R1516*,ONCOGENIC 58117204,PD5785a,4,106196213,C,T,Sub,0,233,41.4,186,na,TET2,CCDS47120.1,c.4546C>T,p.R1516*,ONCOGENIC 58171045,PD6499a,4,106196213,C,T,Sub,0,233,44.79,259,na,TET2,CCDS47120.1,c.4546C>T,p.R1516*,ONCOGENIC 345149872,PD7376a,4,106196216,-,T,I,0,232,44.33962264,212,3,TET2,CCDS47120.1,c.4549_4550insT,p.S1518fs*60,ONCOGENIC 208095661,PD6484a,4,106196329,ag,-,D,0,182,35.33424284,720,2,TET2,CCDS47120.1,c.4662_4663delAG,p.E1555fs*22,ONCOGENIC 208452011,PD6819a,4,106196402,-,A,I,0,426,47.62790698,1075,1,TET2,CCDS47120.1,c.4735_4736insA,p.Y1579fs*0,ONCOGENIC 208165581,PD6826a,4,106196402,-,A,I,,426,4.287138584,1003,1,TET2,CCDS47120.1,c.4735_4736insA,p.Y1579fs*0,ONCOGENIC 208386794,PD6155a,4,106196427,-,T,I,0,475,18.43575419,1431,1,TET2,CCDS47120.1,c.4760_4761insT,p.I1588fs*26,ONCOGENIC 209726567,PD6898a,4,106196434,t,-,D,0,479,35.88640275,1159,1,TET2,CCDS47120.1,c.4767delT,p.Y1589fs*0,ONCOGENIC 208433424,PD6834a,4,106196453,a,-,D,0,464,33.1995988,996,2,TET2,CCDS47120.1,c.4786delA,p.N1596fs*14,ONCOGENIC 208476743,PD5786a,4,106196457,tc,-,D,0,459,22.56857855,798,1,TET2,CCDS47120.1,c.4790_4791delTC,p.F1597fs*16,ONCOGENIC 58189530,PD5737a,4,106196461,T,A,Sub,0,447,33,500,na,TET2,CCDS47120.1,c.4794T>A,p.Y1598*,ONCOGENIC 208500346,PD6848a,4,106196483,g,-,D,0,382,27.63295099,957,2,TET2,CCDS47120.1,c.4816delG,p.G1606fs*4,ONCOGENIC 58089628,PD6872a,4,106196491,T,G,Sub,0,360,37.2,500,na,TET2,CCDS47120.1,c.4824T>G,p.Y1608*,ONCOGENIC 58147365,PD6057a,4,106196492,T,A,Sub,0,358,12.81,367,na,TET2,CCDS47120.1,c.4825T>A,p.L1609M,UNKNOWN 208305223,PD6295a,4,106196529,t,-,D,0.330033003,303,23.74100719,1104,4,TET2,CCDS47120.1,c.4862delT,p.L1622fs*0,ONCOGENIC 58141176,PD6189a,4,106196546,C,T,Sub,0,295,10,500,na,TET2,CCDS47120.1,c.4879C>T,p.Q1627*,ONCOGENIC 66915658,PD9659a,4,106196556,C,A,Sub,0,280,37.6,500,na,TET2,CCDS47120.1,c.4889C>A,p.S1630*,ONCOGENIC 
58139898,PD7083a,4,106196650,T,A,Sub,0,247,31.6,500,na,TET2,CCDS47120.1,c.4983T>A,p.Y1661*,ONCOGENIC 208476744,PD5786a,4,106196655,g,-,D,0,281,5.868263473,835,1,TET2,CCDS47120.1,c.4988delG,p.S1663fs*32,ONCOGENIC 208203558,PD6081a,4,106196677,-,A,I,0,307,21.77531207,720,1,TET2,CCDS47120.1,c.5010_5011insA,p.S1671fs*16,ONCOGENIC 66804261,PD7389a,4,106196705,C,T,Sub,0,338,96.6,500,na,TET2,CCDS47120.1,c.5038C>T,p.Q1680*,ONCOGENIC 207949292,PD6303a,4,106196727,ag,-,D,0,342,6.038647343,1241,2,TET2,CCDS47120.1,c.5060_5061delAG,p.S1688fs*4,ONCOGENIC 209555874,PD6789a,4,106196775,-,G,I,0,338,18.39430894,984,3,TET2,CCDS47120.1,c.5108_5109insG,p.D1704fs*9,ONCOGENIC 208018015,PD6527a,4,106196786,-,GCAG,I,0,314,39.37432578,807,1,TET2,CCDS47120.1,c.5119_5120insGCAG,p.S1708fs*6,ONCOGENIC 208324988,PD6266a,4,106196883,-,T,I,0,228,39.9103139,1337,0,TET2,CCDS47120.1,c.5216_5217insT,p.R1739fs*14,ONCOGENIC 208561022,PD6305a,4,106196921,a,-,D,0.386100386,259,30.12633625,1024,5,TET2,CCDS47120.1,c.5254delA,p.N1753fs*10,ONCOGENIC 58102904,PD6232a,4,106196921,A,T,Sub,0,250,53.4,500,na,TET2,CCDS47120.1,c.5254A>T,p.K1752*,ONCOGENIC 58083912,PD7027a,4,106197041,C,T,Sub,0,185,44.8,500,na,TET2,CCDS47120.1,c.5374C>T,p.H1792Y,POSSIBLE ONCOGENIC 58113094,PD6262a,4,106197114,T,G,Sub,0,154,44,500,na,TET2,CCDS47120.1,c.5447T>G,p.L1816*,ONCOGENIC 209617929,PD6107a,4,106197175,gg,-,D,0,164,33.8894682,947,2,TET2,CCDS47120.1,c.5508_5509delGG,p.A1837fs*8,ONCOGENIC 68195608,PD8731a,4,106197221,C,T,Sub,0,126,46.45,437,na,TET2,CCDS47120.1,c.5554C>T,p.Q1852*,ONCOGENIC 208134944,PD6842a,4,106197223,g,-,D,0,132,88.74598071,926,1,TET2,CCDS47120.1,c.5556delG,p.S1853fs*34,ONCOGENIC 66878120,PD8934a,4,106197254,G,A,Sub,0,112,6.52,184,na,TET2,CCDS47120.1,c.5587G>A,p.A1863T,ONCOGENIC 68195609,PD8731a,4,106197285,T,C,Sub,0,101,44.82,473,na,TET2,CCDS47120.1,c.5618T>C,p.I1873T,ONCOGENIC 58082285,PD7081a,4,106197308,C,T,Sub,0,105,42.8,500,na,TET2,CCDS47120.1,c.5641C>T,p.H1881Y,ONCOGENIC 
58118268,PD6081a,4,106197309,A,C,Sub,0,108,29,500,na,TET2,CCDS47120.1,c.5642A>C,p.H1881P,ONCOGENIC 58181890,PD6931a,4,106197317,A,G,Sub,0,112,41.6,500,na,TET2,CCDS47120.1,c.5650A>G,p.T1884A,ONCOGENIC 58204059,PD6280a,4,106197317,A,G,Sub,0,112,43.6,500,na,TET2,CCDS47120.1,c.5650A>G,p.T1884A,ONCOGENIC 58128209,PD6325a,4,106197318,C,T,Sub,0,112,16.2,500,na,TET2,CCDS47120.1,c.5651C>T,p.T1884I,ONCOGENIC 58072473,PD7118a,4,106197347,C,A,Sub,0,123,41.2,500,na,TET2,CCDS47120.1,c.5680C>A,p.P1894T,ONCOGENIC 58147366,PD6057a,4,106197348,C,T,Sub,0,128,16.55,290,na,TET2,CCDS47120.1,c.5681C>T,p.P1894L,ONCOGENIC 58135331,PD6247a,4,106197371,T,C,Sub,0,131,34.2,500,na,TET2,CCDS47120.1,c.5704T>C,p.Y1902H,POSSIBLE ONCOGENIC 58087292,PD6282a,4,106197419,G,T,Sub,0,176,45.8,500,na,TET2,CCDS47120.1,c.5752G>T,p.E1918*,ONCOGENIC 58113095,PD6262a,4,106197491,C,T,Sub,0,199,16.6,500,na,TET2,CCDS47120.1,c.5824C>T,p.Q1942*,ONCOGENIC 58067758,PD6253a,4,106197552,C,T,Sub,0,241,48.6,500,na,TET2,CCDS47120.1,c.5885C>T,p.P1962L,ONCOGENIC 58101757,PD6295a,4,106197552,C,T,Sub,0,241,52.4,500,na,TET2,CCDS47120.1,c.5885C>T,p.P1962L,ONCOGENIC 58156797,PD6086a,4,153244298,G,A,Sub,0,143,53.29,152,na,FBXW7,CCDS3777.1,c.1859C>T,p.P620L,UNKNOWN 58105222,PD6944a,4,153268141,G,A,Sub,0.33,303,86.21,58,na,FBXW7,CCDS3777.1,c.667C>T,p.R223C,UNKNOWN 208324992,PD6266a,4,153332912,-,GAG,I,0,91,51.81488203,960,0,FBXW7,CCDS3777.1,c.43_44insCTC,p.T15_G16insP,UNKNOWN 208353301,PD6187a,4,153332912,-,GAG,I,0,91,47.69488684,1032,0,FBXW7,CCDS3777.1,c.43_44insCTC,p.T15_G16insP,UNKNOWN 208401415,PD6217a,4,153332912,-,GAG,I,0,91,46.52719665,1026,0,FBXW7,CCDS3777.1,c.43_44insCTC,p.T15_G16insP,UNKNOWN 208475793,PD6171a,4,153332912,-,GAG,I,0,91,43.22033898,931,0,FBXW7,CCDS3777.1,c.43_44insCTC,p.T15_G16insP,UNKNOWN 209682139,PD5725a,4,153332912,-,GAG,I,0,91,39.76777939,585,0,FBXW7,CCDS3777.1,c.43_44insCTC,p.T15_G16insP,UNKNOWN 58098573,PD6783a,5,131411498,T,C,Sub,0,31,25.42,59,na,CSF2,CCDS4150.1,c.388T>C,p.F130L,UNKNOWN 
58191510,PD6132a,5,131822056,G,A,Sub,0,17,50,28,na,IRF1,CCDS4155.1,c.554C>T,p.P185L,UNKNOWN 208545853,PD6837a,5,131822294,-,T,I,0,74,91.17647059,34,0,IRF1,CCDS4155.1,c.498_499insA,p.P167fs*42,ONCOGENIC 58139960,PD7083a,5,131822516,G,A,Sub,0,216,18.37,49,na,IRF1,CCDS4155.1,c.385C>T,p.R129*,ONCOGENIC 58139674,PD6898a,5,131822653,C,G,Sub,0,90,46.43,56,na,IRF1,CCDS4155.1,c.357G>C,p.Q119H,UNKNOWN 208185167,PD7075a,5,131822816,-,T,I,0,30,27.27272727,33,1,IRF1,CCDS4155.1,c.193_194insA,p.Y65fs*0,ONCOGENIC 58156373,PD6813a,5,131823626,A,C,Sub,0,81,32.43,37,na,IRF1,CCDS4155.1,c.179T>G,p.I60S,UNKNOWN 58074234,PD5789a,5,131825128,G,A,Sub,0,51,16,25,na,IRF1,CCDS4155.1,c.43C>T,p.Q15*,ONCOGENIC 58165128,PD6176a,5,131895051,G,T,Sub,0,67,44.94,267,na,RAD50,CCDS34233.1,c.205G>T,p.D69Y,UNKNOWN 66948313,PD7373a,5,131911487,G,T,Sub,0,113,79.72,217,na,RAD50,CCDS34233.1,c.232G>T,p.V78L,UNKNOWN 58107831,PD6215a,5,131924399,G,C,Sub,0,112,75.23,218,na,RAD50,CCDS34233.1,c.1072G>C,p.D358H,UNKNOWN 58086444,PD7017a,5,131924572,G,A,Sub,0,148,10.2,500,na,RAD50,CCDS34233.1,c.1245G>A,p.M415I,UNKNOWN 58082049,PD6072a,5,131925462,A,T,Sub,0,142,49.49,487,na,RAD50,CCDS34233.1,c.1385A>T,p.Y462F,UNKNOWN 58169931,PD6075a,5,131925463,T,G,Sub,0,141,47.24,489,na,RAD50,CCDS34233.1,c.1386T>G,p.Y462*,UNKNOWN 58132605,PD6161a,5,131931499,T,A,Sub,0,192,46.13,323,na,RAD50,CCDS34233.1,c.2204T>A,p.M735K,UNKNOWN 58147508,PD6057a,5,131939122,C,T,Sub,0,75,17,500,na,RAD50,CCDS34233.1,c.2338C>T,p.P780S,UNKNOWN 58132760,PD6160a,5,131939162,C,G,Sub,0,98,42.2,500,na,RAD50,CCDS34233.1,c.2378C>G,p.T793R,UNKNOWN 58073105,PD6985a,5,131944310,G,A,Sub,0,12,35.29,17,na,RAD50,CCDS34233.1,c.2722G>A,p.A908T,UNKNOWN 66783214,PD7371a,5,131953812,A,G,Sub,0,54,47.2,286,na,RAD50,CCDS34233.1,c.3215A>G,p.N1072S,UNKNOWN 58073310,PD6941a,5,131973790,A,G,Sub,0,67,49.39,407,na,RAD50,CCDS34233.1,c.3493A>G,p.I1165V,UNKNOWN 68092840,PD8728a,5,131976398,C,T,Sub,0,196,14,500,na,RAD50,CCDS34233.1,c.3653C>T,p.A1218V,UNKNOWN 
58095566,PD6100a,5,131976442,C,T,Sub,0,230,47.6,500,na,RAD50,CCDS34233.1,c.3697C>T,p.P1233S,UNKNOWN 58071753,PD6110a,5,131977943,C,T,Sub,0,200,12.38,105,na,RAD50,CCDS34233.1,c.3826C>T,p.L1276F,UNKNOWN 58100189,PD6137a,5,138160340,A,G,Sub,0,67,48.15,324,na,CTNNA1,CCDS34243.1,c.710A>G,p.Y237C,UNKNOWN 58194366,PD6881a,5,138160378,C,T,Sub,0,71,16.42,335,na,CTNNA1,CCDS34243.1,c.748C>T,p.Q250*,ONCOGENIC 66878242,PD8934a,5,138160443,G,T,Sub,0,87,20.77,130,na,CTNNA1,CCDS34243.1,c.813G>T,p.Q271H,UNKNOWN 68195926,PD8730a,5,138223174,A,G,Sub,0,163,12.27,163,na,CTNNA1,CCDS34243.1,c.1144-5A>G,p.?,UNKNOWN 58178798,PD6481a,5,138266607,C,T,Sub,0,73,50.81,307,na,CTNNA1,CCDS34243.1,c.2281C>T,p.R761C,UNKNOWN 58179168,PD6784a,5,138268378,G,A,Sub,0,107,12,50,na,CTNNA1,CCDS34243.1,c.2410G>A,p.G804S,UNKNOWN 58104611,PD6065a,5,142311683,G,A,Sub,0,58,46.07,382,na,ARHGAP26,CCDS4277.1,c.1100G>A,p.R367Q,UNKNOWN 58181385,PD6816a,5,142437488,C,T,Sub,0,285,13.16,38,na,ARHGAP26,ENST00000443045,c.176C>T,p.A59V,UNKNOWN 58093839,PD6313a,5,142513670,C,T,Sub,0,80,50.12,401,na,ARHGAP26,CCDS4277.1,c.1837C>T,p.Q613*,UNKNOWN 58119450,PD6113a,5,149433979,T,C,Sub,0,31,14.29,140,na,CSF1R,CCDS4302.1,c.2669A>G,p.Q890R,UNKNOWN 58152118,PD6156a,5,149435618,C,T,Sub,0,78,15.52,58,na,CSF1R,CCDS4302.1,c.2525G>A,p.G842D,UNKNOWN 58165465,PD6787a,5,149435624,G,T,Sub,0,87,32.48,157,na,CSF1R,CCDS4302.1,c.2519C>A,p.S840Y,UNKNOWN 58188590,PD6945a,5,149439303,C,T,Sub,0,43,30,20,na,CSF1R,CCDS4302.1,c.2092G>A,p.D698N,UNKNOWN 58159089,PD6532a,5,149439322,C,G,Sub,0,46,50,28,na,CSF1R,CCDS4302.1,c.2073G>C,p.Q691H,UNKNOWN 58092443,PD7016a,5,149447790,G,C,Sub,0,10,30.51,59,na,CSF1R,CCDS4302.1,c.1614C>G,p.Y538*,UNKNOWN 208116606,PD6823a,5,149449864,-,G,I,0,31,21.05263158,38,6,CSF1R,CCDS4302.1,c.1199_1200insC,p.E403fs*37,UNKNOWN 58117829,PD6522a,5,149450031,G,T,Sub,0,15,53.85,78,na,CSF1R,CCDS4302.1,c.1186C>A,p.L396I,UNKNOWN 58070956,PD7092a,5,149452993,C,T,Sub,0,26,11.8,500,na,CSF1R,CCDS4302.1,c.953G>A,p.G318E,UNKNOWN 
58137995,PD6803a,5,149459707,G,A,Sub,0,20,15.79,38,na,CSF1R,CCDS4302.1,c.500C>T,p.A167V,UNKNOWN 58112426,PD7040a,5,170819740,G,A,Sub,0,67,39.3,369,na,NPM1,CCDS4376.1,c.379G>A,p.D127N,UNKNOWN 207900486,PD6286a,5,170837544,-,CTGC,I,0,88,15.03067485,326,0,NPM1,CCDS4376.1,c.860_861insCTGC,p.W288fs*12,ONCOGENIC 208036013,PD6202a,5,170837547,-,TCTG,I,0,89,41.06145251,311,1,NPM1,CCDS4376.1,c.863_864insTCTG,p.W288fs*12,ONCOGENIC 208148445,PD6849a,5,170837547,-,TCTG,I,0,89,38.18681319,312,1,NPM1,CCDS4376.1,c.863_864insTCTG,p.W288fs*12,ONCOGENIC 208288872,PD6861a,5,170837547,-,TCTG,I,0,89,11.89189189,350,1,NPM1,CCDS4376.1,c.863_864insTCTG,p.W288fs*12,ONCOGENIC 208407131,PD7032a,5,170837547,-,TATG,I,0,88,32.39740821,406,0,NPM1,CCDS4376.1,c.863_864insTATG,p.W288fs*12,ONCOGENIC 208508996,PD6135a,5,170837547,-,TCTG,I,0,88,28.18991098,299,1,NPM1,CCDS4376.1,c.863_864insTCTG,p.W288fs*12,ONCOGENIC 209719870,PD6869a,5,170837547,-,CATG,I,0,85,24.20382166,284,0,NPM1,CCDS4376.1,c.863_864insCATG,p.W288fs*12,ONCOGENIC 343926616,PD8734a,5,170837547,-,CATG,I,0,85,36.75213675,109,0,NPM1,CCDS4376.1,c.863_864insCATG,p.W288fs*12,ONCOGENIC 58142196,PD6201a,5,176562124,T,C,Sub,0.35,285,50.5,301,na,NSD1,CCDS4412.1,c.20T>C,p.L7P,UNKNOWN 209722865,PD6934a,5,176562270,-,A,I,0,179,19.1011236,178,2,NSD1,CCDS4412.1,c.166_167insA,p.T56fs*37,UNKNOWN 58179693,PD6780a,5,176562492,C,T,Sub,0,112,12.79,86,na,NSD1,CCDS4412.1,c.388C>T,p.P130S,UNKNOWN 58105311,PD6944a,5,176562742,G,A,Sub,0,50,40.2,500,na,NSD1,CCDS4412.1,c.638G>A,p.S213N,UNKNOWN 58154777,PD6518a,5,176562900,A,G,Sub,0,97,48.55,379,na,NSD1,CCDS4412.1,c.796A>G,p.T266A,UNKNOWN 58100670,PD6829a,5,176618903,T,G,Sub,0,83,45.95,407,na,NSD1,CCDS4412.1,c.946T>G,p.S316A,UNKNOWN 58147502,PD6057a,5,176631136,C,T,Sub,0,72,12.45,233,na,NSD1,CCDS4412.1,c.1079C>T,p.P360L,UNKNOWN 58076727,PD6914a,5,176631183,G,T,Sub,0,59,12.62,206,na,NSD1,CCDS4412.1,c.1126G>T,p.A376S,UNKNOWN 
58150792,PD5755a,5,176636857,C,G,Sub,0,121,50.4,500,na,NSD1,CCDS4412.1,c.1457C>G,p.S486C,UNKNOWN 58128528,PD6956a,5,176636944,C,G,Sub,0,112,48.16,461,na,NSD1,CCDS4412.1,c.1544A>G,p.H515R,UNKNOWN 58191875,PD6965a,5,176636944,A,G,Sub,0,112,67.4,500,na,NSD1,CCDS4412.1,c.1544A>G,p.H515R,UNKNOWN 58115696,PD6278a,5,176636958,G,A,Sub,0,114,51.6,500,na,NSD1,CCDS4412.1,c.1558G>A,p.A520T,UNKNOWN 58141113,PD6987a,5,176637306,A,G,Sub,0,150,51.4,500,na,NSD1,CCDS4412.1,c.1906A>G,p.I636V,UNKNOWN 58197724,PD6788a,5,176637376,A,G,Sub,0,176,50,500,na,NSD1,CCDS4412.1,c.1976A>G,p.D659G,UNKNOWN 66878244,PD8934a,5,176637468,G,A,Sub,0,70,11.26,222,na,NSD1,CCDS4412.1,c.2068G>A,p.A690T,UNKNOWN 58151283,PD6984a,5,176638025,T,A,Sub,0,232,51,500,na,NSD1,CCDS4412.1,c.2625T>A,p.D875E,UNKNOWN 58090257,PD6105a,5,176638050,C,T,Sub,0,225,50.8,500,na,NSD1,CCDS4412.1,c.2650C>T,p.P884S,UNKNOWN 58197725,PD6788a,5,176638293,T,G,Sub,0,142,49.01,453,na,NSD1,CCDS4412.1,c.2893T>G,p.S965A,UNKNOWN 66964179,PD7367a,5,176638312,G,A,Sub,0,143,12.67,221,na,NSD1,CCDS4412.1,c.2912G>A,p.G971D,UNKNOWN 58099814,PD6888a,5,176638497,G,A,Sub,0,270,10.61,424,na,NSD1,CCDS4412.1,c.3097G>A,p.A1033T,UNKNOWN 58091836,PD6782a,5,176638504,C,A,Sub,0,261,12.41,145,na,NSD1,CCDS4412.1,c.3104C>A,p.S1035*,UNKNOWN 58086977,PD6125a,5,176638730,C,A,Sub,0,89,50.6,500,na,NSD1,CCDS4412.1,c.3330C>A,p.F1110L,UNKNOWN 58089705,PD6872a,5,176638761,A,G,Sub,0,69,43.8,500,na,NSD1,CCDS4412.1,c.3361A>G,p.K1121E,UNKNOWN 58171271,PD6862a,5,176639098,G,A,Sub,0,106,36.56,413,na,NSD1,CCDS4412.1,c.3698G>A,p.R1233Q,UNKNOWN 58125243,PD6194a,5,176639178,G,T,Sub,0,86,49.23,323,na,NSD1,CCDS4412.1,c.3778G>T,p.A1260S,UNKNOWN 66763483,PD8936a,5,176665365,C,T,Sub,0,36,15.67,134,na,NSD1,CCDS4412.1,c.4049C>T,p.P1350L,UNKNOWN 58164260,PD6159a,5,176684044,G,A,Sub,0,101,43.2,500,na,NSD1,CCDS4412.1,c.4858G>A,p.V1620I,UNKNOWN 58172087,PD6967a,5,176684044,T,A,Sub,0,101,45.2,500,na,NSD1,CCDS4412.1,c.4858G>A,p.V1620I,UNKNOWN 
58197515,PD6873a,5,176684044,G,A,Sub,0,101,48.6,500,na,NSD1,CCDS4412.1,c.4858G>A,p.V1620I,UNKNOWN 66878245,PD8934a,5,176694614,G,A,Sub,0,323,61.4,500,na,NSD1,CCDS4412.1,c.5198G>A,p.C1733Y,UNKNOWN 58120495,PD5740a,5,176709485,A,G,Sub,0,280,44.4,500,na,NSD1,CCDS4412.1,c.5912A>G,p.Y1971C,UNKNOWN 208015886,PD6110a,5,176719019,g,-,D,0,97,14.5631068,206,2,NSD1,CCDS4412.1,c.6323delG,p.T2109fs*41,UNKNOWN 66878246,PD8934a,5,176719114,A,T,Sub,0,150,10.64,94,na,NSD1,CCDS4412.1,c.6418A>T,p.K2140*,UNKNOWN 58079330,PD6822a,5,176721042,C,T,Sub,0,55,20.91,263,na,NSD1,CCDS4412.1,c.6673C>T,p.P2225S,UNKNOWN 58194364,PD6881a,5,176721108,G,A,Sub,0,28,16.23,302,na,NSD1,CCDS4412.1,c.6739G>A,p.A2247T,UNKNOWN 208452547,PD6050a,5,176721164,-,A,I,0,25,13.33333333,45,6,NSD1,CCDS4412.1,c.6795_6796insA,p.A2268fs*13,UNKNOWN 58118030,PD7038a,5,176721337,C,T,Sub,0,63,12.21,213,na,NSD1,CCDS4412.1,c.6968C>T,p.A2323V,UNKNOWN 58136336,PD6962a,5,176721736,T,C,Sub,0,44,49.42,257,na,NSD1,CCDS4412.1,c.7367T>C,p.M2456T,UNKNOWN 344723052,PD8939a,5,176722186,c,-,D,0,93,7.352941176,68,4,NSD1,CCDS4412.1,c.7817delC,p.P2607fs*12,UNKNOWN 66858136,PD8935a,7,4947047,G,T,Sub,0,111,16.67,30,na,MMD2,CCDS47529.1,c.793C>A,p.Q265K,UNKNOWN 58149300,PD6260a,7,4947082,G,A,Sub,1.75,114,60.29,68,na,MMD2,CCDS47529.1,c.758C>T,p.A253V,UNKNOWN 58138903,PD6814a,7,4947083,C,T,Sub,0,116,11.21,107,na,MMD2,CCDS47529.1,c.757G>A,p.A253T,UNKNOWN 58149841,PD6219a,7,4949371,C,A,Sub,0,67,57.89,19,na,MMD2,ENST00000406755,c.576G>T,p.R192S,UNKNOWN 58200931,PD6288a,7,4949371,C,A,Sub,0,67,56,25,na,MMD2,ENST00000406755,c.576G>T,p.R192S,UNKNOWN 58089077,PD5742a,7,4949651,G,A,Sub,0,102,53.44,189,na,MMD2,CCDS47529.1,c.470C>T,p.T157I,UNKNOWN 58077442,PD6303a,7,4950787,G,T,Sub,0,21,47.37,57,na,MMD2,CCDS47529.1,c.456C>A,p.F152L,UNKNOWN 68196083,PD8645a,7,50367293,C,T,Sub,0,120,33.33,54,na,IKZF1,ENST00000331340,c.100C>T,p.P34S,UNKNOWN 58092489,PD7016a,7,50367446,C,T,Sub,0,59,12.62,103,na,IKZF1,ENST00000440768,c.253C>T,p.P85S,UNKNOWN 
347362293,PD7383a,7,50367473,aaag,-,D,0,39,39.18918919,71,1,IKZF1,ENST00000440768,c.280_283delAAAG,p.E95fs*>25,UNKNOWN 58089843,PD6934a,7,50450378,C,T,Sub,0,19,46.15,13,na,IKZF1,ENST00000331340,c.562C>T,p.L188F,UNKNOWN 58181336,PD6816a,7,50467951,G,A,Sub,0,14,11.02,127,na,IKZF1,ENST00000331340,c.1186G>A,p.D396N,UNKNOWN 58076929,PD6115a,7,50468138,A,C,Sub,0,19,53.35,373,na,IKZF1,ENST00000331340,c.1373A>C,p.Q458P,UNKNOWN 58179734,PD6780a,7,55209985,A,G,Sub,0,235,10.79,380,na,EGFR,CCDS5514.1,c.95A>G,p.Q32R,UNKNOWN 58106751,PD6511a,7,55214397,A,G,Sub,0,57,50.3,165,na,EGFR,CCDS5514.1,c.523A>G,p.N175D,UNKNOWN 58195716,PD6907a,7,55220337,C,T,Sub,0,12,57.14,49,na,EGFR,CCDS5514.1,c.727C>T,p.P243S,UNKNOWN 58164477,PD6883a,7,55221710,C,G,Sub,0,175,43.33,90,na,EGFR,CCDS5514.1,c.754C>G,p.R252G,UNKNOWN 58136739,PD7043a,7,55227950,A,G,Sub,0,91,52.78,216,na,EGFR,CCDS5514.1,c.1417A>G,p.N473D,UNKNOWN 58203375,PD7044a,7,55229263,G,A,Sub,0,39,89.29,84,na,EGFR,CCDS5514.1,c.1570G>A,p.V524I,UNKNOWN 58093753,PD6313a,7,55229294,A,C,Sub,0,45,67.31,104,na,EGFR,CCDS5514.1,c.1601A>C,p.E534A,UNKNOWN 208149117,PD7089a,7,55238010,ga,-,D,0,30,37.64705882,84,2,EGFR,CCDS5515.1,c.1891_1892delGA,p.L633fs*9,UNKNOWN 66822089,PD8733a,7,55241674,A,G,Sub,0,46,26.32,19,na,EGFR,CCDS5514.1,c.2122A>G,p.K708E,UNKNOWN 58151591,PD6112a,7,55259490,C,G,Sub,0,38,45.43,328,na,EGFR,CCDS5514.1,c.2548C>G,p.H850D,UNKNOWN 58127282,PD5757a,7,55266434,C,G,Sub,0,103,10.4,500,na,EGFR,CCDS5514.1,c.2726C>G,p.T909S,UNKNOWN 66878182,PD8934a,7,55268026,G,A,Sub,0,36,20.43,186,na,EGFR,CCDS5514.1,c.2866G>A,p.D956N,UNKNOWN 66964089,PD7367a,7,101671377,G,A,Sub,0,93,10.37,164,na,CUX1,CCDS5721.1,c.142-1G>A,p.?,ONCOGENIC 58073934,PD6891a,7,101740676,C,T,Sub,0,128,43.26,141,na,CUX1,CCDS5721.1,c.301C>T,p.L101F,UNKNOWN 207996087,PD6488a,7,101740714,ag,-,D,0,123,27.23004695,213,2,CUX1,CCDS5721.1,c.339_340delAG,p.N115fs*4,ONCOGENIC 209553411,PD6506a,7,101747619,t,-,D,0,26,37.94642857,222,2,CUX1,CCDS5721.1,c.410delT,p.T138fs*2,ONCOGENIC 
208536515,PD6201a,7,101747630,g,-,D,0,29,9.523809524,262,1,CUX1,CCDS5721.1,c.421delG,p.A141fs*13,ONCOGENIC 66780833,PD8736a,7,101833156,G,A,Sub,0,87,22.34,94,na,CUX1,CCDS5721.1,c.1076+5G>A,p.?,ONCOGENIC 58079502,PD6098a,7,101840091,G,T,Sub,0,125,14,100,na,CUX1,CCDS5721.1,c.1400G>T,p.S467I,UNKNOWN 58086315,PD7017a,7,101843399,C,T,Sub,0,17,13.95,86,na,CUX1,CCDS5721.1,c.2009C>T,p.A670V,UNKNOWN 208039078,PD6189a,7,101844801,-,C,I,0,27,51.1627907,85,5,CUX1,CCDS5721.1,c.2224_2225insC,p.K744fs*29,ONCOGENIC 207915787,PD6283a,7,101844917,c,-,D,0,8,38.02816901,71,6,CUX1,CCDS5721.1,c.2340delC,p.P782fs*26,ONCOGENIC 208256649,PD6318a,7,101844922,-,C,I,0,8,30.55555556,36,6,CUX1,CCDS5721.1,c.2345_2346insC,p.A783fs*31,ONCOGENIC 208099555,PD6792a,7,101845133,-,GGCAGCGGT,I,0,12,34.28571429,28,1,CUX1,CCDS5721.1,c.2556_2557insGGCAGCGGT,p.K852_G853insGSG,ONCOGENIC 208253481,PD6100a,7,101845357,-,C,I,0,39,33.51351351,184,6,CUX1,CCDS5721.1,c.2780_2781insC,p.L930fs*24,ONCOGENIC 209555442,PD6789a,7,101845446,-,G,I,0,29,11.57894737,95,2,CUX1,CCDS5721.1,c.2869_2870insG,p.A957fs*102,ONCOGENIC 58067330,PD6090a,7,101847749,C,T,Sub,0,22,16.67,24,na,CUX1,CCDS5721.1,c.2986C>T,p.R996*,ONCOGENIC 58167764,PD6875a,7,101847788,G,A,Sub,0,26,56,25,na,CUX1,CCDS5721.1,c.3025G>A,p.E1009K,UNKNOWN 208111426,PD6812a,7,101870691,-,C,I,0,34,18.84057971,69,4,CUX1,CCDS5721.1,c.3175_3176insC,p.M1061fs*5,ONCOGENIC 66808028,PD7370a,7,101877403,A,C,Sub,0,55,18.75,16,na,CUX1,CCDS5721.1,c.3505A>C,p.K1169Q,UNKNOWN 58188542,PD6945a,7,101877411,G,A,Sub,0,54,18.99,79,na,CUX1,CCDS5721.1,c.3513G>A,p.W1171*,ONCOGENIC 58169884,PD6075a,7,101877433,C,T,Sub,0,59,93.07,101,na,CUX1,CCDS5721.1,c.3535C>T,p.R1179*,ONCOGENIC 58188543,PD6945a,7,101877433,C,T,Sub,0,59,69.86,73,na,CUX1,CCDS5721.1,c.3535C>T,p.R1179*,ONCOGENIC 58081219,PD5768a,7,101882758,C,T,Sub,0,14,33.33,33,na,CUX1,CCDS5721.1,c.3781C>T,p.R1261*,ONCOGENIC 58179177,PD6784a,7,101916708,G,A,Sub,0,10,10.42,48,na,CUX1,CCDS5720.1,c.1327G>A,p.A443T,UNKNOWN 
58078973,PD6504a,7,101918611,G,A,Sub,0,10,47.62,42,na,CUX1,CCDS5720.1,c.1544G>A,p.R515Q,UNKNOWN 58200233,PD5758a,7,101918611,G,A,Sub,0,10,45.45,55,na,CUX1,CCDS5720.1,c.1544G>A,p.R515Q,UNKNOWN 66976628,PD8939a,7,101925160,C,T,Sub,0,38,50,8,na,CUX1,CCDS5720.1,c.1850C>T,p.A617V,UNKNOWN 58166621,PD6828a,7,104719410,G,T,Sub,0,145,46,500,na,MLL5,CCDS34723.1,c.1248G>T,p.E416D,UNKNOWN 58197428,PD6979a,7,104730466,G,A,Sub,0,143,10.2,49,na,MLL5,CCDS34723.1,c.1369G>A,p.V457M,UNKNOWN 66878185,PD8934a,7,104730585,A,T,Sub,0,91,70.34,118,na,MLL5,CCDS34723.1,c.1488A>T,p.K496N,UNKNOWN 58118379,PD6081a,7,104730587,A,T,Sub,0,88,13.33,90,na,MLL5,CCDS34723.1,c.1490A>T,p.D497V,UNKNOWN 58121186,PD6791a,7,104742595,T,C,Sub,0,65,43.65,488,na,MLL5,CCDS34723.1,c.2150T>C,p.V717A,UNKNOWN 58147831,PD6850a,7,104745964,C,T,Sub,0,176,48,500,na,MLL5,CCDS34723.1,c.2275C>T,p.R759C,UNKNOWN 343926791,PD8734a,7,104746372,-,A,I,0,239,36.57587549,257,2,MLL5,CCDS34723.1,c.2518_2519insA,p.E841fs*11,POSSIBLE ONCOGENIC 58100470,PD7019a,7,104747085,C,T,Sub,0,160,49.6,377,na,MLL5,CCDS34723.1,c.2713C>T,p.P905S,UNKNOWN 58179738,PD6780a,7,104747962,G,A,Sub,0,94,20.69,174,na,MLL5,CCDS34723.1,c.3058G>A,p.V1020M,UNKNOWN 58071056,PD6083a,7,104748347,C,T,Sub,0,98,10.79,241,na,MLL5,CCDS34723.1,c.3443C>T,p.P1148L,UNKNOWN 58071705,PD6110a,7,104748347,C,T,Sub,0,98,37.04,135,na,MLL5,CCDS34723.1,c.3443C>T,p.P1148L,UNKNOWN 58079507,PD6098a,7,104748347,C,T,Sub,0,98,46.32,95,na,MLL5,CCDS34723.1,c.3443C>T,p.P1148L,UNKNOWN 58137457,PD6050a,7,104748347,C,T,Sub,0,98,55.46,119,na,MLL5,CCDS34723.1,c.3443C>T,p.P1148L,UNKNOWN 58135246,PD6084a,7,104752648,A,G,Sub,0,107,47.4,500,na,MLL5,CCDS34723.1,c.4445A>G,p.H1482R,UNKNOWN 66763426,PD8936a,7,104752957,G,A,Sub,0,91,22.22,36,na,MLL5,CCDS34723.1,c.4754G>A,p.G1585E,UNKNOWN 58144497,PD6884a,7,104753178,G,A,Sub,0,114,50.29,171,na,MLL5,CCDS34723.1,c.4975G>A,p.V1659I,UNKNOWN 58095544,PD6100a,7,104753268,C,T,Sub,0,10,47.83,23,na,MLL5,CCDS34723.1,c.5065C>T,p.H1689Y,UNKNOWN 
58177097,PD6524a,7,104753287,A,G,Sub,0,18,48.04,102,na,MLL5,CCDS34723.1,c.5084A>G,p.H1695R,UNKNOWN 58080309,PD7002a,7,104753365,C,T,Sub,0,35,55.21,96,na,MLL5,CCDS34723.1,c.5162C>T,p.P1721L,UNKNOWN 58107319,PD6149a,7,104753365,C,T,Sub,0,35,47.62,105,na,MLL5,CCDS34723.1,c.5162C>T,p.P1721L,UNKNOWN 58176692,PD7075a,7,104753371,C,T,Sub,0,53,43.48,138,na,MLL5,CCDS34723.1,c.5168C>T,p.P1723L,UNKNOWN 58140487,PD6538a,7,104753482,C,T,Sub,0,139,46.74,184,na,MLL5,CCDS34723.1,c.5279C>T,p.P1760L,UNKNOWN 58196789,PD6258a,7,104753626,C,T,Sub,0,77,52.03,123,na,MLL5,CCDS34723.1,c.5423C>T,p.A1808V,UNKNOWN 58070879,PD7092a,7,104753707,G,A,Sub,0,43,15.69,51,na,MLL5,CCDS34723.1,c.5504G>A,p.G1835E,UNKNOWN 58119388,PD6113a,7,104773542,C,A,Sub,0,166,12.39,460,na,SRPK2,CCDS34724.1,c.1553G>T,p.R518L,UNKNOWN 207908102,PD6064a,7,104844186,-,TGG,I,,6,23.67601246,321,0,SRPK2,CCDS34724.1,c.117_118insCCA,p.P39_P40insP,UNKNOWN 58179179,PD6784a,7,105029143,G,T,Sub,0,20,11.29,62,na,SRPK2,CCDS34724.1,c.23C>A,p.A8D,UNKNOWN 58078348,PD5747a,7,105177036,T,C,Sub,0,306,46.45,394,na,RINT1,CCDS34726.1,c.113T>C,p.I38T,UNKNOWN 58117439,PD6120a,7,105182934,A,G,Sub,0,123,50,238,na,RINT1,CCDS34726.1,c.353A>G,p.N118S,UNKNOWN 58106756,PD6511a,7,105187425,C,T,Sub,0,158,46.04,278,na,RINT1,CCDS34726.1,c.584C>T,p.A195V,UNKNOWN 58106757,PD6511a,7,105187710,C,G,Sub,0,249,45.21,334,na,RINT1,CCDS34726.1,c.769C>G,p.P257A,UNKNOWN 67012452,PD7390a,7,105187735,G,C,Sub,0,303,51.09,137,na,RINT1,CCDS34726.1,c.794G>C,p.S265T,UNKNOWN 58140127,PD6135a,7,105187754,T,G,Sub,0,324,50.31,477,na,RINT1,CCDS34726.1,c.813T>G,p.F271L,UNKNOWN 208048702,PD6881a,7,105189057,c,-,D,0,171,17.29106628,346,4,RINT1,CCDS34726.1,c.896delC,p.P300fs*11,UNKNOWN 58179180,PD6784a,7,105190867,G,A,Sub,0,257,11,500,na,RINT1,CCDS34726.1,c.1267G>A,p.A423T,UNKNOWN 66767813,PD8938a,7,105204177,C,T,Sub,0,46,43.38,136,na,RINT1,CCDS34726.1,c.1672-3C>T,p.?,UNKNOWN 58147046,PD6070a,7,105204183,T,C,Sub,0,52,53.09,194,na,RINT1,CCDS34726.1,c.1675T>C,p.F559L,UNKNOWN 
208015130,PD6110a,7,105204353,-,T,I,0,106,10.98726115,628,5,RINT1,CCDS34726.1,c.1845_1846insT,p.R616fs*0,UNKNOWN 58127284,PD5757a,7,105207667,C,T,Sub,0,111,42.71,480,na,RINT1,CCDS34726.1,c.2288C>T,p.A763V,UNKNOWN 58134503,PD6906a,7,105207713,G,C,Sub,0,127,15.86,353,na,RINT1,CCDS34726.1,c.2334G>C,p.E778D,UNKNOWN 58173874,PD6951a,7,105251037,C,A,Sub,0,90,48.98,147,na,ATXN7L1,CCDS47682.1,c.2486G>T,p.S829I,UNKNOWN 58140295,PD7112a,7,105254232,G,A,Sub,0,58,48.78,41,na,ATXN7L1,ENST00000388807,c.2207C>T,p.A736V,UNKNOWN 58111555,PD6968a,7,105254350,C,T,Sub,0,97,38.66,119,na,ATXN7L1,CCDS47682.1,c.2431G>A,p.A811T,UNKNOWN 345150038,PD7376a,7,105254801,-,GAG,I,0,321,18.18181818,77,8,ATXN7L1,CCDS47682.1,c.1979_1980insCTC,p.S661_L662insS,UNKNOWN 58139432,PD6865a,7,105254971,C,T,Sub,0,194,52.49,181,na,ATXN7L1,CCDS47682.1,c.1810G>A,p.V604M,UNKNOWN 58076446,PD6789a,7,105254974,C,T,Sub,0,190,47.74,155,na,ATXN7L1,CCDS47682.1,c.1807G>A,p.A603T,UNKNOWN 58079513,PD6098a,7,105255018,G,A,Sub,0,127,14.02,435,na,ATXN7L1,CCDS47682.1,c.1763C>T,p.A588V,UNKNOWN 58115303,PD6887a,7,105255165,C,T,Sub,0,27,43.82,178,na,ATXN7L1,CCDS47682.1,c.1616G>A,p.S539N,UNKNOWN 58068851,PD5735a,7,105255172,T,G,Sub,0,27,55.12,205,na,ATXN7L1,CCDS47682.1,c.1609A>C,p.N537H,UNKNOWN 58069524,PD6839a,7,105278887,G,C,Sub,0,500,50.79,380,na,ATXN7L1,CCDS47682.1,c.1115C>G,p.S372C,UNKNOWN 58071708,PD6110a,7,105305513,C,A,Sub,0,45,15.79,38,na,ATXN7L1,CCDS47682.1,c.578G>T,p.C193F,UNKNOWN 58082571,PD6825a,7,105305682,A,T,Sub,0,66,57.75,71,na,ATXN7L1,CCDS47682.1,c.409T>A,p.S137T,UNKNOWN 58205459,PD6178a,7,105305703,G,C,Sub,0,58,42,50,na,ATXN7L1,CCDS47682.1,c.388C>G,p.P130A,UNKNOWN 58114342,PD6166a,7,105516317,A,G,Sub,0,11,44.83,145,na,ATXN7L1,CCDS47682.1,c.191T>C,p.L64S,UNKNOWN 58130082,PD6146a,7,116339609,G,C,Sub,0,282,45.37,313,na,MET,CCDS47689.1,c.471G>C,p.E157D,UNKNOWN 58188539,PD6945a,7,116339673,G,A,Sub,0,257,39.88,331,na,MET,CCDS47689.1,c.535G>A,p.A179T,UNKNOWN 
58105284,PD6944a,7,116340214,G,T,Sub,0,133,41.38,29,na,MET,CCDS47689.1,c.1076G>T,p.R359L,UNKNOWN 58112151,PD7010a,7,116340246,G,A,Sub,0,136,11.95,385,na,MET,CCDS47689.1,c.1108G>A,p.V370I,UNKNOWN 66951344,PD7391a,7,116371903,G,A,Sub,0,51,47.12,312,na,MET,CCDS47689.1,c.1382G>A,p.R461H,UNKNOWN 58148500,PD6143a,7,116381023,G,A,Sub,0,46,37.61,226,na,MET,CCDS47689.1,c.1645G>A,p.E549K,UNKNOWN 66846605,PD7374a,7,116395456,A,G,Sub,0,209,47.32,298,na,MET,CCDS47689.1,c.1749A>G,p.I583M,UNKNOWN 66763421,PD8936a,7,116397688,C,T,Sub,0,211,24.83,145,na,MET,ENST00000436117,c.1993C>T,p.P665S,UNKNOWN 58179184,PD6784a,7,116397728,C,T,Sub,0,264,10.95,338,na,MET,CCDS47689.1,c.2002C>T,p.P668S,UNKNOWN 66763422,PD8936a,7,116398618,C,A,Sub,0,239,6.71,149,na,MET,CCDS47689.1,c.2208C>A,p.F736L,UNKNOWN 66763423,PD8936a,7,116403177,T,C,Sub,0,153,39.66,237,na,MET,CCDS47689.1,c.2492T>C,p.L831P,UNKNOWN 58181334,PD6816a,7,116411630,G,A,Sub,0,353,29.37,303,na,MET,CCDS47689.1,c.2863G>A,p.G955S,UNKNOWN 58188679,PD5727a,7,116417488,G,A,Sub,0,314,42.25,445,na,MET,CCDS47689.1,c.3359G>A,p.G1120D,UNKNOWN 58091798,PD6782a,7,116423387,C,A,Sub,0,138,26.81,414,na,MET,CCDS47689.1,c.3716C>A,p.A1239D,UNKNOWN 58116526,PD6312a,7,139268484,A,G,Sub,0,103,44.27,131,na,HIPK2,ENST00000342645,c.2744T>C,p.L915P,UNKNOWN 208131779,PD6121a,7,139268536,-,C,I,0,149,40.35087719,171,2,HIPK2,ENST00000342645,c.2691_2692insG,p.N898fs*>22,UNKNOWN 208008561,PD7001a,7,139268538,-,C,I,0,147,59.68992248,128,2,HIPK2,ENST00000342645,c.2689_2690insG,p.N898fs*>22,UNKNOWN 58163126,PD7077a,7,139281533,G,A,Sub,0,16,46.88,128,na,HIPK2,ENST00000406875,c.2647C>T,p.P883S,UNKNOWN 58164301,PD6159a,7,139281628,C,T,Sub,0,16,46.15,26,na,HIPK2,ENST00000406875,c.2552G>A,p.C851Y,UNKNOWN 66859610,PD7366a,7,139288959,G,A,Sub,0,162,46.21,132,na,HIPK2,ENST00000406875,c.2123C>T,p.P708L,UNKNOWN 58203559,PD7074a,7,139299124,C,T,Sub,0,24,49.83,287,na,HIPK2,ENST00000406875,c.1898G>A,p.R633Q,UNKNOWN 
58203827,PD6250a,7,139311476,C,T,Sub,0,146,46.93,375,na,HIPK2,ENST00000406875,c.1490G>A,p.R497Q,UNKNOWN 58111552,PD6968a,7,139415783,T,C,Sub,0,127,43.33,300,na,HIPK2,ENST00000406875,c.1051A>G,p.S351G,UNKNOWN 58092488,PD7016a,7,139415902,G,A,Sub,0,177,11.38,378,na,HIPK2,ENST00000406875,c.932C>T,p.A311V,UNKNOWN 58084519,PD6299a,7,139416089,G,A,Sub,0,220,50,500,na,HIPK2,ENST00000406875,c.745C>T,p.R249W,UNKNOWN 58141759,PD6500a,7,139416356,C,T,Sub,0,204,51.6,500,na,HIPK2,ENST00000406875,c.478G>A,p.G160R,UNKNOWN 58119084,PD6080a,7,139416625,G,A,Sub,0,269,47.6,500,na,HIPK2,ENST00000406875,c.209C>T,p.S70F,UNKNOWN 66971137,PD7379a,7,139416691,C,T,Sub,0.41,241,10.13,158,na,HIPK2,ENST00000406875,c.143G>A,p.S48N,UNKNOWN 343998140,PD8935a,7,139416709,-,CAG,I,0,269,27.91666667,227,1,HIPK2,ENST00000406875,c.124_125insCTG,p.T42_G43insA,UNKNOWN 58117309,PD5785a,7,140453154,T,C,Sub,0,68,43.99,341,na,BRAF,CCDS5863.1,c.1781A>G,p.D594G,ONCOGENIC 58105287,PD6944a,7,140477801,G,A,Sub,0,87,13.85,130,na,BRAF,CCDS5863.1,c.1507G>T,p.G503*,UNKNOWN 58110173,PD6203a,7,140477811,T,A,Sub,0,93,15.88,359,na,BRAF,CCDS5863.1,c.1497A>T,p.K499N,UNKNOWN 58182448,PD6809a,7,140482877,G,A,Sub,0,127,51.57,382,na,BRAF,CCDS5863.1,c.1258C>T,p.P420S,UNKNOWN 58134189,PD6121a,7,140549946,C,T,Sub,0,88,49.8,500,na,BRAF,CCDS5863.1,c.205G>A,p.G69S,UNKNOWN 58080036,PD6843a,7,148506167,A,T,Sub,0,103,33.2,500,na,EZH2,CCDS5891.1,c.2191T>A,p.Y731N,ONCOGENIC 58101825,PD6295a,7,148506167,A,C,Sub,0,103,18.8,500,na,EZH2,CCDS5891.1,c.2191T>G,p.Y731D,ONCOGENIC 207924347,PD6779a,7,148506205,tt,-,D,0,111,9.335219236,706,2,EZH2,CCDS5891.1,c.2152_2153delAA,p.K718fs*12,ONCOGENIC 58067485,PD6948a,7,148506428,G,A,Sub,0,53,27.4,500,na,EZH2,CCDS5891.1,c.2084C>T,p.S695L,ONCOGENIC 58203156,PD7037a,7,148506428,G,A,Sub,0,53,12.6,500,na,EZH2,CCDS5891.1,c.2084C>T,p.S695L,ONCOGENIC 58130291,PD5756a,7,148506443,C,T,Sub,0,53,29.2,500,na,EZH2,CCDS5891.1,c.2069G>A,p.R690H,ONCOGENIC 
58172583,PD6155a,7,148506443,C,T,Sub,0,53,45.6,500,na,EZH2,CCDS5891.1,c.2069G>A,p.R690H,ONCOGENIC 58201719,PD6894a,7,148506443,C,T,Sub,0,53,34.29,315,na,EZH2,CCDS5891.1,c.2069G>A,p.R690H,ONCOGENIC 58130292,PD5756a,7,148506444,G,A,Sub,0,52,56,500,na,EZH2,CCDS5891.1,c.2068C>T,p.R690C,ONCOGENIC 58194325,PD6881a,7,148506461,C,T,Sub,0,46,11.74,494,na,EZH2,CCDS5891.1,c.2051G>A,p.R684H,ONCOGENIC 58184876,PD6276a,7,148506464,G,A,Sub,0,45,47.29,351,na,EZH2,CCDS5891.1,c.2048C>T,p.T683I,ONCOGENIC 58131387,PD6126a,7,148506477,C,T,Sub,0,36,48.05,487,na,EZH2,CCDS5891.1,c.2035G>A,p.V679M,ONCOGENIC 58182142,PD6244a,7,148507445,A,G,Sub,0,14,26.3,346,na,EZH2,CCDS5891.1,c.2009T>C,p.F670S,ONCOGENIC 66858139,PD8935a,7,148507447,G,C,Sub,0,17,46.06,165,na,EZH2,CCDS5891.1,c.2007C>G,p.S669R,ONCOGENIC 58093755,PD6313a,7,148507463,T,C,Sub,0,19,47.41,405,na,EZH2,CCDS5891.1,c.1991A>G,p.D664G,ONCOGENIC 58148113,PD7105a,7,148507473,T,C,Sub,0,20,10.53,494,na,EZH2,CCDS5891.1,c.1981A>G,p.K661E,ONCOGENIC 58129548,PD6980a,7,148507476,C,T,Sub,0,19,43.72,382,na,EZH2,CCDS5891.1,c.1978G>A,p.G660R,ONCOGENIC 58151892,PD6268a,7,148507478,C,T,Sub,0,19,11.62,198,na,EZH2,CCDS5891.1,c.1976G>A,p.R659K,ONCOGENIC 207992381,PD6268a,7,148507478,c,-,D,,20,41.27906977,344,1,EZH2,CCDS5891.1,c.1976delG,p.R659fs*16,ONCOGENIC 68195652,PD8731a,7,148507487,G,A,Sub,0,19,52.52,139,na,EZH2,CCDS5891.1,c.1967C>T,p.A656V,ONCOGENIC 58080637,PD6785a,7,148508788,C,T,Sub,0,62,23,500,na,EZH2,CCDS5891.1,c.1876G>A,p.V626M,ONCOGENIC 66858140,PD8935a,7,148508788,C,T,Sub,0,62,56.76,185,na,EZH2,CCDS5891.1,c.1876G>A,p.V626M,ONCOGENIC 58109635,PD6119a,7,148511068,G,A,Sub,0,108,92.31,208,na,EZH2,CCDS5891.1,c.1834C>T,p.Q612*,ONCOGENIC 207938649,PD6950a,7,148511104,c,-,D,0,129,20.62937063,286,3,EZH2,CCDS5891.1,c.1798delG,p.D600fs*75,ONCOGENIC 208561843,PD6075a,7,148511122,ca,-,D,0,140,84.2377261,381,2,EZH2,CCDS5891.1,c.1779_1780delTG,p.C593fs*4,ONCOGENIC 
66804316,PD7389a,7,148511132,A,C,Sub,0,155,94.27,157,na,EZH2,CCDS5891.1,c.1770T>G,p.C590W,POSSIBLE ONCOGENIC 58171326,PD6862a,7,148511194,G,A,Sub,0,193,20.42,284,na,EZH2,CCDS5891.1,c.1708C>T,p.Q570*,ONCOGENIC 58080981,PD6251a,7,148511220,C,G,Sub,0,174,63.57,140,na,EZH2,CCDS5891.1,c.1682G>C,p.R561P,ONCOGENIC 66786143,PD7384a,7,148511233,C,G,Sub,0,147,52.21,113,na,EZH2,CCDS5891.1,c.1673-4G>C,p.?,POSSIBLE ONCOGENIC 58109827,PD6139a,7,148512096,A,G,Sub,0,69,28.17,355,na,EZH2,CCDS5891.1,c.1582T>C,p.C528R,ONCOGENIC 58092056,PD6840a,7,148514316,G,A,Sub,0,85,43.48,253,na,EZH2,CCDS5891.1,c.1408C>T,p.Q470*,ONCOGENIC 209681742,PD5725a,7,148514369,t,-,D,0,97,38.671875,256,1,EZH2,CCDS5891.1,c.1355delA,p.Y452fs*11,ONCOGENIC 58105270,PD6944a,7,148515050,C,A,Sub,0,117,53.85,13,na,EZH2,CCDS5891.1,c.1159A>T,p.R387W,POSSIBLE ONCOGENIC 58079514,PD6098a,7,148515055,C,G,Sub,0,110,26.76,71,na,EZH2,CCDS5891.1,c.1154G>C,p.S385T,POSSIBLE ONCOGENIC 207911038,PD6806a,7,148515094,-,G,I,0,84,11.71548117,239,5,EZH2,CCDS5891.1,c.1114_1115insC,p.T374fs*3,ONCOGENIC 58094705,PD5718a,7,148515206,C,T,Sub,0,64,23.4,94,na,EZH2,CCDS5891.1,c.1003G>A,p.G335R,POSSIBLE ONCOGENIC 58201421,PD6815a,7,148516722,T,C,Sub,0,139,42.93,417,na,EZH2,CCDS5891.1,c.965A>G,p.N322S,ONCOGENIC 58110175,PD6203a,7,148523591,G,A,Sub,0,117,55,500,na,EZH2,CCDS5891.1,c.862C>T,p.R288*,ONCOGENIC 58163290,PD6214a,7,148523636,G,A,Sub,0,155,94.8,500,na,EZH2,CCDS5891.1,c.817C>T,p.Q273*,ONCOGENIC 208171461,PD6276a,7,148523671,G,T,DI,0,201,45.70765661,256,1,EZH2,-,-,-,ONCOGENIC 58131388,PD6126a,7,148523708,C,A,Sub,0,185,48.4,500,na,EZH2,CCDS5891.1,c.745G>T,p.E249*,ONCOGENIC 68195653,PD8731a,7,148523708,C,T,Sub,0,185,45.89,231,na,EZH2,CCDS5891.1,c.745G>A,p.E249K,ONCOGENIC 58144012,PD6847a,7,148526846,T,C,Sub,0,137,22.93,410,na,EZH2,CCDS5891.1,c.458A>G,p.Y153C,POSSIBLE ONCOGENIC 58101114,PD6539a,7,148526867,A,T,Sub,0,128,69.6,500,na,EZH2,CCDS5891.1,c.437T>A,p.I146N,POSSIBLE ONCOGENIC 
58080039,PD6843a,7,148526870,A,G,Sub,0,128,27.69,455,na,EZH2,CCDS5891.1,c.434T>C,p.F145S,ONCOGENIC 58203157,PD7037a,7,148526930,T,A,Sub,0,105,37.92,298,na,EZH2,CCDS5891.1,c.374A>T,p.E125V,POSSIBLE ONCOGENIC 58202977,PD6946a,7,148529751,C,T,Sub,0,69,91.58,392,na,EZH2,CCDS5891.1,c.338G>A,p.W113*,ONCOGENIC 58082435,PD6977a,7,151845676,C,T,Sub,0,236,94.12,17,na,MLL3,CCDS5931.1,c.13336G>A,p.A4446T,UNKNOWN 58105275,PD6944a,7,151848583,T,T,Sub,0,62,15,60,na,MLL3,CCDS5931.1,c.12610G>A,p.V4204I,UNKNOWN 68092898,PD8728a,7,151851151,C,T,Sub,0,147,49.47,95,na,MLL3,CCDS5931.1,c.12220G>A,p.G4074S,UNKNOWN 58155779,PD5774a,7,151859914,G,A,Sub,0,73,48,500,na,MLL3,CCDS5931.1,c.10748C>T,p.P3583L,UNKNOWN 58136023,PD5743a,7,151860124,G,A,Sub,0,434,51,500,na,MLL3,CCDS5931.1,c.10538C>T,p.P3513L,UNKNOWN 58108892,PD6051a,7,151860169,G,A,Sub,0,427,51.8,500,na,MLL3,CCDS5931.1,c.10493C>T,p.T3498I,UNKNOWN 58097999,PD6231a,7,151860307,G,A,Sub,0,335,51.2,500,na,MLL3,CCDS5931.1,c.10355C>T,p.P3452L,UNKNOWN 58189268,PD6220a,7,151860307,G,A,Sub,0,335,49,500,na,MLL3,CCDS5931.1,c.10355C>T,p.P3452L,UNKNOWN 58077163,PD6827a,7,151860454,C,T,Sub,0,312,55.8,500,na,MLL3,CCDS5931.1,c.10208G>A,p.R3403H,UNKNOWN 66875966,PD8735a,7,151864263,G,A,Sub,0,158,9.8,102,na,MLL3,CCDS5931.1,c.9718C>T,p.Q3240*,POSSIBLE ONCOGENIC 58086324,PD7017a,7,151864368,G,A,Sub,0,157,14,500,na,MLL3,CCDS5931.1,c.9613C>T,p.H3205Y,UNKNOWN 58081318,PD6997a,7,151864451,C,T,Sub,0,98,50.28,177,na,MLL3,CCDS5931.1,c.9530G>A,p.R3177H,UNKNOWN 58176395,PD7113a,7,151864451,C,T,Sub,0,98,57.6,217,na,MLL3,CCDS5931.1,c.9530G>A,p.R3177H,UNKNOWN 58073062,PD6985a,7,151871243,G,G,Sub,0,71,32.16,171,na,MLL3,CCDS5931.1,c.9347T>C,p.L3116P,UNKNOWN 58137942,PD6803a,7,151873885,C,T,Sub,0,221,50.6,500,na,MLL3,CCDS5931.1,c.8653G>A,p.E2885K,UNKNOWN 58154796,PD6518a,7,151874446,C,T,Sub,0,160,56.45,248,na,MLL3,CCDS5931.1,c.8092G>A,p.E2698K,UNKNOWN 58198083,PD6938a,7,151874520,T,C,Sub,0,186,47.55,265,na,MLL3,CCDS5931.1,c.8018A>G,p.D2673G,UNKNOWN 
58186844,PD6184a,7,151875015,G,A,Sub,0,142,11.5,113,na,MLL3,CCDS5931.1,c.7523C>T,p.S2508F,UNKNOWN 66904415,PD7369a,7,151878035,T,C,Sub,0,165,50.52,289,na,MLL3,CCDS5931.1,c.6910A>G,p.M2304V,UNKNOWN 58152049,PD6156a,7,151878506,G,A,Sub,0.53,187,36.31,358,na,MLL3,CCDS5931.1,c.6439C>T,p.Q2147*,POSSIBLE ONCOGENIC 58072374,PD6926a,7,151878646,G,A,Sub,0,133,45.74,411,na,MLL3,CCDS5931.1,c.6299C>T,p.T2100I,UNKNOWN 58138911,PD6814a,7,151878797,G,A,Sub,0,108,16.88,462,na,MLL3,CCDS5931.1,c.6148C>T,p.P2050S,UNKNOWN 58202419,PD7078a,7,151879501,C,A,Sub,0,73,47,500,na,MLL3,CCDS5931.1,c.5444G>T,p.G1815V,UNKNOWN 344755175,PD7374a,7,151882660,c,-,D,0,151,5.642633229,319,1,MLL3,CCDS5931.1,c.5065delG,p.E1689fs*28,POSSIBLE ONCOGENIC 58099774,PD6888a,7,151884485,C,T,Sub,0,96,12.2,369,na,MLL3,CCDS5931.1,c.4870G>A,p.G1624R,UNKNOWN 58179478,PD5773a,7,151884538,G,A,Sub,0,94,47.09,499,na,MLL3,CCDS5931.1,c.4817C>T,p.P1606L,UNKNOWN 58141089,PD6987a,7,151891124,G,A,Sub,0,266,47.34,433,na,MLL3,CCDS5931.1,c.4630C>T,p.P1544S,UNKNOWN 68196089,PD8645a,7,151900134,A,C,Sub,0,147,6,500,na,MLL3,CCDS5931.1,c.3977T>G,p.L1326*,POSSIBLE ONCOGENIC 58161033,PD6878a,7,151902233,A,C,Sub,0,100,50.8,500,na,MLL3,CCDS5931.1,c.3919T>G,p.S1307A,UNKNOWN 344720464,PD8936a,7,151935825,c,-,D,0,126,2.513227513,753,1,MLL3,CCDS5931.1,c.2619delG,p.Q873fs*40,POSSIBLE ONCOGENIC 58133860,PD6058a,7,151945112,C,G,Sub,0,500,16.6,500,na,MLL3,CCDS5931.1,c.2407G>C,p.A803P,UNKNOWN 58075317,PD6871a,7,151945133,T,C,Sub,0,500,18,500,na,MLL3,CCDS5931.1,c.2386A>G,p.M796V,UNKNOWN 58176706,PD7075a,7,151945305,G,C,Sub,0,354,33,500,na,MLL3,CCDS5931.1,c.2214C>G,p.D738E,UNKNOWN 58134209,PD6121a,7,151945604,C,A,Sub,0,238,47.6,500,na,MLL3,CCDS5931.1,c.1915G>T,p.G639C,UNKNOWN 67012472,PD7390a,7,151945696,T,C,Sub,0,183,45.86,133,na,MLL3,CCDS5931.1,c.1823A>G,p.Q608R,UNKNOWN 58105282,PD6944a,7,151947010,C,A,Sub,0,85,72.73,11,na,MLL3,CCDS5931.1,c.1764G>T,p.Q588H,UNKNOWN 
58188343,PD6933a,7,151970890,C,T,Sub,0,449,10.6,500,na,MLL3,ENST00000446791,c.74G>A,p.R25K,UNKNOWN 58145569,PD6855a,7,152008951,A,G,Sub,0,49,49.34,458,na,MLL3,CCDS5931.1,c.671T>C,p.L224P,UNKNOWN 58109535,PD6114a,7,152012343,G,A,Sub,0,245,50,500,na,MLL3,CCDS5931.1,c.470C>T,p.P157L,UNKNOWN 66951389,PD7391a,8,92982959,G,A,Sub,0,49,39.77,88,na,RUNX1T1,CCDS6256.1,c.1466C>T,p.T489M,UNKNOWN 58109898,PD6139a,8,92983013,G,A,Sub,0,79,21.59,227,na,RUNX1T1,CCDS6256.1,c.1412C>T,p.A471V,UNKNOWN 58166144,PD6830a,8,92999181,T,A,Sub,0,170,47.6,500,na,RUNX1T1,CCDS6256.1,c.1011A>T,p.Q337H,UNKNOWN 58076817,PD6914a,8,93003906,G,A,Sub,0,212,16.45,389,na,RUNX1T1,CCDS6256.1,c.952C>T,p.P318S,UNKNOWN 58095608,PD6100a,8,93003974,G,A,Sub,0,157,42.89,464,na,RUNX1T1,CCDS6256.1,c.884C>T,p.P295L,UNKNOWN 58138041,PD6803a,8,93004089,G,A,Sub,0,110,13.92,273,na,RUNX1T1,CCDS6256.1,c.769C>T,p.P257S,UNKNOWN 66876125,PD8735a,8,93026806,C,T,Sub,0,103,11.86,194,na,RUNX1T1,CCDS6256.1,c.468+1G>A,p.?,UNKNOWN 58194402,PD6881a,8,93027009,C,T,Sub,0,64,17.69,130,na,RUNX1T1,CCDS6256.1,c.266G>A,p.G89D,UNKNOWN 66858213,PD8935a,8,93027036,G,A,Sub,0,62,70.21,94,na,RUNX1T1,CCDS6256.1,c.239C>T,p.T80M,UNKNOWN 68092999,PD8728a,8,93029558,G,A,Sub,0,123,10.4,500,na,RUNX1T1,CCDS6256.1,c.122C>T,p.S41L,UNKNOWN 58093376,PD6318a,8,117864824,C,T,Sub,0,161,51.23,285,na,RAD21,CCDS6321.1,c.1285G>A,p.D429N,UNKNOWN 207993312,PD6481a,8,117866694,tt,-,D,0,249,15.49295775,354,4,RAD21,CCDS6321.1,c.950_951delAA,p.K317fs*10,ONCOGENIC 58114228,PD6157a,8,117868423,C,A,Sub,0.37,270,23.13,147,na,RAD21,CCDS6321.1,c.919G>T,p.E307*,ONCOGENIC 58073130,PD6985a,8,117868912,A,T,Sub,0,47,42.27,194,na,RAD21,CCDS6321.1,c.787G>A,p.D263N,UNKNOWN 208007175,PD6275a,8,117869001,-,A,I,0,89,19.32114883,383,2,RAD21,CCDS6321.1,c.697_698insT,p.I234fs*2,ONCOGENIC 208021278,PD6233a,8,117874081,-,T,I,,46,35.64356436,606,1,RAD21,CCDS6321.1,c.372_373insA,p.D125fs*2,ONCOGENIC 58094745,PD5718a,8,117875491,A,G,Sub,0,266,20.6,500,na,RAD21,CCDS6321.1,c.152T>C,p.M51T,UNKNOWN 
66887244,PD7382a,8,128750684,C,T,Sub,0,10,30,60,na,MYC,CCDS6359.2,c.221C>T,p.P74L,UNKNOWN 58089810,PD6934a,8,128752656,G,A,Sub,0,69,50.33,151,na,MYC,CCDS6359.2,c.817G>A,p.D273N,UNKNOWN 58166670,PD6828a,8,128752728,C,G,Sub,0,71,44.3,158,na,MYC,CCDS6359.2,c.889C>G,p.P297A,UNKNOWN 58163847,PD5722a,8,128753197,G,C,Sub,0,40,48.39,155,na,MYC,CCDS6359.2,c.1358G>C,p.C453S,UNKNOWN 58095499,PD6100a,9,5022072,G,A,Sub,0,135,57.02,114,na,JAK2,CCDS6457.1,c.85G>A,p.A29T,UNKNOWN 58119178,PD6080a,9,5029878,A,T,Sub,0,83,46.4,500,na,JAK2,CCDS6457.1,c.322A>T,p.T108S,UNKNOWN 58159474,PD6864a,9,5044417,G,A,Sub,0,161,46.55,391,na,JAK2,CCDS6457.1,c.365G>A,p.R122H,UNKNOWN 58167991,PD6988a,9,5044417,G,A,Sub,0,161,62.08,240,na,JAK2,CCDS6457.1,c.365G>A,p.R122H,UNKNOWN 58070142,PD6153a,9,5054666,T,C,Sub,0,168,47.8,500,na,JAK2,CCDS6457.1,c.718T>C,p.F240L,UNKNOWN 58151583,PD6112a,9,5054695,C,G,Sub,0,154,48.8,500,na,JAK2,CCDS6457.1,c.747C>G,p.N249K,UNKNOWN 58070857,PD7092a,9,5064925,T,C,Sub,0,165,47.4,500,na,JAK2,CCDS6457.1,c.1099T>C,p.S367P,UNKNOWN 58109623,PD6119a,9,5065019,G,C,Sub,0,107,48.39,465,na,JAK2,CCDS6457.1,c.1193G>C,p.S398T,UNKNOWN 207941869,PD6185a,9,5069097,aag,-,D,0,114,41.82194617,418,2,JAK2,CCDS6457.1,c.1402_1404delAAG,p.K468delK,UNKNOWN 58184962,PD6332a,9,5069154,C,T,Sub,0,113,48.16,461,na,JAK2,CCDS6457.1,c.1459C>T,p.R487C,UNKNOWN 58078112,PD6517a,9,5073770,G,T,Sub,0,44,22.83,368,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58099440,PD6928a,9,5073770,G,T,Sub,0,44,65.2,500,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58104209,PD7085a,9,5073770,G,T,Sub,0,44,10.02,419,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58114674,PD6093a,9,5073770,G,T,Sub,0,44,40.18,438,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58141418,PD6192a,9,5073770,G,T,Sub,0,44,27.99,393,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58143840,PD6835a,9,5073770,G,T,Sub,0,44,16.97,436,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 
58144188,PD7091a,9,5073770,G,T,Sub,0,44,14.29,420,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58147239,PD6807a,9,5073770,G,T,Sub,0,44,53.59,362,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58150332,PD6338a,9,5073770,G,T,Sub,0,44,39.11,473,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58159221,PD6972a,9,5073770,G,T,Sub,0,44,24.24,396,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58184386,PD6305a,9,5073770,G,T,Sub,0,44,31.38,392,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58186789,PD6184a,9,5073770,G,T,Sub,0,44,32.29,449,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58189525,PD5737a,9,5073770,G,T,Sub,0,44,52.18,435,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58196525,PD6339a,9,5073770,G,T,Sub,0,44,47.33,300,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58202262,PD6949a,9,5073770,G,T,Sub,0,44,42.73,447,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 66763405,PD8936a,9,5073770,G,T,Sub,0,44,29,500,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 66962351,PD9663a,9,5073770,G,T,Sub,0,44,18.52,189,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 67016596,PD7386a,9,5073770,G,T,Sub,0,44,31.77,447,na,JAK2,CCDS6457.1,c.1849G>T,p.V617F,ONCOGENIC 58171569,PD5765a,9,5080241,G,C,Sub,0,95,48.68,302,na,JAK2,CCDS6457.1,c.2144G>C,p.R715T,UNKNOWN 58102921,PD6232a,9,5080279,A,G,Sub,0,78,52.49,482,na,JAK2,CCDS6457.1,c.2182A>G,p.K728E,UNKNOWN 58150333,PD6338a,9,5089798,T,C,Sub,0,122,68.31,344,na,JAK2,CCDS6457.1,c.2696T>C,p.I899T,UNKNOWN 58203364,PD7044a,9,5090451,C,T,Sub,0,195,43.3,224,na,JAK2,CCDS6457.1,c.2767C>T,p.R923C,UNKNOWN 58190533,PD6793a,9,5090452,G,A,Sub,0.51,197,42.86,224,na,JAK2,CCDS6457.1,c.2768G>A,p.R923H,UNKNOWN 58198716,PD6958a,9,5123108,A,G,Sub,0,172,46.2,500,na,JAK2,CCDS6457.1,c.3164A>G,p.K1055R,UNKNOWN 58105236,PD6944a,9,21968238,C,G,Sub,0,25,24.16,298,na,CDKN2A,CCDS6510.1,c.461T>C,p.I154T,UNKNOWN 58086272,PD7017a,9,21970993,C,T,Sub,0,39,32.69,104,na,CDKN2A,CCDS6510.1,c.365G>A,p.G122D,ONCOGENIC 
58163911,PD6860a,9,21971057,C,A,Sub,0,11,49.76,209,na,CDKN2A,CCDS6510.1,c.301G>T,p.G101W,ONCOGENIC 58131160,PD5778a,9,21971155,G,C,Sub,0,11,40.14,147,na,CDKN2A,CCDS6510.1,c.203C>G,p.A68G,ONCOGENIC 58147242,PD6807a,9,21971188,G,A,Sub,0,20,55.96,109,na,CDKN2A,CCDS6510.1,c.170C>T,p.A57V,ONCOGENIC 68092618,PD8648a,9,22008897,G,T,Sub,0,15,26.67,60,na,CDKN2B,CCDS6512.1,c.56C>A,p.A19D,UNKNOWN 58139563,PD6898a,9,22008948,C,T,Sub,0,29,32.89,76,na,CDKN2B,CCDS6512.1,c.5G>A,p.R2H,UNKNOWN 58165471,PD6787a,9,102888724,G,A,Sub,0,96,39,500,na,INVS,CCDS6746.1,c.166G>A,p.V56M,UNKNOWN 58130791,PD6824a,9,102888754,G,T,Sub,0,87,10.74,242,na,INVS,CCDS6746.1,c.196G>T,p.A66S,UNKNOWN 58101454,PD6164a,9,102988400,G,C,Sub,0,30,48.52,237,na,INVS,CCDS6746.1,c.330G>C,p.K110N,UNKNOWN 58091618,PD6506a,9,102988419,C,G,Sub,0,26,51.27,197,na,INVS,CCDS6746.1,c.349C>G,p.P117A,UNKNOWN 58196586,PD5762a,9,103002499,C,T,Sub,0,105,29.4,500,na,INVS,CCDS6746.1,c.773C>T,p.P258L,UNKNOWN 58113820,PD6810a,9,103004888,A,G,Sub,0,11,54.2,500,na,INVS,CCDS6746.1,c.833A>G,p.N278S,UNKNOWN 58204283,PD6484a,9,103008997,G,T,Sub,0,175,48.02,479,na,INVS,CCDS6746.1,c.1006G>T,p.D336Y,UNKNOWN 58089054,PD5742a,9,103015413,A,C,Sub,0,105,47.16,282,na,INVS,CCDS6746.1,c.1459A>C,p.K487Q,UNKNOWN 58202837,PD6909a,9,103015413,A,C,Sub,0,105,40.96,293,na,INVS,CCDS6746.1,c.1459A>C,p.K487Q,UNKNOWN 58095673,PD6187a,9,103035235,T,C,Sub,0,35,50,500,na,INVS,CCDS6746.1,c.1661T>C,p.I554T,UNKNOWN 58131343,PD6126a,9,103046655,G,A,Sub,0,28,53.77,199,na,INVS,CCDS6746.1,c.1838G>A,p.R613Q,UNKNOWN 58179713,PD6780a,9,103054849,C,A,Sub,0,27,57.58,33,na,INVS,CCDS6746.1,c.2310C>A,p.H770Q,UNKNOWN 66964075,PD7367a,9,103055144,G,A,Sub,0,92,27.74,274,na,INVS,CCDS6746.1,c.2605G>A,p.E869K,UNKNOWN 58152006,PD6156a,9,103055177,G,A,Sub,0,104,10.31,262,na,INVS,CCDS6746.1,c.2638G>A,p.A880T,UNKNOWN 67016595,PD7386a,9,103062915,C,A,Sub,0,51,4.2,500,na,INVS,CCDS6746.1,c.3157C>A,p.L1053M,UNKNOWN 
58078742,PD6519a,9,133589722,G,A,Sub,0,203,49.48,192,na,ABL1,CCDS35165.1,c.16G>A,p.G6R,UNKNOWN 58155102,PD6821a,9,133729471,G,C,Sub,0,35,41.8,189,na,ABL1,CCDS35165.1,c.157G>C,p.A53P,UNKNOWN 58095497,PD6100a,9,133729493,G,A,Sub,0,37,11.36,44,na,ABL1,CCDS35165.1,c.179G>A,p.G60D,UNKNOWN 66878140,PD8934a,9,133750263,C,T,Sub,0,232,12.39,226,na,ABL1,CCDS35165.1,c.1151C>T,p.A384V,UNKNOWN 58088268,PD6483a,9,133755463,T,C,Sub,0,113,52.03,123,na,ABL1,CCDS35165.1,c.1489T>C,p.W497R,UNKNOWN 58181501,PD6816a,9,133755481,C,A,Sub,0,123,12.73,55,na,ABL1,CCDS35165.1,c.1507C>A,p.P503T,UNKNOWN 58109803,PD6139a,9,133755919,C,A,Sub,0,11,54.24,59,na,ABL1,CCDS35165.1,c.1603C>A,p.R535S,UNKNOWN 66951270,PD7391a,9,133755928,G,A,Sub,0,13,71.88,32,na,ABL1,CCDS35165.1,c.1612G>A,p.V538M,UNKNOWN 58155280,PD6165a,9,133759362,T,C,Sub,0,62,44.44,27,na,ABL1,CCDS35165.1,c.1742T>C,p.L581P,UNKNOWN 58155949,PD6899a,9,133759821,G,A,Sub,0,21,61.9,21,na,ABL1,CCDS35165.1,c.2201G>A,p.R734H,UNKNOWN 58076924,PD6115a,9,133760036,G,A,Sub,0,34,39.68,63,na,ABL1,CCDS35165.1,c.2416G>A,p.V806M,UNKNOWN 58148649,PD6963a,9,133760042,A,G,Sub,0,37,57.58,33,na,ABL1,CCDS35165.1,c.2422A>G,p.K808E,UNKNOWN 58165305,PD6221a,9,133760750,G,A,Sub,0,37,50.62,81,na,ABL1,CCDS35165.1,c.3130G>A,p.G1044S,UNKNOWN 58083188,PD6223a,9,139391635,C,T,Sub,0,50,58.82,17,na,NOTCH1,CCDS43905.1,c.6556G>A,p.G2186S,UNKNOWN 58120974,PD6265a,9,139396343,G,C,Sub,0,18,33.33,12,na,NOTCH1,CCDS43905.1,c.5495C>G,p.P1832R,UNKNOWN 58195710,PD6907a,9,139402485,G,T,Sub,0,25,63.64,11,na,NOTCH1,CCDS43905.1,c.3432C>A,p.D1144E,UNKNOWN 58143839,PD6835a,9,139402543,G,A,Sub,0,12,60,5,na,NOTCH1,CCDS43905.1,c.3374C>T,p.A1125V,UNKNOWN 58171765,PD6130a,9,139402738,C,T,Sub,0,24,42.86,7,na,NOTCH1,CCDS43905.1,c.3271G>A,p.G1091S,UNKNOWN 58148650,PD6963a,9,139407844,C,T,Sub,0,24,54.55,11,na,NOTCH1,CCDS43905.1,c.2353G>A,p.G785S,UNKNOWN 58130574,PD6482a,9,139409753,G,A,Sub,0,46,83.33,6,na,NOTCH1,CCDS43905.1,c.2003C>T,p.P668L,UNKNOWN 
58183765,PD6897a,10,70332196,C,T,Sub,0,44,41.11,180,na,TET1,CCDS7281.1,c.101C>T,p.A34V,UNKNOWN 58130158,PD6146a,10,70332303,G,A,Sub,0,57,57.35,68,na,TET1,CCDS7281.1,c.208G>A,p.V70I,UNKNOWN 58076645,PD6914a,10,70332379,A,G,Sub,0,58,65.2,500,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58091748,PD6782a,10,70332379,A,G,Sub,0,58,32.94,337,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58130906,PD6824a,10,70332379,A,G,Sub,0,58,35.06,174,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58138015,PD6803a,10,70332379,A,G,Sub,0,58,21.04,385,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58138800,PD6814a,10,70332379,A,G,Sub,0,58,23.27,202,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58160869,PD6806a,10,70332379,A,G,Sub,0,58,21.24,306,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58179043,PD6784a,10,70332379,A,G,Sub,0,58,23.8,500,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58181489,PD6816a,10,70332379,A,G,Sub,0,58,77.97,413,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58194296,PD6881a,10,70332379,A,G,Sub,0,58,71.4,500,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58194882,PD6905a,10,70332379,A,G,Sub,0,58,15.32,124,na,TET1,CCDS7281.1,c.284A>G,p.E95G,UNKNOWN 58173346,PD6879a,10,70332573,A,G,Sub,0,116,45.22,471,na,TET1,CCDS7281.1,c.478A>G,p.M160V,UNKNOWN 58130907,PD6824a,10,70332718,G,A,Sub,0,97,25.9,166,na,TET1,CCDS7281.1,c.623G>A,p.G208D,UNKNOWN 58130908,PD6824a,10,70333222,G,A,Sub,0,63,13.14,137,na,TET1,CCDS7281.1,c.1127G>A,p.W376*,UNKNOWN 58163887,PD6860a,10,70333695,T,G,Sub,0,68,48.9,227,na,TET1,CCDS7281.1,c.1600T>G,p.S534A,UNKNOWN 68092772,PD8728a,10,70333741,G,A,Sub,0,78,15,40,na,TET1,CCDS7281.1,c.1646G>A,p.S549N,UNKNOWN 58182554,PD6171a,10,70333785,A,G,Sub,0,85,48.97,145,na,TET1,CCDS7281.1,c.1690A>G,p.M564V,UNKNOWN 66971186,PD7379a,10,70333787,G,A,Sub,0,84,12.95,139,na,TET1,CCDS7281.1,c.1692G>A,p.M564I,UNKNOWN 58141031,PD6987a,10,70404527,G,A,Sub,0,156,49.6,500,na,TET1,CCDS7281.1,c.2041G>A,p.E681K,UNKNOWN 
58105203,PD6944a,10,70405251,A,T,Sub,0,57,53.85,26,na,TET1,CCDS7281.1,c.2765G>T,p.R922I,UNKNOWN 58147343,PD6057a,10,70405550,G,A,Sub,0,164,15.78,431,na,TET1,CCDS7281.1,c.3064G>A,p.A1022T,UNKNOWN 58092716,PD7028a,10,70405634,C,A,Sub,0,229,48.6,500,na,TET1,CCDS7281.1,c.3148C>A,p.Q1050K,UNKNOWN 58182732,PD6210a,10,70426842,T,A,Sub,0,76,11.19,295,na,TET1,CCDS7281.1,c.4502T>A,p.L1501*,UNKNOWN 58136647,PD7043a,10,70426857,C,T,Sub,0,83,49.85,343,na,TET1,CCDS7281.1,c.4517C>T,p.T1506I,UNKNOWN 58102157,PD6270a,10,70446360,C,T,Sub,0,78,45.17,476,na,TET1,CCDS7281.1,c.5300C>T,p.A1767V,UNKNOWN 58141411,PD6192a,10,70450816,G,A,Sub,0,95,14.77,264,na,TET1,CCDS7281.1,c.5656G>A,p.G1886R,UNKNOWN 58083729,PD6131a,10,70451014,C,T,Sub,0,138,45.01,471,na,TET1,CCDS7281.1,c.5854C>T,p.L1952F,UNKNOWN 58106388,PD6237a,10,70451567,T,C,Sub,0,237,44.17,326,na,TET1,CCDS7281.1,c.6407T>C,p.V2136A,UNKNOWN 58069700,PD5721a,10,89692797,A,T,Sub,0,168,43,500,na,PTEN,ENST00000371953,c.281A>T,p.N94I,ONCOGENIC 58147337,PD6057a,10,89711968,C,T,Sub,0,390,10.54,427,na,PTEN,ENST00000371953,c.586C>T,p.H196Y,ONCOGENIC 58156252,PD6813a,10,126490436,T,C,Sub,0,23,36.84,38,na,FAM175B,CCDS31308.2,c.38T>C,p.V13A,UNKNOWN 58200691,PD6275a,10,126515266,G,A,Sub,0,265,52.6,289,na,FAM175B,CCDS31308.2,c.370G>A,p.D124N,UNKNOWN 66937097,PD7383a,10,126515272,G,A,Sub,0.39,254,45.33,150,na,FAM175B,CCDS31308.2,c.376G>A,p.V126I,UNKNOWN 58116302,PD6263a,10,126523433,A,T,Sub,0,129,45.63,206,na,FAM175B,CCDS31308.2,c.1141A>T,p.I381F,UNKNOWN 58187660,PD6833a,10,126523433,A,T,Sub,0,129,54.6,315,na,FAM175B,CCDS31308.2,c.1141A>T,p.I381F,UNKNOWN 66848051,PD7377a,10,126523502,G,T,Sub,0,147,36.67,60,na,FAM175B,CCDS31308.2,c.1210G>T,p.D404Y,UNKNOWN 58077315,PD6937a,11,404319,G,A,Sub,2.22,45,42.86,7,na,PKP3,CCDS7695.1,c.2354G>A,p.R785Q,UNKNOWN 58192709,PD6922a,11,3697595,G,A,Sub,0,29,53.64,110,na,NUP98,CCDS7746.1,c.5197C>T,p.R1733C,UNKNOWN 58183309,PD6124a,11,3697784,C,T,Sub,0,126,49.34,302,na,NUP98,CCDS7746.1,c.5140G>A,p.E1714K,UNKNOWN 
58148469,PD6143a,11,3697846,T,C,Sub,0,101,50.29,348,na,NUP98,CCDS7746.1,c.5078A>G,p.D1693G,UNKNOWN 58159158,PD6532a,11,3700812,A,G,Sub,0,83,46.04,417,na,NUP98,CCDS7746.1,c.5045T>C,p.V1682A,UNKNOWN 209723094,PD6934a,11,3720395,-,T,I,0,48,13.12741313,258,7,NUP98,CCDS7746.1,c.3925_3926insA,p.N1311fs*18,UNKNOWN 58190511,PD6793a,11,3726535,C,T,Sub,0,211,48.07,441,na,NUP98,CCDS7746.1,c.2977G>A,p.D993N,UNKNOWN 58092020,PD6840a,11,3733837,G,A,Sub,0,84,46.88,433,na,NUP98,CCDS7746.1,c.2699C>T,p.P900L,UNKNOWN 58169966,PD6269a,11,3740760,C,T,Sub,0,192,49.8,500,na,NUP98,CCDS7746.1,c.2281G>A,p.E761K,UNKNOWN 58096540,PD6846a,11,3744428,C,T,Sub,0.45,223,50.66,452,na,NUP98,CCDS7746.1,c.2105G>A,p.R702Q,UNKNOWN 58108502,PD7000a,11,3744515,G,C,Sub,0,150,48.2,500,na,NUP98,CCDS7746.1,c.2018C>G,p.A673G,UNKNOWN 58188835,PD5780a,11,3744656,T,C,Sub,0,117,48.69,495,na,NUP98,CCDS7746.1,c.1877A>G,p.E626G,UNKNOWN 58202141,PD6923a,11,3746385,G,A,Sub,0,96,52.04,465,na,NUP98,CCDS7746.1,c.1795C>T,p.R599C,UNKNOWN 208033814,PD6805a,11,3746406,tgc,-,D,0,95,24.04692082,309,1,NUP98,CCDS7746.1,c.1772_1774delGCA,p.S591delS,UNKNOWN 58068190,PD6142a,11,3746442,T,C,Sub,0,75,53.1,290,na,NUP98,CCDS7746.1,c.1738A>G,p.I580V,UNKNOWN 58076875,PD6115a,11,3752636,C,G,Sub,0,99,47.64,424,na,NUP98,CCDS7746.1,c.1715G>C,p.G572A,UNKNOWN 58138080,PD6803a,11,3752696,C,T,Sub,0,77,15.23,302,na,NUP98,CCDS7746.1,c.1655G>A,p.G552D,UNKNOWN 58163155,PD7077a,11,3752720,C,T,Sub,0,64,12.66,308,na,NUP98,CCDS7746.1,c.1631G>A,p.R544Q,UNKNOWN 58130744,PD6824a,11,3752786,G,A,Sub,0,31,12.2,164,na,NUP98,CCDS7746.1,c.1565C>T,p.A522V,UNKNOWN 58112132,PD7010a,11,3774575,C,T,Sub,0,15,23.68,359,na,NUP98,CCDS7746.1,c.1238G>A,p.G413E,UNKNOWN 68092824,PD8728a,11,3781781,C,T,Sub,0,40,9.93,151,na,NUP98,CCDS7746.1,c.1162G>A,p.G388S,UNKNOWN 58159336,PD6972a,11,3789881,G,A,Sub,0,51,48.84,172,na,NUP98,CCDS7746.1,c.878C>T,p.T293I,UNKNOWN 66763465,PD8936a,11,3792978,T,C,Sub,0,15,32.47,154,na,NUP98,CCDS7746.1,c.784A>G,p.S262G,UNKNOWN 
58147377,PD6057a,11,3797138,G,A,Sub,0,26,10,310,na,NUP98,CCDS7746.1,c.469C>T,p.P157S,UNKNOWN 58179164,PD6784a,11,3797143,G,A,Sub,0,27,11.8,500,na,NUP98,CCDS7746.1,c.464C>T,p.A155V,UNKNOWN 58165146,PD6176a,11,3800199,T,A,Sub,0,157,44.96,347,na,NUP98,CCDS7746.1,c.259A>T,p.T87S,UNKNOWN 58186616,PD6064a,11,3800244,C,G,Sub,0,178,45.01,351,na,NUP98,CCDS7746.1,c.214G>C,p.A72P,UNKNOWN 58196825,PD6258a,11,32410684,T,G,Sub,0,172,46.4,500,na,WT1,CCDS7878.2,c.1474A>C,p.K492Q,UNKNOWN 343926648,PD8734a,11,32417914,-,GACCG,I,0,112,25.86206897,52,1,WT1,CCDS7878.2,c.1137_1138insCGGTC,p.A382fs*69,ONCOGENIC 209720001,PD6869a,11,32417923,-,ACAAGAGT,I,0,115,24.09638554,134,1,WT1,CCDS7878.2,c.1128_1129insACTCTTGT,p.R380fs*72,ONCOGENIC 208053435,PD6854a,11,32417942,-,GGAC,I,0,91,49.13793103,99,0,WT1,CCDS7878.2,c.1109_1110insGTCC,p.V371fs*15,ONCOGENIC 344723408,PD7388a,11,32417942,-,A,I,0,90,27.63157895,76,1,WT1,CCDS7878.2,c.1109_1110insT,p.V371fs*14,ONCOGENIC 344723409,PD7388a,11,32417948,-,C,I,0,85,32.85714286,70,1,WT1,CCDS7878.2,c.1103_1104insG,p.R369fs*16,ONCOGENIC 58071627,PD6110a,11,32421511,C,G,Sub,0,13,18.97,58,na,WT1,CCDS7878.2,c.1081G>C,p.V361L,UNKNOWN 68195700,PD8731a,11,32421537,G,A,Sub,0,13,17.46,63,na,WT1,CCDS7878.2,c.1055C>T,p.A352V,UNKNOWN 58108659,PD7032a,11,32456333,G,A,Sub,0,15,10.71,56,na,WT1,CCDS7878.2,c.559C>T,p.Q187*,ONCOGENIC 58181395,PD6816a,11,64537878,G,T,Sub,0,26,20,140,na,SF1,CCDS31599.1,c.239C>A,p.S80Y,UNKNOWN 58119291,PD6113a,11,118339512,G,A,Sub,0,237,11.4,500,na,MLL,CCDS31686.1,c.455G>A,p.G152D,UNKNOWN 58152500,PD5748a,11,118343276,C,A,Sub,0,44,32.8,500,na,MLL,CCDS31686.1,c.1402C>A,p.H468N,UNKNOWN 58136139,PD6236a,11,118344431,A,C,Sub,0,46,49.47,285,na,MLL,CCDS31686.1,c.2557A>C,p.K853Q,UNKNOWN 58113876,PD6810a,11,118344558,A,G,Sub,0,72,47.42,388,na,MLL,CCDS31686.1,c.2684A>G,p.K895R,UNKNOWN 58082417,PD6977a,11,118352550,C,A,Sub,0,41,59.26,27,na,MLL,CCDS31686.1,c.3755C>A,p.P1252H,UNKNOWN 
58096718,PD6889a,11,118352605,G,C,Sub,0,26,49.61,383,na,MLL,CCDS31686.1,c.3810G>C,p.K1270N,UNKNOWN 208443800,PD6098a,11,118355649,a,-,D,,55,25.81423402,829,1,MLL,CCDS31686.1,c.4291delA,p.R1431fs*13,POSSIBLE ONCOGENIC 58118303,PD6081a,11,118361952,G,A,Sub,0,175,11.4,500,na,MLL,CCDS31686.1,c.4738G>A,p.D1580N,UNKNOWN 66960476,PD8734a,11,118361983,A,G,Sub,0,177,50.6,332,na,MLL,CCDS31686.1,c.4769A>G,p.K1590R,UNKNOWN 58115017,PD6982a,11,118363925,C,T,Sub,0,105,27.53,287,na,MLL,CCDS31686.1,c.5149C>T,p.Q1717*,POSSIBLE ONCOGENIC 58105229,PD6944a,11,118366474,G,A,Sub,0,39,99.8,500,na,MLL,CCDS31686.1,c.5414G>A,p.W1805*,POSSIBLE ONCOGENIC 58086909,PD6125a,11,118372440,C,T,Sub,0,105,46.2,500,na,MLL,CCDS31686.1,c.6364C>T,p.P2122S,UNKNOWN 58135572,PD6869a,11,118373744,G,C,Sub,0,64,43.8,500,na,MLL,CCDS31686.1,c.7128G>C,p.R2376S,UNKNOWN 58114898,PD5734a,11,118373839,G,A,Sub,0,92,12.17,460,na,MLL,CCDS31686.1,c.7223G>A,p.S2408N,UNKNOWN 58142508,PD6177a,11,118374153,C,A,Sub,0,118,50,500,na,MLL,CCDS31686.1,c.7537C>A,p.Q2513K,UNKNOWN 58072994,PD6985a,11,118374280,C,T,Sub,0,115,13.6,397,na,MLL,CCDS31686.1,c.7664C>T,p.P2555L,UNKNOWN 58188762,PD5727a,11,118374411,A,G,Sub,0,89,49,500,na,MLL,CCDS31686.1,c.7795A>G,p.M2599V,UNKNOWN 58117442,PD6120a,11,118374837,C,G,Sub,0,65,52,500,na,MLL,CCDS31686.1,c.8221C>G,p.P2741A,UNKNOWN 58200545,PD6235a,11,118375065,G,A,Sub,0,103,47,500,na,MLL,CCDS31686.1,c.8449G>A,p.D2817N,UNKNOWN 66846673,PD7374a,11,118375653,G,C,Sub,0,149,42.38,453,na,MLL,CCDS31686.1,c.9037G>C,p.E3013Q,UNKNOWN 58201844,PD6894a,11,118375723,C,A,Sub,0,154,34.46,354,na,MLL,CCDS31686.1,c.9107C>A,p.S3036Y,UNKNOWN 58076706,PD6914a,11,118375840,C,G,Sub,0,193,15.73,445,na,MLL,CCDS31686.1,c.9224C>G,p.T3075S,UNKNOWN 58093253,PD6318a,11,118376290,G,A,Sub,0,65,49.09,383,na,MLL,CCDS31686.1,c.9674G>A,p.R3225H,UNKNOWN 58144002,PD6847a,11,118376683,C,T,Sub,0,118,48.05,385,na,MLL,CCDS31686.1,c.10067C>T,p.A3356V,UNKNOWN 
58092630,PD7016a,11,118377103,A,G,Sub,0,37,11.4,500,na,MLL,CCDS31686.1,c.10487A>G,p.E3496G,UNKNOWN 58119121,PD6080a,11,118390409,G,C,Sub,0,75,54.74,422,na,MLL,CCDS31686.1,c.11214G>C,p.K3738N,UNKNOWN 58174880,PD6503a,11,118390478,C,A,Sub,0,135,40.88,274,na,MLL,CCDS31686.1,c.11283C>A,p.H3761Q,UNKNOWN 66971065,PD7379a,11,118391552,G,A,Sub,0,102,15.38,117,na,MLL,CCDS31686.1,c.11456G>A,p.R3819H,UNKNOWN 66786023,PD7384a,11,118392082,G,A,Sub,0,193,41.45,152,na,MLL,CCDS31686.1,c.11584G>A,p.V3862I,UNKNOWN 58166763,PD6861a,11,118392769,T,C,Sub,0,140,46,500,na,MLL,CCDS31686.1,c.11792T>C,p.M3931T,UNKNOWN 58119977,PD6848a,11,119142563,G,T,Sub,0,77,21.6,500,na,CBL,CCDS8418.1,c.562G>T,p.E188*,ONCOGENIC 58104386,PD6240a,11,119144603,C,T,Sub,0,114,12,500,na,CBL,CCDS8418.1,c.616C>T,p.R206*,ONCOGENIC 209602730,PD6787a,11,119144625,-,A,I,0,124,34.18530351,626,1,CBL,CCDS8418.1,c.638_639insA,p.H213fs*17,ONCOGENIC 58184625,PD6876a,11,119145564,A,G,Sub,0,166,22.2,500,na,CBL,CCDS8418.1,c.770A>G,p.N257S,POSSIBLE ONCOGENIC 58131512,PD6537a,11,119145587,C,T,Sub,0,176,48.4,500,na,CBL,CCDS8418.1,c.793C>T,p.H265Y,POSSIBLE ONCOGENIC 208093202,PD6816a,11,119145603,-,T,I,0,193,15.78947368,151,6,CBL,CCDS8418.1,c.809_810insT,p.L272fs*4,ONCOGENIC 58068269,PD6197a,11,119148486,C,T,Sub,0,104,25.49,455,na,CBL,CCDS8418.1,c.1027C>T,p.R343*,ONCOGENIC 58071220,PD6122a,11,119148486,C,T,Sub,0,104,10,390,na,CBL,CCDS8418.1,c.1027C>T,p.R343*,ONCOGENIC 58078618,PD6255a,11,119148486,C,T,Sub,0,104,40.57,318,na,CBL,CCDS8418.1,c.1027C>T,p.R343*,ONCOGENIC 58205475,PD6178a,11,119148892,A,C,Sub,0,125,24.81,262,na,CBL,CCDS8418.1,c.1112A>C,p.Y371S,ONCOGENIC 58098282,PD6543a,11,119148919,T,G,Sub,0,139,22.82,447,na,CBL,CCDS8418.1,c.1139T>G,p.L380R,ONCOGENIC 58139663,PD6898a,11,119148922,G,A,Sub,0,137,47.13,331,na,CBL,CCDS8418.1,c.1142G>A,p.C381Y,ONCOGENIC 66866701,PD7387a,11,119148922,G,A,Sub,0,137,40.86,186,na,CBL,CCDS8418.1,c.1142G>A,p.C381Y,ONCOGENIC 
66866702,PD7387a,11,119148928,T,C,Sub,0,139,29.17,192,na,CBL,CCDS8418.1,c.1148T>C,p.I383T,ONCOGENIC 58133800,PD6058a,11,119148931,G,A,Sub,0,139,15.36,306,na,CBL,CCDS8418.1,c.1151G>A,p.C384Y,ONCOGENIC 58085072,PD6212a,11,119148947,G,T,Sub,0,138,10.28,214,na,CBL,CCDS8418.1,c.1167G>T,p.K389N,ONCOGENIC 58171526,PD6277a,11,119148973,A,C,Sub,0,168,47.27,330,na,CBL,CCDS8418.1,c.1193A>C,p.H398P,ONCOGENIC 58171527,PD6277a,11,119148990,T,G,Sub,0,181,22.35,349,na,CBL,CCDS8418.1,c.1210T>G,p.C404G,ONCOGENIC 58074240,PD5789a,11,119148991,G,A,Sub,0,181,24.73,368,na,CBL,CCDS8418.1,c.1211G>A,p.C404Y,ONCOGENIC 58085073,PD6212a,11,119148991,G,A,Sub,0,181,12.26,261,na,CBL,CCDS8418.1,c.1211G>A,p.C404Y,ONCOGENIC 58141861,PD6500a,11,119148991,G,A,Sub,0,181,83.63,391,na,CBL,CCDS8418.1,c.1211G>A,p.C404Y,ONCOGENIC 58165457,PD6787a,11,119148991,G,A,Sub,0,181,25.54,372,na,CBL,CCDS8418.1,c.1211G>A,p.C404Y,ONCOGENIC 58151881,PD6268a,11,119148992,T,G,Sub,0,183,80.17,242,na,CBL,CCDS8418.1,c.1212T>G,p.C404W,ONCOGENIC 58097629,PD6106a,11,119149002,T,A,Sub,0,188,49.14,409,na,CBL,CCDS8418.1,c.1222T>A,p.W408R,ONCOGENIC 58174584,PD6099a,11,119149002,T,A,Sub,0,188,42.34,359,na,CBL,CCDS8418.1,c.1222T>A,p.W408R,ONCOGENIC 58081489,PD6079a,11,119149242,C,T,Sub,0,62,16.39,299,na,CBL,CCDS8418.1,c.1250C>T,p.P417L,ONCOGENIC 66816548,PD7375a,11,119149242,C,A,Sub,0,62,8.82,136,na,CBL,CCDS8418.1,c.1250C>A,p.P417H,ONCOGENIC 58078619,PD6255a,11,119149251,G,A,Sub,0,65,51.59,252,na,CBL,CCDS8418.1,c.1259G>A,p.R420Q,ONCOGENIC 58190517,PD6793a,11,119149251,G,A,Sub,0,65,84.85,231,na,CBL,CCDS8418.1,c.1259G>A,p.R420Q,ONCOGENIC 58085074,PD6212a,11,119149259,A,T,Sub,0,72,12.34,235,na,CBL,CCDS8418.1,c.1267A>T,p.I423F,ONCOGENIC 58179677,PD6780a,11,119149268,A,C,Sub,0,72,82.86,210,na,CBL,CCDS8418.1,c.1276A>C,p.T426P,ONCOGENIC 58179572,PD5773a,11,119149311,G,T,Sub,0,83,50.92,489,na,CBL,CCDS8418.1,c.1319G>T,p.G440V,POSSIBLE ONCOGENIC 66763471,PD8936a,11,119149311,G,A,Sub,0,83,8.21,195,na,CBL,CCDS8418.1,c.1319G>A,p.G440D,POSSIBLE 
ONCOGENIC 66763472,PD8936a,11,119149413,C,T,Sub,0,58,40.2,102,na,CBL,CCDS8418.1,c.1421C>T,p.A474V,POSSIBLE ONCOGENIC 68082193,PD8646a,11,119156025,C,T,Sub,0,93,4.83,290,na,CBL,CCDS8418.1,c.1690C>T,p.P564S,UNKNOWN 66816549,PD7375a,11,119156172,G,T,Sub,0,135,10.76,223,na,CBL,CCDS8418.1,c.1837G>T,p.E613*,ONCOGENIC 66875944,PD8735a,11,119158652,G,A,Sub,0,89,8.2,500,na,CBL,CCDS8418.1,c.2032G>A,p.A678T,ONCOGENIC 58118510,PD7098a,11,119167692,A,G,Sub,0,105,49.15,352,na,CBL,CCDS8418.1,c.2101A>G,p.M701V,POSSIBLE ONCOGENIC 66904522,PD7369a,11,119168123,G,A,Sub,0,61,4.93,304,na,CBL,CCDS8418.1,c.2183G>A,p.S728N,POSSIBLE ONCOGENIC 58173287,PD6286a,12,417051,G,A,Sub,0,293,50,500,na,KDM5A,CCDS41736.1,c.3499C>T,p.R1167C,UNKNOWN 58198708,PD6958a,12,417108,T,C,Sub,0,279,32.8,500,na,KDM5A,CCDS41736.1,c.3442A>G,p.N1148D,UNKNOWN 207955146,PD5737a,12,418983,-,TCT,I,0,136,46.34831461,324,1,KDM5A,CCDS41736.1,c.3363_3364insAGA,p.R1121_D1122insR,UNKNOWN 58112211,PD7010a,12,420176,G,A,Sub,0,110,10.85,212,na,KDM5A,CCDS41736.1,c.3091C>T,p.R1031C,UNKNOWN 58090767,PD6206a,12,431676,T,C,Sub,0,40,52.6,500,na,KDM5A,CCDS41736.1,c.2333A>G,p.N778S,UNKNOWN 58139841,PD5782a,12,432298,C,A,Sub,0,122,51.61,248,na,KDM5A,CCDS41736.1,c.2225G>T,p.W742L,UNKNOWN 58087843,PD6509a,12,443527,G,A,Sub,0,78,52,500,na,KDM5A,CCDS41736.1,c.1370C>T,p.A457V,UNKNOWN 58081591,PD6079a,12,463301,T,C,Sub,0,172,51.8,500,na,KDM5A,CCDS41736.1,c.970A>G,p.I324V,UNKNOWN 66858218,PD8935a,12,498215,G,A,Sub,0,43,14.89,47,na,KDM5A,CCDS41736.1,c.43C>T,p.P15S,UNKNOWN 58092640,PD7016a,12,11803084,G,A,Sub,0,30,22.22,180,na,ETV6,CCDS8643.1,c.23G>A,p.C8Y,UNKNOWN 58092641,PD7016a,12,11905384,C,T,Sub,0,31,18.23,351,na,ETV6,CCDS8643.1,c.34C>T,p.Q12*,ONCOGENIC 208535415,PD6997a,12,11992193,ctg,-,D,0,155,16.74208145,216,3,ETV6,CCDS8643.1,c.283_285delCTG,p.L95delL,POSSIBLE ONCOGENIC 208172042,PD7113a,12,11992193,ctg,-,D,0,155,11.05527638,199,na,ETV6,CCDS8643.1,c.283_285delCTG,p.L95delL,POSSIBLE ONCOGENIC 
208335401,PD6088a,12,11992218,-,CTAT,I,0,140,13.06532663,194,1,ETV6,CCDS8643.1,c.308_309insCTAT,p.R105fs*8,ONCOGENIC 207928317,PD6809a,12,12006482,-,A,I,0,146,52.12527964,893,2,ETV6,CCDS8643.1,c.450_451insA,p.N151fs*3,ONCOGENIC 58172305,PD6128a,12,12022525,C,T,Sub,0,17,55.94,143,na,ETV6,CCDS8643.1,c.631C>T,p.R211C,UNKNOWN 58092393,PD6890a,12,12037426,C,T,Sub,0,198,26.91,301,na,ETV6,CCDS8643.1,c.1057C>T,p.R353W,UNKNOWN 58099317,PD6322a,12,12037475,G,A,Sub,0,199,47.8,205,na,ETV6,CCDS8643.1,c.1106G>A,p.R369Q,POSSIBLE ONCOGENIC 58145593,PD6855a,12,12037475,G,A,Sub,0,199,40.54,296,na,ETV6,CCDS8643.1,c.1106G>A,p.R369Q,POSSIBLE ONCOGENIC 58202781,PD6909a,12,12037499,C,T,Sub,0,194,10.89,303,na,ETV6,CCDS8643.1,c.1130C>T,p.A377V,UNKNOWN 58083394,PD6942a,12,12870824,C,A,Sub,0,36,51.43,70,na,CDKN1B,CCDS8653.1,c.51C>A,p.D17E,UNKNOWN 58199944,PD5733a,12,12870891,G,A,Sub,0,67,54.31,116,na,CDKN1B,CCDS8653.1,c.118G>A,p.E40K,UNKNOWN 58125503,PD6490a,12,12870897,A,G,Sub,0,71,39.13,115,na,CDKN1B,CCDS8653.1,c.124A>G,p.T42A,UNKNOWN 58178085,PD6792a,12,12870979,C,T,Sub,0,88,48.39,124,na,CDKN1B,CCDS8653.1,c.206C>T,p.P69L,UNKNOWN 58163051,PD7077a,12,12871042,G,A,Sub,0,46,14.52,62,na,CDKN1B,CCDS8653.1,c.269G>A,p.R90K,UNKNOWN 208060413,PD7016a,12,12871058,-,C,I,0,48,41.50943396,53,6,CDKN1B,CCDS8653.1,c.285_286insC,p.K96fs*29,UNKNOWN 58098248,PD6256a,12,12871789,G,A,Sub,0,50,42.42,132,na,CDKN1B,CCDS8653.1,c.506G>A,p.R169K,UNKNOWN 58079647,PD6098a,12,12871828,G,A,Sub,0,37,25.49,408,na,CDKN1B,CCDS8653.1,c.545G>A,p.G182D,UNKNOWN 58138775,PD6479a,12,25378561,G,A,Sub,0,345,11.17,349,na,KRAS,CCDS8703.1,c.437C>T,p.A146V,ONCOGENIC 58175301,PD6273a,12,25378647,T,G,Sub,0,275,33.67,395,na,KRAS,CCDS8703.1,c.351A>C,p.K117N,ONCOGENIC 58184933,PD6276a,12,25378647,T,A,Sub,0,275,48.09,235,na,KRAS,CCDS8703.1,c.351A>T,p.K117N,ONCOGENIC 58082057,PD6072a,12,25380174,T,A,Sub,0,93,46.32,136,na,KRAS,CCDS8703.1,c.284A>T,p.H95L,UNKNOWN 
58184675,PD6876a,12,25380280,C,G,Sub,0,73,24.22,256,na,KRAS,CCDS8703.1,c.178G>C,p.G60R,UNKNOWN 58111452,PD6955a,12,25398220,A,C,Sub,0,62,33.49,209,na,KRAS,CCDS8703.1,c.99T>G,p.D33E,UNKNOWN 58181424,PD6816a,12,25398276,C,T,Sub,0,87,23.53,119,na,KRAS,CCDS8703.1,c.43G>A,p.G15S,ONCOGENIC 58151798,PD6475a,12,25398282,C,A,Sub,0,86,42.91,289,na,KRAS,CCDS8703.1,c.37G>T,p.G13C,ONCOGENIC 58107876,PD6215a,12,25398284,C,T,Sub,0,88,15.41,279,na,KRAS,CCDS8703.1,c.35G>A,p.G12D,ONCOGENIC 58174332,PD6913a,12,25398284,C,T,Sub,0,88,27.94,340,na,KRAS,CCDS8703.1,c.35G>A,p.G12D,ONCOGENIC 58201853,PD6894a,12,25398284,C,A,Sub,0,88,34.44,180,na,KRAS,CCDS8703.1,c.35G>T,p.G12V,ONCOGENIC 66914357,PD8737a,12,25398284,C,T,Sub,0,88,11.33,203,na,KRAS,CCDS8703.1,c.35G>A,p.G12D,ONCOGENIC 58079648,PD6098a,12,49415572,C,T,Sub,0,141,27.35,117,na,MLL2,CCDS44873.1,c.16605G>A,p.W5535*,ONCOGENIC 58091896,PD6782a,12,49415572,C,T,Sub,0,141,19.19,99,na,MLL2,CCDS44873.1,c.16605G>A,p.W5535*,ONCOGENIC 58098599,PD6783a,12,49416497,C,T,Sub,0,118,59.65,171,na,MLL2,CCDS44873.1,c.16214G>A,p.R5405H,UNKNOWN 58138777,PD6479a,12,49416543,G,A,Sub,0,25,43.7,238,na,MLL2,CCDS44873.1,c.16168C>T,p.R5390W,UNKNOWN 58138946,PD6814a,12,49418401,A,G,Sub,0,131,10.34,87,na,MLL2,CCDS44873.1,c.16012T>C,p.C5338R,UNKNOWN 58158129,PD6536a,12,49418463,T,C,Sub,0,126,10.22,186,na,MLL2,CCDS44873.1,c.15950A>G,p.Y5317C,UNKNOWN 58076829,PD6914a,12,49420387,G,A,Sub,0,209,12.71,425,na,MLL2,CCDS44873.1,c.15362C>T,p.A5121V,UNKNOWN 66971232,PD7379a,12,49420687,C,T,Sub,0,58,15.79,76,na,MLL2,CCDS44873.1,c.15062G>A,p.R5021Q,UNKNOWN 66878366,PD8934a,12,49421026,G,A,Sub,0,42,42.86,7,na,MLL2,CCDS44873.1,c.14723C>T,p.A4908V,UNKNOWN 58130941,PD6824a,12,49421833,C,T,Sub,0,130,59.41,101,na,MLL2,CCDS44873.1,c.14474G>A,p.R4825Q,UNKNOWN 58179242,PD6784a,12,49422714,G,A,Sub,0,278,10.71,56,na,MLL2,CCDS44873.1,c.14279C>T,p.A4760V,UNKNOWN 58145589,PD6855a,12,49422949,C,T,Sub,0,104,64.29,28,na,MLL2,CCDS44873.1,c.14146G>A,p.G4716R,UNKNOWN 
208463686,PD6824a,12,49423212,-,G,I,0,43,19.35483871,31,5,MLL2,CCDS44873.1,c.14046_14047insC,p.H4685fs*5,ONCOGENIC 58080532,PD6545a,12,49425037,C,T,Sub,0,184,44.9,98,na,MLL2,CCDS44873.1,c.13451G>A,p.R4484Q,UNKNOWN 58145275,PD6225a,12,49425098,G,A,Sub,1.04,96,50,90,na,MLL2,CCDS44873.1,c.13390C>T,p.Q4464*,ONCOGENIC 58160645,PD5715a,12,49425806,G,C,Sub,0,52,32.35,34,na,MLL2,CCDS44873.1,c.12682C>G,p.Q4228E,UNKNOWN 58176943,PD6961a,12,49426129,C,T,Sub,0,79,35,40,na,MLL2,CCDS44873.1,c.12359G>A,p.G4120D,UNKNOWN 58125259,PD6194a,12,49426265,G,A,Sub,0,70,36.84,19,na,MLL2,CCDS44873.1,c.12223C>T,p.L4075F,UNKNOWN 208171610,PD6276a,12,49426639,tgttg<14>gctgt,-,D,0,8,24.76190476,85,0,MLL2,CCDS44873.1,c.11826_11849del24,p.Q3947_Q3954delQLQQQQQQ,UNKNOWN 58190982,PD6919a,12,49427158,A,C,Sub,0,24,40,85,na,MLL2,CCDS44873.1,c.11330T>G,p.M3777R,UNKNOWN 208406244,PD6961a,12,49427265,-,TGC,I,3.448275862,29,25.66371681,113,7,MLL2,CCDS44873.1,c.11222_11223insGCA,p.Q3745_H3746insQ,UNKNOWN 344328491,PD7370a,12,49427265,-,TGC,I,0,34,16.32653061,49,7,MLL2,CCDS44873.1,c.11222_11223insGCA,p.Q3745_H3746insQ,UNKNOWN 208185926,PD7075a,12,49427286,-,TGC,I,3.448275862,29,20,203,7,MLL2,CCDS44873.1,c.11201_11202insGCA,p.Q3745_H3746insQ,UNKNOWN 58094086,PD6779a,12,49427455,G,A,Sub,0,58,19.61,51,na,MLL2,CCDS44873.1,c.11033C>T,p.A3678V,UNKNOWN 58098777,PD6052a,12,49427521,C,T,Sub,0,33,44.95,109,na,MLL2,CCDS44873.1,c.10967G>A,p.R3656H,UNKNOWN 58073140,PD6985a,12,49427710,C,A,Sub,0,365,10.71,84,na,MLL2,CCDS44873.1,c.10778C>T,p.A3593V,UNKNOWN 58194509,PD6881a,12,49431273,G,A,Sub,0,43,13.51,37,na,MLL2,CCDS44873.1,c.9866C>T,p.P3289L,UNKNOWN 207936135,PD6964a,12,49431549,-,G,I,0,44,27.65957447,47,5,MLL2,CCDS44873.1,c.9589_9590insC,p.S3199fs*16,ONCOGENIC 58141686,PD6989a,12,49431577,T,C,Sub,0,49,47.46,59,na,MLL2,CCDS44873.1,c.9562A>G,p.T3188A,UNKNOWN 58102300,PD6270a,12,49431840,G,C,Sub,0,93,43.24,37,na,MLL2,CCDS44873.1,c.9299C>G,p.P3100R,UNKNOWN 
58094769,PD5718a,12,49431855,C,T,Sub,0,100,15.56,90,na,MLL2,CCDS44873.1,c.9284G>A,p.G3095D,UNKNOWN 58116254,PD7114a,12,49431856,C,T,Sub,0,100,15.63,32,na,MLL2,CCDS44873.1,c.9283G>A,p.G3095S,UNKNOWN 58181426,PD6816a,12,49432230,C,A,Sub,0,33,35.29,34,na,MLL2,CCDS44873.1,c.8909G>T,p.S2970I,UNKNOWN 58112366,PD7040a,12,49433030,G,A,Sub,0,78,23.53,17,na,MLL2,CCDS44873.1,c.8341C>T,p.P2781S,UNKNOWN 58116255,PD7114a,12,49433361,G,A,Sub,0,31,70,20,na,MLL2,CCDS44873.1,c.8086C>T,p.Q2696*,ONCOGENIC 58071768,PD6110a,12,49436030,G,A,Sub,0,13,23.26,43,na,MLL2,CCDS44873.1,c.5951C>T,p.P1984L,UNKNOWN 58079650,PD6098a,12,49437157,G,A,Sub,0,121,15,40,na,MLL2,CCDS44873.1,c.5522C>T,p.A1841V,UNKNOWN 58189326,PD6220a,12,49437523,C,T,Sub,0,69,50.84,297,na,MLL2,CCDS44873.1,c.5362G>A,p.A1788T,UNKNOWN 66808069,PD7370a,12,49438263,G,A,Sub,0,142,17.07,41,na,MLL2,CCDS44873.1,c.5006C>T,p.P1669L,UNKNOWN 208387413,PD6155a,12,49439724,gcacca,-,D,0,97,21.42857143,95,1,MLL2,CCDS44873.1,c.4715_4720delTGGTGC,p.L1572_V1573delLV,UNKNOWN 58134758,PD7020a,12,49442904,G,A,Sub,0,18,12,50,na,MLL2,CCDS44873.1,c.4004C>T,p.S1335F,UNKNOWN 58163049,PD7077a,12,49443607,C,T,Sub,0,270,17.65,85,na,MLL2,CCDS44873.1,c.3764G>A,p.G1255D,UNKNOWN 58112208,PD7010a,12,49443670,C,T,Sub,0,118,20,20,na,MLL2,CCDS44873.1,c.3701G>A,p.G1234E,UNKNOWN 58179244,PD6784a,12,49443862,G,A,Sub,0,26,43.75,64,na,MLL2,CCDS44873.1,c.3509C>T,p.P1170L,UNKNOWN 58116781,PD6929a,12,49443899,C,T,Sub,0,31,52.48,101,na,MLL2,CCDS44873.1,c.3472G>A,p.E1158K,UNKNOWN 58071152,PD6083a,12,49443968,G,A,Sub,0,22,25,20,na,MLL2,CCDS44873.1,c.3403C>T,p.P1135S,UNKNOWN 58090007,PD6940a,12,49444544,T,C,Sub,0,16,45.83,24,na,MLL2,CCDS44873.1,c.2827A>G,p.I943V,UNKNOWN 58160034,PD6998a,12,50027295,G,A,Sub,0,36,47.08,274,na,PRPF40B,CCDS31796.1,c.479G>A,p.R160H,UNKNOWN 58073142,PD6985a,12,50029230,G,T,Sub,0,22,47.83,345,na,PRPF40B,CCDS31796.1,c.1183C>T,p.L395F,UNKNOWN 66887257,PD7382a,12,50029711,C,T,Sub,0,16,49.51,103,na,PRPF40B,CCDS31796.1,c.1295C>T,p.T432M,UNKNOWN 
58088056,PD6805a,12,50030618,C,T,Sub,0,69,10.84,166,na,PRPF40B,CCDS31796.1,c.1480C>T,p.R494W,UNKNOWN 66971233,PD7379a,12,50031265,G,A,Sub,0,88,48.89,45,na,PRPF40B,CCDS31796.1,c.1507G>A,p.E503K,UNKNOWN 58115162,PD6982a,12,50031312,G,A,Sub,0,127,13.89,36,na,PRPF40B,ENST00000395012,c.41G>A,p.S14N,UNKNOWN 58076833,PD6914a,12,50031357,G,A,Sub,0,104,10.2,98,na,PRPF40B,ENST00000395012,c.86G>A,p.W29*,UNKNOWN 66962428,PD9663a,12,50037956,A,G,Sub,0,32,62.12,66,na,PRPF40B,CCDS31796.1,c.2597A>G,p.Q866R,UNKNOWN 58101367,PD6133a,12,111884645,C,A,Sub,0,40,10.58,397,na,SH2B3,CCDS9153.1,c.821C>A,p.T274N,UNKNOWN 66816581,PD7375a,12,111884779,G,T,Sub,0,83,7.09,141,na,SH2B3,CCDS9153.1,c.868G>T,p.G290*,ONCOGENIC 58162881,PD6950a,12,111884785,G,C,Sub,0,75,46.53,389,na,SH2B3,CCDS9153.1,c.874G>C,p.E292Q,UNKNOWN 58068029,PD5744a,12,111884819,C,A,Sub,0,49,46.78,342,na,SH2B3,CCDS9153.1,c.908C>A,p.S303*,ONCOGENIC 58071765,PD6110a,12,111885286,C,T,Sub,0,47,20.59,170,na,SH2B3,CCDS9153.1,c.1174C>T,p.R392W,UNKNOWN 58165825,PD6496a,12,111885286,C,T,Sub,0,47,48.28,87,na,SH2B3,CCDS9153.1,c.1174C>T,p.R392W,UNKNOWN 58130520,PD6314a,12,111885490,C,T,Sub,0,55,27.66,47,na,SH2B3,CCDS9153.1,c.1267C>T,p.Q423*,ONCOGENIC 58183883,PD6897a,12,111885942,C,A,Sub,0,45,50,18,na,SH2B3,CCDS9153.1,c.1564C>A,p.P522T,UNKNOWN 58079386,PD6822a,12,111885967,C,T,Sub,0,47,19.61,51,na,SH2B3,CCDS9153.1,c.1589C>T,p.P530L,UNKNOWN 58082851,PD6335a,12,112888189,G,A,Sub,0,164,39,500,na,PTPN11,CCDS9163.1,c.205G>A,p.E69K,ONCOGENIC 66846759,PD7374a,12,112888190,A,T,Sub,0,161,19.27,192,na,PTPN11,CCDS9163.1,c.206A>T,p.E69V,ONCOGENIC 58197732,PD6788a,12,112888198,G,A,Sub,0,169,19.88,322,na,PTPN11,CCDS9163.1,c.214G>A,p.A72T,ONCOGENIC 58144591,PD6884a,12,112888199,C,T,Sub,0,165,50.16,317,na,PTPN11,CCDS9163.1,c.215C>T,p.A72V,ONCOGENIC 58076553,PD6789a,12,112915523,A,G,Sub,0,144,19.29,420,na,PTPN11,CCDS9163.1,c.922A>G,p.N308D,ONCOGENIC 58092634,PD7016a,12,112924384,C,T,Sub,0,75,11.63,86,na,PTPN11,CCDS9163.1,c.1330C>T,p.H444Y,POSSIBLE 
ONCOGENIC 58197733,PD6788a,12,112926885,C,T,Sub,0,84,15.2,500,na,PTPN11,CCDS9163.1,c.1505C>T,p.S502L,ONCOGENIC 58204070,PD6280a,12,112926887,G,C,Sub,0,82,10.8,500,na,PTPN11,CCDS9163.1,c.1507G>C,p.G503R,ONCOGENIC 58112363,PD7040a,12,121877734,G,A,Sub,0,66,12.86,70,na,KDM2B,CCDS41850.1,c.3755C>T,p.T1252I,UNKNOWN 58073137,PD6985a,12,121877763,C,A,Sub,0,49,16.33,49,na,KDM2B,CCDS41850.1,c.3726G>T,p.K1242N,UNKNOWN 58148569,PD6143a,12,121878765,A,G,Sub,0,11,29.41,17,na,KDM2B,CCDS41850.1,c.3464T>C,p.V1155A,UNKNOWN 58163047,PD7077a,12,121880063,C,T,Sub,0,14,12.5,48,na,KDM2B,CCDS41850.1,c.3181G>A,p.A1061T,UNKNOWN 66831586,PD7372a,12,121880330,G,A,Sub,0,19,17.46,63,na,KDM2B,CCDS41850.1,c.2914C>T,p.P972S,UNKNOWN 209690509,PD6786a,12,121881577,-,G,I,0,23,12.5,48,5,KDM2B,CCDS41850.1,c.2470_2471insC,p.G826fs*38,UNKNOWN 58138302,PD6823a,12,121881910,G,T,Sub,0,93,42.86,42,na,KDM2B,CCDS41850.1,c.2356C>A,p.P786T,UNKNOWN 58101714,PD6186a,12,121947503,T,C,Sub,0,60,27.27,11,na,KDM2B,CCDS41850.1,c.1514A>G,p.K505R,UNKNOWN 58088472,PD6920a,12,121986856,C,A,Sub,0,31,58.25,103,na,KDM2B,CCDS41850.1,c.609G>T,p.W203C,UNKNOWN 66816582,PD7375a,12,122012472,C,T,Sub,0,66,33.33,24,na,KDM2B,CCDS41850.1,c.377G>A,p.R126Q,UNKNOWN 58079383,PD6822a,12,122012494,G,T,Sub,0,66,44.68,47,na,KDM2B,CCDS41850.1,c.355C>A,p.P119T,UNKNOWN 58126313,PD7095a,12,122016734,C,A,Sub,0,14,13.33,30,na,KDM2B,CCDS41850.1,c.244G>T,p.D82Y,UNKNOWN 58183882,PD6897a,12,122016839,G,A,Sub,0,31,17.24,29,na,KDM2B,CCDS41850.1,c.139C>T,p.R47C,UNKNOWN 208228411,PD7077a,12,122017957,t,-,D,0,41,12.19512195,41,7,KDM2B,CCDS41849.1,c.1delA,p.?,UNKNOWN 346913169,PD8730a,12,122018733,tg,-,D,0,56,27.02702703,37,1,KDM2B,CCDS41850.1,c.83_84delCA,p.T28fs*8,UNKNOWN 58138483,PD6918a,12,122018734,G,T,Sub,0,53,10.87,92,na,KDM2B,CCDS41850.1,c.83C>A,p.T28K,UNKNOWN 58162094,PD7026a,13,28589321,T,C,Sub,0,82,49.45,451,na,FLT3,CCDS31953.1,c.2726A>G,p.D909G,UNKNOWN 
58181422,PD6816a,13,28592705,C,A,Sub,0.53,190,11.8,500,na,FLT3,CCDS31953.1,c.2440G>T,p.A814S,UNKNOWN 58185578,PD6183a,13,28599077,C,G,Sub,0,118,39.79,191,na,FLT3,CCDS31953.1,c.2211G>C,p.M737I,ONCOGENIC 58191791,PD6965a,13,28608282,C,T,Sub,0,190,23,500,na,FLT3,CCDS31953.1,c.1774G>A,p.V592I,ONCOGENIC 58147528,PD6057a,13,28609712,G,A,Sub,0,174,18,500,na,FLT3,CCDS31953.1,c.1517C>T,p.A506V,UNKNOWN 66808077,PD7370a,13,28610151,C,T,Sub,0,33,4.46,426,na,FLT3,CCDS31953.1,c.1339G>A,p.A447T,UNKNOWN 66791767,PD7364a,13,28610180,C,T,Sub,0,22,5.23,344,na,FLT3,CCDS31953.1,c.1310G>A,p.R437K,UNKNOWN 58107867,PD6215a,13,28623783,C,G,Sub,0,239,35,500,na,FLT3,CCDS31953.1,c.871G>C,p.A291P,UNKNOWN 66887260,PD7382a,13,28623919,A,G,Sub,0,139,45.2,323,na,FLT3,CCDS31953.1,c.743-8T>C,p.?,UNKNOWN 58193062,PD6245a,13,28624288,A,T,Sub,0,185,47.27,440,na,FLT3,CCDS31953.1,c.686T>A,p.I229K,UNKNOWN 58096975,PD6939a,13,28626797,T,C,Sub,0,144,52,500,na,FLT3,CCDS31953.1,c.499A>G,p.T167A,POSSIBLE ONCOGENIC 58157328,PD7009a,13,28626797,T,C,Sub,0,144,46.65,493,na,FLT3,CCDS31953.1,c.499A>G,p.T167A,POSSIBLE ONCOGENIC 66837978,PD8739a,13,28636086,C,T,Sub,0,208,46.25,80,na,FLT3,CCDS31953.1,c.286G>A,p.D96N,UNKNOWN 58091885,PD6782a,13,28636157,G,A,Sub,0,212,10.58,104,na,FLT3,CCDS31953.1,c.215C>T,p.S72L,UNKNOWN 58115001,PD6982a,13,28636176,C,T,Sub,0,203,44.87,78,na,FLT3,CCDS31953.1,c.196G>A,p.A66T,UNKNOWN 58094078,PD6779a,13,41515294,C,A,Sub,0,222,42.03,345,na,ELF1,CCDS9374.1,c.1019G>T,p.G340V,UNKNOWN 208503528,PD6850a,13,41515416,ta,-,D,0,142,13.94849785,465,5,ELF1,CCDS9374.1,c.896_897delTA,p.I299fs*2,UNKNOWN 58197351,PD6979a,13,41517167,A,C,Sub,0,92,11.07,262,na,ELF1,CCDS9374.1,c.727T>G,p.S243A,UNKNOWN 58128160,PD6284a,13,41517203,T,G,Sub,0,80,40.84,262,na,ELF1,CCDS9374.1,c.691A>C,p.T231P,UNKNOWN 208506061,PD6855a,13,41517988,tttg,-,D,0,108,12.5,382,1,ELF1,CCDS9374.1,c.600_603delCAAA,p.N200fs*59,UNKNOWN 58177590,PD7025a,13,48916769,G,C,Sub,0,255,49.56,343,na,RB1,CCDS31973.1,c.299G>C,p.G100A,UNKNOWN 
66878256,PD8934a,13,48937017,G,A,Sub,0,205,13.47,193,na,RB1,CCDS31973.1,c.785G>A,p.R262Q,UNKNOWN 58094336,PD6853a,13,49030384,C,T,Sub,0,110,51.2,500,na,RB1,CCDS31973.1,c.1859C>T,p.T620M,UNKNOWN 68195776,PD8731a,13,49033866,G,A,Sub,0,139,38,500,na,RB1,CCDS31973.1,c.2003G>A,p.R668H,UNKNOWN 58140552,PD6538a,13,49039371,C,T,Sub,0,261,44.8,500,na,RB1,CCDS31973.1,c.2356C>T,p.P786S,UNKNOWN 58184282,PD5776a,13,49039371,C,T,Sub,0,261,48.8,500,na,RB1,CCDS31973.1,c.2356C>T,p.P786S,UNKNOWN 58162099,PD7026a,13,49039375,G,A,Sub,0,255,45.62,445,na,RB1,CCDS31973.1,c.2360G>A,p.R787Q,UNKNOWN 58080002,PD6843a,14,93392966,T,C,Sub,0,67,50,46,na,CHGA,CCDS9906.1,c.110T>C,p.V37A,UNKNOWN 58070846,PD7092a,14,93392990,C,T,Sub,0,90,26.47,34,na,CHGA,CCDS9906.1,c.134C>T,p.S45F,UNKNOWN 58130560,PD6482a,14,93397847,G,C,Sub,0,12,48,25,na,CHGA,CCDS9906.1,c.608G>C,p.S203T,UNKNOWN 58160929,PD6806a,15,28096554,G,A,Sub,0,21,10.81,74,na,OCA2,CCDS10020.1,c.2312C>T,p.A771V,UNKNOWN 58151155,PD6529a,15,28096587,T,A,Sub,0,16,39.66,116,na,OCA2,CCDS10020.1,c.2279A>T,p.E760V,UNKNOWN 58179825,PD6780a,15,28171353,T,A,Sub,0,76,47.6,500,na,OCA2,CCDS10020.1,c.1999A>T,p.I667F,UNKNOWN 58126583,PD5745a,15,28202755,C,T,Sub,0,22,50.91,55,na,OCA2,CCDS10020.1,c.1763G>A,p.R588Q,UNKNOWN 58133751,PD6163a,15,28202804,G,T,Sub,0,27,43.08,65,na,OCA2,CCDS10020.1,c.1714C>A,p.R572S,UNKNOWN 58199240,PD6230a,15,28228511,T,C,Sub,0,74,49.2,376,na,OCA2,CCDS10020.1,c.1483A>G,p.N495D,UNKNOWN 58196949,PD5728a,15,28259930,T,A,Sub,0,57,48.04,102,na,OCA2,CCDS10020.1,c.1036A>T,p.I346L,UNKNOWN 58097272,PD7108a,15,28260070,G,A,Sub,0,57,19.64,168,na,OCA2,CCDS10020.1,c.896C>T,p.T299M,UNKNOWN 66876140,PD8735a,15,28261321,G,T,Sub,0,13,14.23,253,na,OCA2,CCDS10020.1,c.819C>A,p.N273K,UNKNOWN 58194147,PD6826a,15,28267662,G,C,Sub,0,77,38.05,452,na,OCA2,CCDS10020.1,c.631C>G,p.P211A,UNKNOWN 58084255,PD7086a,15,28269995,C,T,Sub,0,24,37.89,256,na,OCA2,CCDS10020.1,c.569G>A,p.C190Y,UNKNOWN 
58104881,PD6932a,15,67835739,G,C,Sub,0,89,50.85,352,na,MAP2K5,CCDS10224.1,c.66G>C,p.K22N,UNKNOWN 58165196,PD6176a,15,67842387,G,C,Sub,0,200,29.7,404,na,MAP2K5,CCDS10224.1,c.151G>C,p.V51L,UNKNOWN 208218376,PD6972a,15,67885267,aaa,-,D,0,35,45.79945799,334,3,MAP2K5,CCDS10224.1,c.435_437delAAA,p.K147delK,UNKNOWN 58118359,PD6081a,15,67938581,G,A,Sub,0,96,20.19,104,na,MAP2K5,CCDS10224.1,c.598G>A,p.D200N,UNKNOWN 58204831,PD6844a,15,67985892,A,C,Sub,0,37,41.6,500,na,MAP2K5,CCDS10224.1,c.958A>C,p.N320H,UNKNOWN 208569190,PD6234a,15,68020271,tc,-,D,0,363,37.5,342,1,MAP2K5,CCDS10224.1,c.1062_1063delTC,p.P355fs*22,UNKNOWN 58082488,PD6977a,15,90627503,G,A,Sub,0,39,11.85,211,na,IDH2,CCDS10359.1,c.1354C>T,p.Q452*,UNKNOWN 66971246,PD7379a,15,90631593,C,T,Sub,0,29,42.31,26,na,IDH2,CCDS10359.1,c.676G>A,p.E226K,UNKNOWN 58128166,PD6284a,15,90631838,C,T,Sub,0,57,33.82,68,na,IDH2,CCDS10359.1,c.515G>A,p.R172K,ONCOGENIC 58157683,PD6271a,15,90631838,C,T,Sub,0,57,30.65,62,na,IDH2,CCDS10359.1,c.515G>A,p.R172K,ONCOGENIC 67016672,PD7386a,15,90631838,C,T,Sub,0,57,46.67,15,na,IDH2,CCDS10359.1,c.515G>A,p.R172K,ONCOGENIC 58178076,PD6792a,15,90631878,G,C,Sub,0,70,66.67,51,na,IDH2,CCDS10359.1,c.475C>G,p.R159G,UNKNOWN 343679425,PD7377a,15,90631924,-,C,I,0,72,13.88888889,36,7,IDH2,CCDS10359.1,c.428_429insG,p.T146fs*126,UNKNOWN 58081230,PD5768a,15,90631934,C,T,Sub,0,67,32.76,116,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58085465,PD6095a,15,90631934,C,T,Sub,0,67,28.57,70,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58089994,PD6940a,15,90631934,C,T,Sub,0,67,24.78,113,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58091715,PD6506a,15,90631934,C,T,Sub,0,67,43.24,111,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58092777,PD7028a,15,90631934,C,T,Sub,0,67,20.83,96,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58094353,PD6853a,15,90631934,C,T,Sub,0,67,35.21,71,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58098241,PD6256a,15,90631934,C,T,Sub,0,67,50,88,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 
58105958,PD6182a,15,90631934,C,T,Sub,0,67,48.21,56,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58111602,PD6968a,15,90631934,C,T,Sub,0,67,20,120,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58113056,PD6191a,15,90631934,C,T,Sub,0,67,49.02,51,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58116766,PD6929a,15,90631934,C,T,Sub,0,67,22.31,121,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58133908,PD6058a,15,90631934,C,T,Sub,0,67,40.22,92,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58142440,PD5754a,15,90631934,C,T,Sub,0,67,37.98,129,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58152780,PD5749a,15,90631934,C,T,Sub,0,67,47.42,97,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58154956,PD6491a,15,90631934,C,T,Sub,0,67,31.97,122,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58160027,PD6998a,15,90631934,C,T,Sub,0,67,38.83,103,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58162641,PD6904a,15,90631934,C,T,Sub,0,67,24.19,124,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58167207,PD6797a,15,90631934,C,T,Sub,0,67,59.65,57,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58172965,PD6233a,15,90631934,C,T,Sub,0,67,42.11,114,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58180697,PD6474a,15,90631934,C,T,Sub,0,67,38.94,113,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58194137,PD6826a,15,90631934,C,T,Sub,0,67,35.59,59,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 66831589,PD7372a,15,90631934,C,T,Sub,0,67,21.57,51,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 68092480,PD8729a,15,90631934,C,T,Sub,0,67,21.43,28,na,IDH2,CCDS10359.1,c.419G>A,p.R140Q,ONCOGENIC 58071020,PD6083a,16,3778138,G,A,Sub,0,54,22.5,40,na,CREBBP,CCDS10509.1,c.6910C>T,p.Q2304*,ONCOGENIC 58194716,PD6924a,16,3778225,C,T,Sub,0,22,51.28,39,na,CREBBP,CCDS10509.1,c.6823G>A,p.G2275R,UNKNOWN 58086243,PD7017a,16,3778684,G,A,Sub,0,19,11.36,44,na,CREBBP,CCDS10509.1,c.6364C>T,p.Q2122*,ONCOGENIC 58130717,PD6824a,16,3779427,G,A,Sub,0,10,32.31,65,na,CREBBP,CCDS10509.1,c.5621C>T,p.T1874I,UNKNOWN 
58119112,PD6080a,16,3779475,C,T,Sub,0,21,18.97,58,na,CREBBP,CCDS10509.1,c.5573G>A,p.R1858H,UNKNOWN 58182700,PD6171a,16,3779503,T,A,Sub,0,23,40,35,na,CREBBP,CCDS10509.1,c.5545A>T,p.K1849*,ONCOGENIC 58070638,PD6486a,16,3779814,C,T,Sub,0,69,36.36,77,na,CREBBP,CCDS10509.1,c.5234G>A,p.W1745*,ONCOGENIC 66878131,PD8934a,16,3786125,T,C,Sub,0,136,70.83,24,na,CREBBP,CCDS10509.1,c.4640A>G,p.N1547S,UNKNOWN 58075560,PD6063a,16,3799635,G,C,Sub,0,87,46.08,102,na,CREBBP,CCDS10509.1,c.3829C>G,p.P1277A,UNKNOWN 58188983,PD6116a,16,3807935,T,C,Sub,0,157,48.64,368,na,CREBBP,CCDS10509.1,c.3484A>G,p.N1162D,UNKNOWN 58079447,PD6098a,16,3808914,G,A,Sub,0,120,18.24,159,na,CREBBP,CCDS10509.1,c.3310C>T,p.Q1104*,ONCOGENIC 58155255,PD6165a,16,3831287,G,T,Sub,0,12,45.69,232,na,CREBBP,CCDS10509.1,c.1594C>A,p.P532T,UNKNOWN 58132412,PD7103a,16,3860711,C,T,Sub,0,142,48.2,500,na,CREBBP,CCDS10509.1,c.868G>A,p.A290T,UNKNOWN 66976561,PD8939a,16,3900611,G,A,Sub,0,91,57.02,121,na,CREBBP,CCDS10509.1,c.485C>T,p.A162V,UNKNOWN 58179667,PD6780a,16,3900768,C,A,Sub,0,61,17.24,29,na,CREBBP,CCDS10509.1,c.328G>T,p.A110S,UNKNOWN 58108854,PD6051a,16,68835678,G,A,Sub,0,66,49.06,371,na,CDH1,CCDS10869.1,c.269G>A,p.R90Q,UNKNOWN 58083893,PD7027a,16,68847261,A,G,Sub,0,493,52.26,243,na,CDH1,CCDS10869.1,c.1183A>G,p.T395A,UNKNOWN 208109940,PD6782a,16,68847285,-,C,I,0,429,11.9266055,109,5,CDH1,CCDS10869.1,c.1207_1208insC,p.N405fs*14,UNKNOWN 58105022,PD6283a,17,5405145,T,C,Sub,0,18,58.05,236,na,ENSG00000091592,CCDS32537.1,c.4117A>G,p.R1373G,UNKNOWN 58167908,PD6097a,17,5418277,C,A,Sub,0,81,50,390,na,ENSG00000091592,CCDS42246.1,c.4219G>T,p.V1407L,UNKNOWN 68092700,PD8728a,17,5424929,G,T,Sub,0,91,50,38,na,ENSG00000091592,CCDS42246.1,c.3698C>A,p.S1233Y,UNKNOWN 58067986,PD5744a,17,5434014,G,A,Sub,0,20,47.86,117,na,ENSG00000091592,CCDS42246.1,c.3307C>T,p.P1103S,UNKNOWN 58138065,PD6803a,17,5440205,G,A,Sub,0,21,10.71,84,na,ENSG00000091592,CCDS42246.1,c.2926C>T,p.Q976*,UNKNOWN 
58086287,PD7017a,17,5445279,G,A,Sub,0,22,15.11,483,na,ENSG00000091592,CCDS42246.1,c.2597C>T,p.T866I,UNKNOWN 58077811,PD6521a,17,5461675,G,T,Sub,0,97,46.78,171,na,ENSG00000091592,CCDS42246.1,c.2341C>A,p.P781T,UNKNOWN 58077282,PD6937a,17,5461680,C,G,Sub,0,94,49.04,261,na,ENSG00000091592,CCDS42246.1,c.2336G>C,p.W779S,UNKNOWN 58116471,PD6312a,17,5461980,C,T,Sub,0,254,49.2,498,na,ENSG00000091592,CCDS42246.1,c.2036G>A,p.S679N,UNKNOWN 66937290,PD9711a,17,5462113,A,G,Sub,0,155,48.9,409,na,ENSG00000091592,CCDS42246.1,c.1903T>C,p.Y635H,UNKNOWN 58172664,PD6155a,17,5462276,C,G,Sub,0,113,24.48,286,na,ENSG00000091592,CCDS42246.1,c.1740G>C,p.W580C,UNKNOWN 66878151,PD8934a,17,5462329,G,A,Sub,0,156,12.46,297,na,ENSG00000091592,CCDS42246.1,c.1687C>T,p.P563S,UNKNOWN 58128421,PD6956a,17,5463093,A,T,Sub,0,46,47.78,473,na,ENSG00000091592,CCDS42246.1,c.923G>A,p.R308Q,UNKNOWN 58079468,PD6098a,17,5485304,G,A,Sub,0,17,50,24,na,ENSG00000091592,CCDS42246.1,c.527C>T,p.A176V,UNKNOWN 58180672,PD6474a,17,7574030,G,A,Sub,0,18,46.42,293,na,TP53,CCDS11118.1,c.997C>T,p.R333C,ONCOGENIC 58192850,PD6293a,17,7576626,T,G,Sub,0,113,51.63,459,na,TP53,CCDS45606.1,c.1025A>C,p.*342S,ONCOGENIC 66971193,PD7379a,17,7576857,A,G,Sub,1.47,68,56.64,226,na,TP53,CCDS11118.1,c.989T>C,p.L330P,ONCOGENIC 58171255,PD6862a,17,7577097,C,A,Sub,0,93,20.86,441,na,TP53,CCDS11118.1,c.841G>T,p.D281Y,ONCOGENIC 58165598,PD7116a,17,7577105,G,C,Sub,0,80,17.28,272,na,TP53,CCDS11118.1,c.833C>G,p.P278R,ONCOGENIC 58117976,PD7038a,17,7577114,C,T,Sub,0,75,15.9,283,na,TP53,CCDS11118.1,c.824G>A,p.C275Y,ONCOGENIC 58075765,PD6289a,17,7577121,G,A,Sub,0,72,32.29,384,na,TP53,CCDS11118.1,c.817C>T,p.R273C,ONCOGENIC 58150688,PD5755a,17,7577121,G,A,Sub,0,72,15.31,320,na,TP53,CCDS11118.1,c.817C>T,p.R273C,ONCOGENIC 58120238,PD6981a,17,7577127,C,T,Sub,0,65,39.51,286,na,TP53,CCDS11118.1,c.811G>A,p.E271K,ONCOGENIC 58118287,PD6081a,17,7577507,T,A,Sub,0,76,16.91,207,na,TP53,CCDS11118.1,c.774A>T,p.E258D,ONCOGENIC 
58137486,PD6050a,17,7577507,T,A,Sub,0,76,45.71,70,na,TP53,CCDS11118.1,c.774A>T,p.E258D,ONCOGENIC 209553887,PD6506a,17,7577522,g,-,D,0,95,35.75268817,370,2,TP53,CCDS11118.1,c.759delC,p.I254fs*91,ONCOGENIC 207948361,PD6862a,17,7577528,-,A,I,0,104,30.08849558,450,1,TP53,CCDS11118.1,c.752_753insT,p.L252fs*12,ONCOGENIC 58120433,PD5740a,17,7577539,G,A,Sub,0,106,44.94,445,na,TP53,CCDS11118.1,c.742C>T,p.R248W,ONCOGENIC 58149476,PD6818a,17,7577539,G,A,Sub,0,106,17.3,445,na,TP53,CCDS11118.1,c.742C>T,p.R248W,ONCOGENIC 58136467,PD6246a,17,7577568,C,T,Sub,0,99,36.29,394,na,TP53,CCDS11118.1,c.713G>A,p.C238Y,ONCOGENIC 208051441,PD6192a,17,7577594,ac,-,D,0,77,4.887218045,265,1,TP53,CCDS11118.1,c.686_687delGT,p.C229fs*10,ONCOGENIC 58117738,PD6522a,17,7578190,T,C,Sub,0,65,18.4,500,na,TP53,CCDS11118.1,c.659A>G,p.Y220C,ONCOGENIC 58132468,PD7103a,17,7578190,T,C,Sub,0,65,33.2,500,na,TP53,CCDS11118.1,c.659A>G,p.Y220C,ONCOGENIC 58107763,PD6215a,17,7578203,C,T,Sub,0,65,39.2,500,na,TP53,CCDS11118.1,c.646G>A,p.V216M,ONCOGENIC 58195771,PD6907a,17,7578203,C,T,Sub,0,65,33.2,500,na,TP53,CCDS11118.1,c.646G>A,p.V216M,ONCOGENIC 58172754,PD6831a,17,7578208,T,C,Sub,0,64,12.97,424,na,TP53,CCDS11118.1,c.641A>G,p.H214R,ONCOGENIC 58134458,PD6906a,17,7578234,A,T,Sub,0,72,39.66,464,na,TP53,CCDS11118.1,c.615T>A,p.Y205*,ONCOGENIC 58107764,PD6215a,17,7578259,A,T,Sub,0,66,41,500,na,TP53,CCDS11118.1,c.590T>A,p.V197E,ONCOGENIC 58182296,PD6502a,17,7578263,G,A,Sub,0,68,82.13,319,na,TP53,CCDS11118.1,c.586C>T,p.R196*,ONCOGENIC 208561693,PD6305a,17,7578267,A,T,DI,0,71,42.62589928,453,0,TP53,-,-,-,ONCOGENIC 58132469,PD7103a,17,7578268,A,C,Sub,0,64,37.8,500,na,TP53,CCDS11118.1,c.581T>G,p.L194R,ONCOGENIC 58120724,PD7120a,17,7578388,C,T,Sub,0,52,47.47,217,na,TP53,CCDS11118.1,c.542G>A,p.R181H,ONCOGENIC 58095771,PD6187a,17,7578395,G,C,Sub,0,58,14.44,180,na,TP53,CCDS11118.1,c.535C>G,p.H179D,ONCOGENIC 58200513,PD6235a,17,7578440,T,C,Sub,0,46,61.82,220,na,TP53,CCDS11118.1,c.490A>G,p.K164E,ONCOGENIC 
208353759,PD6187a,17,7578474,-,G,I,0,25,11.33004926,203,5,TP53,CCDS11118.1,c.455_456insC,p.P153fs*28,ONCOGENIC 58067592,PD6285a,17,7578542,G,C,Sub,0,13,86.36,44,na,TP53,CCDS11118.1,c.388C>G,p.L130V,ONCOGENIC 345163700,PD7380a,17,7579330,g,-,D,0,45,35.13513514,37,2,TP53,CCDS11118.1,c.357delC,p.K120fs*3,ONCOGENIC 58086292,PD7017a,17,7579334,G,A,Sub,0,39,20.73,82,na,TP53,CCDS11118.1,c.353C>T,p.T118I,ONCOGENIC 58086293,PD7017a,17,7579365,C,T,Sub,0,29,40.91,110,na,TP53,CCDS11118.1,c.322G>A,p.G108S,ONCOGENIC 58092419,PD7016a,17,7579365,C,T,Sub,0,29,33.86,127,na,TP53,CCDS11118.1,c.322G>A,p.G108S,ONCOGENIC 58112002,PD7010a,17,7579365,C,T,Sub,0,29,10.2,98,na,TP53,CCDS11118.1,c.322G>A,p.G108S,ONCOGENIC 58112247,PD7040a,17,7579373,C,A,Sub,0,25,11.31,168,na,TP53,CCDS11118.1,c.314G>T,p.G105V,ONCOGENIC 58160891,PD6806a,17,7579545,C,T,Sub,0,29,16.5,394,na,TP53,CCDS11118.1,c.142G>A,p.D48N,ONCOGENIC 66878154,PD8934a,17,7579550,G,A,Sub,0,30,11.9,126,na,TP53,CCDS11118.1,c.137C>T,p.S46F,ONCOGENIC 58194379,PD6881a,17,27233295,C,T,Sub,0,71,24,50,na,PHF12,CCDS32598.1,c.2921G>A,p.G974D,UNKNOWN 58194380,PD6881a,17,27233328,C,T,Sub,0,75,15.25,59,na,PHF12,CCDS32598.1,c.2888G>A,p.S963N,UNKNOWN 66858174,PD8935a,17,27233400,G,C,Sub,0,97,51.61,31,na,PHF12,CCDS32598.1,c.2816C>G,p.A939G,UNKNOWN 58167670,PD6875a,17,27233530,G,A,Sub,2.44,41,33.33,18,na,PHF12,CCDS32598.1,c.2686C>T,p.R896C,UNKNOWN 356395590,PD7386a,17,27233908,-,G,I,0,246,32.03125,128,7,PHF12,CCDS32598.1,c.2645_2646insC,p.S883fs*25,UNKNOWN 66858175,PD8935a,17,27235899,A,C,Sub,0,25,65.89,129,na,PHF12,CCDS32598.1,c.2360T>G,p.V787G,UNKNOWN 58108287,PD6900a,17,27239923,G,T,Sub,0,53,51.76,170,na,PHF12,CCDS32598.1,c.1666C>A,p.P556T,UNKNOWN 58154657,PD6808a,17,27240018,G,C,Sub,0,50,45.33,150,na,PHF12,CCDS32598.1,c.1571C>G,p.S524C,UNKNOWN 68092705,PD8728a,17,27244452,T,C,Sub,0,116,11.11,90,na,PHF12,CCDS32598.1,c.985A>G,p.M329V,UNKNOWN 58071213,PD6122a,17,27251074,C,T,Sub,0,28,49.86,347,na,PHF12,CCDS32598.1,c.568G>A,p.V190M,UNKNOWN 
58072632,PD7119a,17,27254017,G,A,Sub,0,113,47,500,na,PHF12,CCDS32598.1,c.313C>T,p.R105C,UNKNOWN 58112006,PD7010a,17,27254074,G,A,Sub,0,145,12.2,500,na,PHF12,CCDS32598.1,c.256C>T,p.P86S,UNKNOWN 58174687,PD6145a,17,29509643,A,G,Sub,0,96,46.4,444,na,NF1,CCDS42292.1,c.848A>G,p.D283G,UNKNOWN 58113775,PD6810a,17,29527456,G,A,Sub,0,245,52.51,438,na,NF1,CCDS42292.1,c.905G>A,p.S302N,UNKNOWN 58194386,PD6881a,17,29533267,G,A,Sub,0,367,11.42,254,na,NF1,CCDS42292.1,c.1270G>A,p.D424N,UNKNOWN 67016510,PD7386a,17,29533276,C,T,Sub,0,361,15.91,176,na,NF1,CCDS42292.1,c.1279C>T,p.P427S,UNKNOWN 58109778,PD6139a,17,29553483,C,A,Sub,0,134,47.91,407,na,NF1,CCDS42292.1,c.2032C>A,p.P678T,UNKNOWN 58112011,PD7010a,17,29554277,C,T,Sub,0,39,21.8,500,na,NF1,CCDS42292.1,c.2293C>T,p.R765C,UNKNOWN 58098296,PD6543a,17,29556166,T,C,Sub,0,49,22.4,500,na,NF1,CCDS42292.1,c.2533T>C,p.C845R,UNKNOWN 58205232,PD6259a,17,29556380,A,G,Sub,0,221,47.4,500,na,NF1,CCDS42292.1,c.2747A>G,p.N916S,UNKNOWN 58072021,PD6477a,17,29556431,T,C,Sub,0,213,45.8,500,na,NF1,CCDS42292.1,c.2798T>C,p.L933P,UNKNOWN 58095440,PD6100a,17,29556898,G,A,Sub,0,62,11.04,163,na,NF1,CCDS42292.1,c.2896G>A,p.A966T,UNKNOWN 58138675,PD6479a,17,29556920,A,C,Sub,0,57,48.6,500,na,NF1,CCDS42292.1,c.2918A>C,p.D973A,UNKNOWN 58089969,PD6940a,17,29562747,G,T,Sub,0,419,18,500,na,NF1,CCDS42292.1,c.3827G>T,p.R1276L,UNKNOWN 67016514,PD7386a,17,29576111,C,T,Sub,0,228,16.2,284,na,NF1,CCDS42292.1,c.4084C>T,p.R1362*,ONCOGENIC 58087430,PD6282a,17,29585520,G,C,Sub,0,102,14.03,335,na,NF1,CCDS42292.1,c.4332G>C,p.K1444N,ONCOGENIC 58109614,PD6119a,17,29586069,A,G,Sub,0,22,78.76,226,na,NF1,CCDS42292.1,c.4352A>G,p.N1451S,UNKNOWN 58114524,PD6287a,17,29586110,A,T,Sub,0,31,50.37,268,na,NF1,CCDS42292.1,c.4393A>T,p.N1465Y,UNKNOWN 58084311,PD6302a,17,29587505,A,G,Sub,0,282,20.59,340,na,NF1,CCDS42292.1,c.4549A>G,p.K1517E,UNKNOWN 58171590,PD5765a,17,29588751,C,T,Sub,0,186,12.8,500,na,NF1,CCDS42292.1,c.4600C>T,p.R1534*,ONCOGENIC 
66858182,PD8935a,17,29592326,G,A,Sub,0,195,12.46,297,na,NF1,CCDS42292.1,c.4804G>A,p.G1602R,UNKNOWN 58192691,PD6922a,17,29652882,T,A,Sub,0,225,13.7,146,na,NF1,CCDS42292.1,c.4880T>A,p.V1627D,UNKNOWN 58161909,PD6917a,17,29652964,T,A,Sub,0,230,53.6,500,na,NF1,CCDS42292.1,c.4962T>A,p.F1654L,UNKNOWN 68092713,PD8728a,17,29654658,G,A,Sub,0,173,25.8,500,na,NF1,CCDS42292.1,c.5410G>A,p.A1804T,UNKNOWN 58192692,PD6922a,17,29654856,C,T,Sub,0,100,16.42,201,na,NF1,CCDS42292.1,c.5608C>T,p.R1870W,UNKNOWN 66971205,PD7379a,17,29657472,C,T,Sub,0,149,10.91,165,na,NF1,CCDS42292.1,c.5768C>T,p.T1923M,UNKNOWN 58079292,PD6822a,17,29663795,C,T,Sub,0,128,11.47,497,na,NF1,CCDS42292.1,c.6290C>T,p.A2097V,UNKNOWN 58138074,PD6803a,17,29665083,C,T,Sub,0,500,16.77,167,na,NF1,CCDS42292.1,c.6745C>T,p.L2249F,UNKNOWN 58086694,PD6091a,17,29676179,A,T,Sub,0,34,44.76,315,na,NF1,CCDS42292.1,c.7231A>T,p.R2411*,ONCOGENIC 66971206,PD7379a,17,29676252,G,A,Sub,0,47,16.67,120,na,NF1,CCDS42292.1,c.7304G>A,p.S2435N,UNKNOWN 66786045,PD7384a,17,29683480,A,G,Sub,0,52,50.65,231,na,NF1,CCDS42292.1,c.7618A>G,p.T2540A,UNKNOWN 58118284,PD6081a,17,29701152,T,C,Sub,0,145,36.09,338,na,NF1,ENST00000444181,c.1873+5T>C,p.?,ONCOGENIC 207909953,PD6925a,17,74732936,ggcgg<14>cgggg,-,D,0,21,23.63636364,47,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 207987665,PD6258a,17,74732936,ggcgg<14>cgggg,-,D,0,21,41.17647059,37,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208018506,PD6491a,17,74732936,ggcgg<14>cgggg,-,D,0,21,24.07407407,46,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208105722,PD6962a,17,74732936,ggcgg<14>cgggg,-,D,0,21,42.85714286,19,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208166957,PD5773a,17,74732936,ggcgg<14>cgggg,-,D,0,21,30.88235294,55,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 
208200810,PD6963a,17,74732936,ggcgg<14>cgggg,-,D,0,21,38.0952381,34,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208409079,PD6987a,17,74732936,ggcgg<14>cgggg,-,D,0,21,44.18604651,29,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208468449,PD7009a,17,74732936,ggcgg<14>cgggg,-,D,0,21,21.42857143,38,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208492490,PD6106a,17,74732936,ggcgg<14>cgggg,-,D,0,21,41.93548387,27,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 208550052,PD6099a,17,74732936,ggcgg<14>cgggg,-,D,0,21,33.33333333,38,0,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 344722904,PD8939a,17,74732936,ggcgg<14>cgggg,-,D,0,8,38.46153846,12,1,SFRS2,CCDS11749.1,c.284_307del24,p.P95_R102delPPDSHHSR,ONCOGENIC 209683649,PD5754a,17,74732959,-,GGC,I,0,22,22.22222222,51,2,SFRS2,CCDS11749.1,c.283_284insGCC,p.R94_P95insR,ONCOGENIC MANUAL CALL,PD6099a,17,74732959,G,C,Sub,,16,16.66666667,24,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC MANUAL CALL,PD6106a,17,74732959,G,C,Sub,,16,21.42857143,14,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC MANUAL CALL,PD6157a,17,74732959,G,T,Sub,,16,15,2,na,SFRS2,CCDS11749.1,c.284C>T,p.P95H,ONCOGENIC MANUAL CALL,PD6823a,17,74732959,G,T,Sub,,16,21.21212121,33,na,SFRS2,CCDS11749.1,c.284C>T,p.P95H,ONCOGENIC 58067252,PD6090a,17,74732959,G,T,Sub,0,16,17.95,39,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58067731,PD6253a,17,74732959,G,T,Sub,0,16,40,35,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58068794,PD5735a,17,74732959,G,T,Sub,0,16,33.33,18,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58068943,PD5751a,17,74732959,G,A,Sub,0,16,36.36,55,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58072013,PD6477a,17,74732959,G,T,Sub,0,16,33.33,57,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58072474,PD7118a,17,74732959,G,T,Sub,0,16,23.08,39,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 
58072956,PD6985a,17,74732959,G,C,Sub,0,16,36.36,11,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58073695,PD6320a,17,74732959,G,C,Sub,0,16,22.86,35,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58074208,PD5789a,17,74732959,G,C,Sub,0,16,45,40,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58076423,PD6789a,17,74732959,G,T,Sub,0,16,40.54,37,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58078035,PD6877a,17,74732959,G,A,Sub,0,16,43.48,23,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58078294,PD5747a,17,74732959,G,T,Sub,0,16,26.47,34,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58078939,PD6504a,17,74732959,G,C,Sub,0,16,33.33,48,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58079077,PD6523a,17,74732959,G,A,Sub,0,16,25,44,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58079950,PD6520a,17,74732959,G,T,Sub,0,16,61.29,31,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58080411,PD6545a,17,74732959,G,T,Sub,0,16,41.46,41,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58081165,PD5768a,17,74732959,G,T,Sub,0,16,20,35,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58081901,PD6072a,17,74732959,G,C,Sub,0,16,48.84,43,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58082804,PD6335a,17,74732959,G,T,Sub,0,16,44.83,58,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58083514,PD6957a,17,74732959,G,T,Sub,0,16,10.2,49,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58083926,PD7027a,17,74732959,G,T,Sub,0,16,37.5,32,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58085428,PD6095a,17,74732959,G,T,Sub,0,16,34.78,46,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58085701,PD6179a,17,74732959,G,T,Sub,0,16,52,25,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58087424,PD6282a,17,74732959,G,A,Sub,0,16,55,40,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58089961,PD6940a,17,74732959,G,C,Sub,0,16,44.44,36,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58090298,PD6162a,17,74732959,G,T,Sub,0,16,39.53,43,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 
58092684,PD7028a,17,74732959,G,T,Sub,0,16,36.11,36,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58093228,PD6318a,17,74732959,G,C,Sub,0,16,22.86,35,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58093431,PD5714a,17,74732959,G,A,Sub,0,16,44.44,45,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58098222,PD6256a,17,74732959,G,A,Sub,0,16,25.64,39,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58098466,PD6783a,17,74732959,G,A,Sub,0,16,40.74,27,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58100717,PD7031a,17,74732959,G,T,Sub,0,16,35.48,31,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58101434,PD6164a,17,74732959,G,T,Sub,0,16,37.14,35,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58102219,PD6270a,17,74732959,G,A,Sub,0,16,44.44,54,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58102879,PD6232a,17,74732959,G,T,Sub,0,16,45.83,48,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58103072,PD6834a,17,74732959,G,T,Sub,0,16,30.3,33,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58103894,PD6970a,17,74732959,G,A,Sub,0,16,41.67,48,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58106008,PD6182a,17,74732959,G,T,Sub,0,16,28.57,28,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58108049,PD6274a,17,74732959,G,T,Sub,0,16,56.82,44,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58113450,PD6516a,17,74732959,G,A,Sub,0,16,22.73,44,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58115196,PD6887a,17,74732959,G,C,Sub,0,16,30.56,36,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58115664,PD6278a,17,74732959,G,T,Sub,0,16,44.44,36,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58116466,PD6312a,17,74732959,G,A,Sub,0,16,34.55,55,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58116632,PD6929a,17,74732959,G,T,Sub,0,16,33.96,53,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58117187,PD5785a,17,74732959,G,T,Sub,0,16,53.19,47,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58120430,PD5740a,17,74732959,G,A,Sub,0,16,47.5,40,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 
58128059,PD6284a,17,74732959,G,A,Sub,0,16,60.87,23,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58128601,PD5724a,17,74732959,G,T,Sub,0,16,32.35,34,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58131646,PD6842a,17,74732959,G,C,Sub,0,16,42.19,64,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58133778,PD6058a,17,74732959,G,C,Sub,0,16,39.02,41,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58135903,PD5750a,17,74732959,G,A,Sub,0,16,24,50,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58140854,PD6301a,17,74732959,G,A,Sub,0,16,27.08,48,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58141213,PD6189a,17,74732959,G,T,Sub,0,16,28.21,39,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58144994,PD5717a,17,74732959,G,A,Sub,0,16,44.44,27,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58145469,PD6855a,17,74732959,G,A,Sub,0,16,33.33,33,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58147203,PD6807a,17,74732959,G,T,Sub,0,16,28,25,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58148432,PD6143a,17,74732959,G,T,Sub,0,16,12.77,47,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58149655,PD6328a,17,74732959,G,C,Sub,0,16,10.26,39,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58152482,PD5748a,17,74732959,G,C,Sub,0,16,33.33,36,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58153929,PD6800a,17,74732959,G,T,Sub,0,16,38.1,42,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58154115,PD7036a,17,74732959,G,T,Sub,0,16,12.82,39,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58155160,PD6821a,17,74732959,G,T,Sub,0,16,52.17,46,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58159932,PD6998a,17,74732959,G,T,Sub,0,16,46.15,26,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58163533,PD7033a,17,74732959,G,T,Sub,0,16,25,40,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58164066,PD6943a,17,74732959,G,A,Sub,0,16,23.4,47,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58164402,PD6883a,17,74732959,G,A,Sub,0,16,18.18,33,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 
58164771,PD6786a,17,74732959,G,T,Sub,0,16,42.86,42,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58165595,PD7116a,17,74732959,G,A,Sub,0,16,52,25,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58167069,PD6797a,17,74732959,G,T,Sub,0,16,47.22,36,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58171095,PD6499a,17,74732959,G,C,Sub,0,16,50,46,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58173151,PD6286a,17,74732959,G,T,Sub,0,16,35.9,39,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58180749,PD5759a,17,74732959,G,T,Sub,0,16,22.58,31,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58180918,PD6488a,17,74732959,G,A,Sub,0,16,51.16,43,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58183315,PD6124a,17,74732959,G,C,Sub,0,16,47.92,48,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58183478,PD6819a,17,74732959,G,T,Sub,0,16,42.22,45,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58184562,PD6876a,17,74732959,G,A,Sub,0,16,25.71,35,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58184732,PD6254a,17,74732959,G,T,Sub,0,16,18.18,22,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 58188468,PD6945a,17,74732959,G,A,Sub,0,16,50,32,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58190944,PD6919a,17,74732959,G,A,Sub,0,16,40,35,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58196035,PD6527a,17,74732959,G,C,Sub,0,16,46.51,43,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58198539,PD6181a,17,74732959,G,A,Sub,0,16,50,28,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58199916,PD5733a,17,74732959,G,C,Sub,0,16,48.98,49,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58200794,PD6275a,17,74732959,G,C,Sub,0,16,14.63,41,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 58201596,PD6857a,17,74732959,G,A,Sub,0,16,44.44,45,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 66791595,PD7364a,17,74732959,G,C,Sub,0,16,57.14,14,na,SFRS2,CCDS11749.1,c.284C>G,p.P95R,ONCOGENIC 66866842,PD7387a,17,74732959,G,T,Sub,0,16,23.53,17,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 
66951374,PD7391a,17,74732959,G,T,Sub,0,16,33.33,15,na,SFRS2,CCDS11749.1,c.284C>A,p.P95H,ONCOGENIC 66964148,PD7367a,17,74732959,G,A,Sub,0,16,75,8,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 68195953,PD8730a,17,74732959,G,A,Sub,0,16,70,10,na,SFRS2,CCDS11749.1,c.284C>T,p.P95L,ONCOGENIC 58078605,PD6255a,17,74732960,G,T,Sub,0,16,45.83,24,na,SFRS2,CCDS11749.1,c.283C>A,p.P95T,ONCOGENIC 58105021,PD6283a,17,74732960,G,T,Sub,0,16,42.5,40,na,SFRS2,CCDS11749.1,c.283C>A,p.P95T,ONCOGENIC 58115373,PD5711a,17,74732960,G,T,Sub,0,16,23.08,26,na,SFRS2,CCDS11749.1,c.283C>A,p.P95T,ONCOGENIC 58142153,PD6201a,17,74732960,G,C,Sub,0,16,19.44,36,na,SFRS2,CCDS11749.1,c.283C>G,p.P95A,ONCOGENIC 58166109,PD6830a,17,74732960,G,T,Sub,0,16,20.69,29,na,SFRS2,CCDS11749.1,c.283C>A,p.P95T,ONCOGENIC 58182433,PD6809a,17,74732960,G,T,Sub,0,16,54.29,35,na,SFRS2,CCDS11749.1,c.283C>A,p.P95T,ONCOGENIC 58136161,PD6236a,17,74733013,C,T,Sub,0,24,14.08,71,na,SFRS2,CCDS11749.1,c.230G>A,p.G77E,POSSIBLE ONCOGENIC 209605321,PD6945a,18,52896133,G,C,DI,0,151,31.1827957,74,1,TCF4,-,-,-,UNKNOWN 58086245,PD7017a,18,52899869,G,A,Sub,0.53,189,14.21,190,na,TCF4,CCDS42438.1,c.1520C>T,p.S507F,UNKNOWN 58105184,PD6944a,18,52921931,T,A,Sub,0,50,71.43,7,na,TCF4,CCDS42438.1,c.1147C>T,p.Q383*,UNKNOWN 66763307,PD8936a,18,52924587,C,T,Sub,0,118,10.05,189,na,TCF4,CCDS42438.1,c.1105G>A,p.A369T,UNKNOWN 58105185,PD6944a,18,52927208,G,C,Sub,0,77,85.71,14,na,TCF4,CCDS42438.1,c.1041T>G,p.T347T,UNKNOWN 58085257,PD6798a,18,52942947,C,T,Sub,0,57,47.72,438,na,TCF4,CCDS42438.1,c.692G>A,p.S231N,UNKNOWN 58179029,PD6784a,18,52946787,A,G,Sub,0,76,25,76,na,TCF4,CCDS42438.1,c.650T>C,p.M217T,UNKNOWN 66763308,PD8936a,18,53018183,G,A,Sub,0,134,16.74,221,na,TCF4,CCDS42438.1,c.421C>T,p.P141S,UNKNOWN 58199133,PD6150a,18,53254339,G,T,Sub,0,67,48.86,307,na,TCF4,CCDS42438.1,c.9C>A,p.H3Q,UNKNOWN 58172041,PD6967a,19,10244956,T,T,Sub,0,20,20,20,na,DNMT1,CCDS45958.1,c.4801G>A,p.A1601T,UNKNOWN 
58096889,PD6939a,19,10246926,G,T,Sub,0,58,56.52,23,na,DNMT1,CCDS45958.1,c.4527C>A,p.N1509K,UNKNOWN 58200537,PD6235a,19,10246961,C,T,Sub,0,51,60,20,na,DNMT1,CCDS45958.1,c.4492G>A,p.G1498S,UNKNOWN 58192703,PD6922a,19,10260239,C,G,Sub,0,27,47.16,229,na,DNMT1,CCDS45958.1,c.2476G>C,p.D826H,UNKNOWN 58144080,PD6847a,19,10265445,G,A,Sub,0,84,48.28,29,na,DNMT1,CCDS45958.1,c.1649C>T,p.T550M,UNKNOWN 58125760,PD6992a,19,10274030,T,G,Sub,0,40,55.93,118,na,DNMT1,CCDS45958.1,c.898A>C,p.K300Q,UNKNOWN 58071011,PD6083a,19,10305529,G,A,Sub,0,11,40,10,na,DNMT1,CCDS45958.1,c.47C>T,p.P16L,UNKNOWN 58140261,PD7112a,19,12911039,A,G,Sub,0,46,45.16,31,na,PRDX2,CCDS12281.1,c.332T>C,p.L111S,UNKNOWN 208473666,PD5742a,19,12911105,-,G,I,0,14,21.42857143,28,6,PRDX2,CCDS12281.1,c.265_266insC,p.R91fs*16,UNKNOWN 58071553,PD6110a,19,12911661,G,A,Sub,0,13,73.37,169,na,PRDX2,CCDS45993.1,c.326C>T,p.S109L,UNKNOWN 58076780,PD6914a,19,12911734,C,T,Sub,0,24,12,75,na,PRDX2,CCDS12281.1,c.253G>A,p.A85T,UNKNOWN 58076767,PD6914a,19,17945687,G,A,Sub,0,38,22.02,109,na,JAK3,CCDS12366.1,c.2173C>T,p.P725S,UNKNOWN 58194889,PD6905a,19,17945973,C,T,Sub,0,28,10.2,49,na,JAK3,CCDS12366.1,c.1966G>A,p.A656T,UNKNOWN 342811953,PD8739a,19,17948787,tctg,-,D,0,34,55.31914894,45,1,JAK3,CCDS12366.1,c.1652_1655delCAGA,p.T551fs*4,UNKNOWN 58134612,PD7020a,19,17954686,G,C,Sub,0,52,63.64,11,na,JAK3,CCDS12366.1,c.208C>G,p.L70V,UNKNOWN 58079559,PD6098a,19,17955153,G,A,Sub,0,14,75,4,na,JAK3,CCDS12366.1,c.74C>T,p.A25V,UNKNOWN 209590010,PD6232a,19,33792423,-,CG,I,0,36,14.72868217,129,0,CEBPA,ENST00000498907,c.897_898insCG,p.D301fs*18,ONCOGENIC 58200895,PD6288a,19,33792468,A,T,Sub,0,10,39.77,88,na,CEBPA,ENST00000498907,c.853T>A,p.Y285N,UNKNOWN 208148253,PD6849a,19,33793089,g,-,D,0,8,41.93548387,31,2,CEBPA,ENST00000498907,c.232delC,p.L78fs*82,ONCOGENIC 58138059,PD6803a,19,54797967,C,T,Sub,0,21,60,5,na,LILRA3,ENST00000251375,c.1405G>A,p.G469S,UNKNOWN 
58196916,PD5728a,19,54802109,T,G,Sub,0,147,47.76,201,na,LILRA3,CCDS12887.1,c.1079A>C,p.E360A,UNKNOWN 58171580,PD5765a,19,54802229,C,T,Sub,0,77,97.37,114,na,LILRA3,CCDS12887.1,c.959G>A,p.G320E,UNKNOWN 58071983,PD6477a,19,54802518,G,A,Sub,0,25,46.49,114,na,LILRA3,CCDS12887.1,c.923C>T,p.S308L,UNKNOWN 58089785,PD6934a,19,54803490,C,T,Sub,0,83,19.05,21,na,LILRA3,CCDS12887.1,c.334G>A,p.D112N,UNKNOWN 58175507,PD6333a,19,55086950,T,C,Sub,0,34,40,25,na,LILRA2,CCDS46179.1,c.883T>C,p.C295R,UNKNOWN 58137133,PD6092a,19,56172529,G,A,Sub,0,20,46.5,243,na,U2AF2,CCDS12933.1,c.460G>A,p.G154S,UNKNOWN 58202514,PD7078a,19,56173940,T,G,Sub,0,58,33.93,56,na,U2AF2,CCDS12933.1,c.559T>G,p.L187V,UNKNOWN 209513533,PD6795a,19,56173950,agatt<5>ggaca,-,D,0,70,34.17721519,63,1,U2AF2,CCDS12933.1,c.569_583delAGATTAACCAGGACA,p.I191_K195delINQDK,POSSIBLE ONCOGENIC 209554565,PD6836a,19,56180546,-,C,I,0,6,15.78947368,19,6,U2AF2,CCDS12933.1,c.1043_1044insC,p.S349fs*28,POSSIBLE ONCOGENIC 58110446,PD6306a,19,56180944,G,C,Sub,0,32,42.11,19,na,U2AF2,CCDS12933.1,c.1179G>C,p.E393D,UNKNOWN 58172766,PD6831a,20,31021124,G,A,Sub,0,143,39.05,105,na,ASXL1,CCDS13201.1,c.1123G>A,p.V375M,UNKNOWN 58148446,PD6143a,20,31021206,G,A,Sub,1.92,52,52.22,203,na,ASXL1,CCDS13201.1,c.1205G>A,p.R402Q,UNKNOWN 58092202,PD6890a,20,31021211,C,T,Sub,0,52,19.4,134,na,ASXL1,CCDS13201.1,c.1210C>T,p.R404*,ONCOGENIC 58080013,PD6843a,20,31021226,A,T,Sub,0,57,46.12,206,na,ASXL1,CCDS13201.1,c.1225A>T,p.K409*,ONCOGENIC 58148913,PD5760a,20,31021250,C,T,Sub,0,56,13.71,248,na,ASXL1,CCDS13201.1,c.1249C>T,p.R417*,ONCOGENIC 58164050,PD6943a,20,31021250,C,T,Sub,0,56,40,195,na,ASXL1,CCDS13201.1,c.1249C>T,p.R417*,ONCOGENIC 58180659,PD6474a,20,31021439,G,T,Sub,0,36,46.59,176,na,ASXL1,CCDS13201.1,c.1438G>T,p.E480*,ONCOGENIC 58116128,PD7114a,20,31021489,C,G,Sub,0,45,53.22,171,na,ASXL1,CCDS13201.1,c.1488C>G,p.N496K,ONCOGENIC 58201885,PD6921a,20,31021489,C,G,Sub,0,45,50.57,176,na,ASXL1,CCDS13201.1,c.1488C>G,p.N496K,ONCOGENIC 
66902107,PD8937a,20,31021538,G,T,Sub,0,29,49.33,75,na,ASXL1,CCDS13201.1,c.1537G>T,p.E513*,ONCOGENIC 208519377,PD6172a,20,31021628,-,G,I,0,34,50.27027027,185,1,ASXL1,CCDS13201.1,c.1627_1628insG,p.E543fs*8,ONCOGENIC 208262195,PD6153a,20,31021634,-,C,I,0,36,35.33333333,149,1,ASXL1,CCDS13201.1,c.1633_1634insC,p.R545fs*6,ONCOGENIC 66804184,PD7389a,20,31022277,C,T,Sub,0,181,45.24,42,na,ASXL1,CCDS13201.1,c.1762C>T,p.Q588*,ONCOGENIC 207953878,PD6985a,20,31022287,-,A,I,0,193,34.09090909,88,1,ASXL1,CCDS13201.1,c.1772_1773insA,p.Y591fs*0,ONCOGENIC 58128212,PD6325a,20,31022288,C,A,Sub,0,179,19.64,56,na,ASXL1,CCDS13201.1,c.1773C>A,p.Y591*,ONCOGENIC 208472724,PD6179a,20,31022376,G,T,DI,0,45,36.36363636,47,0,ASXL1,-,-,-,ONCOGENIC 207992563,PD6268a,20,31022403,cacca<13>gcggc,-,D,0,21,26.2295082,49,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208019836,PD6112a,20,31022403,cacca<13>gcggc,-,D,0,21,25.83333333,96,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208050620,PD6188a,20,31022403,cacca<13>gcggc,-,D,0,21,27.71084337,67,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208072119,PD5740a,20,31022403,cacca<13>gcggc,-,D,0,21,25,67,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208146335,PD6244a,20,31022403,cacca<13>gcggc,-,D,0,21,35.21126761,55,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208170271,PD6241a,20,31022403,cacca<13>gcggc,-,D,0,21,6.172839506,80,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208202510,PD5733a,20,31022403,cacca<13>gcggc,-,D,0,21,20.2247191,82,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208209467,PD6922a,20,31022403,cacca<13>gcggc,-,D,0,21,33.03571429,84,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208263300,PD6492a,20,31022403,cacca<13>gcggc,-,D,0,21,25.74257426,85,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 208358888,PD6279a,20,31022403,cacca<13>gcggc,-,D,0,21,38.15789474,55,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 
208551053,PD6785a,20,31022403,cacca<13>gcggc,-,D,0,21,27.38095238,67,0,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 209553651,PD6506a,20,31022403,cacca<13>gcggc,-,D,0,19,27.38095238,70,1,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 209594407,PD6793a,20,31022403,cacca<13>gcggc,-,D,0,19,22,43,1,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 209599963,PD5756a,20,31022403,cacca<13>gcggc,-,D,0,19,29.16666667,61,1,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 344328547,PD7370a,20,31022403,cacca<13>gcggc,-,D,0,19,25.80645161,29,1,ASXL1,CCDS13201.1,c.1888_1910del23,p.E635fs*15,ONCOGENIC 344328548,PD7370a,20,31022415,A,CGGAG,DI,0,12,17.24137931,29,0,ASXL1,,,,ONCOGENIC 208222005,PD6280a,20,31022429,C,T,DI,0,6,50,39,0,ASXL1,-,-,-,ONCOGENIC 208450094,PD5735a,20,31022441,-,G,I,0,4,47.5,35,8,ASXL1,CCDS13201.1,c.1926_1927insG,p.G646fs*12,ONCOGENIC 208528982,PD6191a,20,31022441,-,G,I,0,4,31.91489362,46,8,ASXL1,CCDS13201.1,c.1926_1927insG,p.G646fs*12,ONCOGENIC 208126547,PD6126a,20,31022441,-,A,I,,4,41.26984127,63,1,ASXL1,CCDS13201.1,c.1926_1927insA,p.G643fs*15,ONCOGENIC 208095184,PD6484a,20,31022441,-,A,I,,4,45.76271186,59,1,ASXL1,CCDS13201.1,c.1926_1927insA,p.G643fs*15,ONCOGENIC 208496893,PD5728a,20,31022442,g,-,D,0,4,35.29411765,32,8,ASXL1,CCDS13201.1,c.1927delG,p.G645fs*58,ONCOGENIC 208039470,PD6189a,20,31022449,-,G,I,0,5,44.61538462,60,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208090570,PD6517a,20,31022449,-,G,I,0,5,37.5,55,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208109923,PD6782a,20,31022449,-,G,I,0,5,40,29,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208111521,PD6812a,20,31022449,-,G,I,0,5,31.81818182,43,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208203676,PD6081a,20,31022449,-,G,I,0,5,37.77777778,42,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208224485,PD6514a,20,31022449,-,G,I,0,5,33.33333333,59,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 
208235228,PD6181a,20,31022449,-,G,I,0,5,40,35,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208281830,PD5789a,20,31022449,-,G,I,0,5,34.88372093,42,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208309015,PD6998a,20,31022449,-,G,I,0,5,42.10526316,35,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208413661,PD6183a,20,31022449,-,G,I,0,5,33.33333333,29,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208420588,PD6496a,20,31022449,-,G,I,0,5,40.6779661,58,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208461445,PD6090a,20,31022449,-,G,I,0,5,30.43478261,46,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208463802,PD6824a,20,31022449,-,G,I,0,5,38.46153846,25,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208476258,PD6171a,20,31022449,-,G,I,0,5,39.39393939,33,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 208527587,PD6545a,20,31022449,-,G,I,0,5,45.3125,62,8,ASXL1,CCDS13201.1,c.1934_1935insG,p.G646fs*12,ONCOGENIC 207912681,PD6091a,20,31022450,-,T,I,0,5,37.93103448,28,1,ASXL1,CCDS13201.1,c.1935_1936insT,p.G646fs*12,ONCOGENIC 58142320,PD5754a,20,31022562,A,G,Sub,0,46,40.52,153,na,ASXL1,CCDS13201.1,c.2047A>G,p.T683A,POSSIBLE ONCOGENIC 58071976,PD6477a,20,31022592,C,T,Sub,0,54,43.58,179,na,ASXL1,CCDS13201.1,c.2077C>T,p.R693*,ONCOGENIC 58098120,PD6256a,20,31022592,C,T,Sub,0,54,44.29,140,na,ASXL1,CCDS13201.1,c.2077C>T,p.R693*,ONCOGENIC 58171409,PD6277a,20,31022592,C,T,Sub,0,54,47.58,124,na,ASXL1,CCDS13201.1,c.2077C>T,p.R693*,ONCOGENIC 66782316,PD7385a,20,31022592,C,T,Sub,0,54,39.51,81,na,ASXL1,CCDS13201.1,c.2077C>T,p.R693*,ONCOGENIC 68195949,PD8730a,20,31022592,C,T,Sub,0,54,11.11,72,na,ASXL1,CCDS13201.1,c.2077C>T,p.R693*,ONCOGENIC 208398581,PD6270a,20,31022600,actactgc,-,D,0,80,42.72727273,138,1,ASXL1,CCDS13201.1,c.2085_2092delACTACTGC,p.Q695fs*20,ONCOGENIC 344738979,PD7391a,20,31022643,G,-,D,0,60,30.34,89,2,ASXL1,CCDS13201.1,c.2128delG,p.G710fs*15,ONCOGENIC 
346882448,PD8731a,20,31022668,g,-,D,0,58,29.34782609,92,1,ASXL1,CCDS13201.1,c.2153delG,p.R718fs*7,ONCOGENIC 208471548,PD6486a,20,31022700,agct,-,D,0,53,37.83783784,190,1,ASXL1,CCDS13201.1,c.2185_2188delAGCT,p.S729fs*14,ONCOGENIC 58101422,PD6164a,20,31022712,C,T,Sub,0,46,46.67,165,na,ASXL1,CCDS13201.1,c.2197C>T,p.Q733*,ONCOGENIC 208077937,PD6539a,20,31022716,g,-,D,0,55,32.0610687,261,3,ASXL1,CCDS13201.1,c.2201delG,p.A735fs*9,ONCOGENIC 208218361,PD6972a,20,31022724,g,-,D,0,51,26.84824903,256,1,ASXL1,CCDS13201.1,c.2209delG,p.V737fs*7,ONCOGENIC 208142799,PD6313a,20,31022760,c,-,D,0,42,37.76223776,142,1,ASXL1,CCDS13201.1,c.2245delC,p.L749fs*23,ONCOGENIC 58192349,PD6534a,20,31022793,C,T,Sub,0,48,23.26,215,na,ASXL1,CCDS13201.1,c.2278C>T,p.Q760*,ONCOGENIC 208106081,PD6878a,20,31022831,-,T,I,0,69,25.89285714,224,1,ASXL1,CCDS13201.1,c.2316_2317insT,p.E773fs*0,ONCOGENIC 58100691,PD7031a,20,31022835,A,T,Sub,0,66,22.86,140,na,ASXL1,CCDS13201.1,c.2320A>T,p.R774*,ONCOGENIC 208372514,PD6165a,20,31022838,-,A,I,0,73,39.23444976,209,0,ASXL1,CCDS13201.1,c.2323_2324insA,p.L775fs*12,ONCOGENIC 58114785,PD6093a,20,31022847,C,T,Sub,0,68,40.65,278,na,ASXL1,CCDS13201.1,c.2332C>T,p.Q778*,ONCOGENIC 58164751,PD6786a,20,31022853,C,T,Sub,0,70,40,220,na,ASXL1,CCDS13201.1,c.2338C>T,p.Q780*,ONCOGENIC 208288423,PD6835a,20,31022899,c,-,D,0,60,9.130434783,230,2,ASXL1,CCDS13201.1,c.2384delC,p.W796fs*22,ONCOGENIC 58111307,PD6955a,20,31022902,G,A,Sub,0,57,23.11,238,na,ASXL1,CCDS13201.1,c.2387G>A,p.W796*,ONCOGENIC 58152476,PD5748a,20,31022922,C,T,Sub,0,46,40.27,293,na,ASXL1,CCDS13201.1,c.2407C>T,p.Q803*,ONCOGENIC 208402186,PD6217a,20,31022937,c,-,D,0,51,10.77441077,297,2,ASXL1,CCDS13201.1,c.2422delC,p.P808fs*10,ONCOGENIC 209555464,PD6789a,20,31022937,c,-,D,0,50,21.34146341,162,2,ASXL1,CCDS13201.1,c.2422delC,p.P808fs*10,ONCOGENIC 208385394,PD6783a,20,31022982,t,-,D,0,64,23.93617021,187,2,ASXL1,CCDS13201.1,c.2467delT,p.L823fs*0,ONCOGENIC 
344720519,PD8936a,20,31023045,-,C,I,0,68,24.48979592,49,5,ASXL1,CCDS13201.1,c.2530_2531insC,p.S846fs*5,ONCOGENIC 58114474,PD6287a,20,31023087,C,T,Sub,0,54,44.24,278,na,ASXL1,CCDS13201.1,c.2572C>T,p.Q858*,ONCOGENIC 208342263,PD6807a,20,31023153,-,A,I,0,78,48.26254826,259,1,ASXL1,CCDS13201.1,c.2638_2639insA,p.T880fs*2,ONCOGENIC 207965004,PD5726a,20,31023222,-,C,I,0,94,46.70846395,319,1,ASXL1,CCDS13201.1,c.2707_2708insC,p.N904fs*2,ONCOGENIC 209687994,PD6169a,20,31023223,-,GAAT,I,0,100,9.132420091,209,1,ASXL1,CCDS13201.1,c.2708_2709insGAAT,p.D905fs*2,ONCOGENIC 67016468,PD7386a,20,31023270,A,G,Sub,0,84,13.2,500,na,ASXL1,CCDS13201.1,c.2755A>G,p.I919V,UNKNOWN 209616500,PD6278a,20,31023301,A,GG,DI,0,101,31.35135135,181,1,ASXL1,-,-,-,ONCOGENIC 347174154,PD7375a,20,31023409,gag,-,D,0,141,6.428571429,140,2,ASXL1,CCDS13201.1,c.2894_2896delGAG,p.G967delG,ONCOGENIC 208307702,PD6156a,20,31023434,t,-,D,0,158,8.333333333,60,2,ASXL1,CCDS13201.1,c.2919delT,p.Y974fs*10,ONCOGENIC 208145818,PD7006a,20,31023520,c,-,D,0,105,29.23076923,65,2,ASXL1,CCDS13201.1,c.3005delC,p.S1003fs*21,ONCOGENIC 208266132,PD7010a,20,31023520,c,-,D,,105,3.717472119,269,2,ASXL1,CCDS13201.1,c.3005delC,p.S1003fs*21,ONCOGENIC 67016472,PD7386a,20,31023636,G,A,Sub,0,42,17,500,na,ASXL1,CCDS13201.1,c.3121G>A,p.A1041T,POSSIBLE ONCOGENIC 67016473,PD7386a,20,31023654,A,T,Sub,0,32,14,500,na,ASXL1,CCDS13201.1,c.3139A>T,p.N1047Y,POSSIBLE ONCOGENIC 208479447,PD6122a,20,31023702,-,A,I,0,40,6.984126984,315,1,ASXL1,CCDS13201.1,c.3187_3188insA,p.S1064fs*23,ONCOGENIC 58157651,PD6271a,20,31023892,A,G,Sub,0,111,53.47,245,na,ASXL1,CCDS13201.1,c.3377A>G,p.H1126R,POSSIBLE ONCOGENIC 208271648,PD6329a,20,31024027,g,-,D,0,81,19.76047904,334,3,ASXL1,CCDS13201.1,c.3512delG,p.A1172fs*2,ONCOGENIC 58183617,PD6854a,20,31024033,T,C,Sub,0,76,48.53,204,na,ASXL1,CCDS13201.1,c.3518T>C,p.L1173S,POSSIBLE ONCOGENIC 58135357,PD6247a,20,31024084,G,A,Sub,0,77,48.84,215,na,ASXL1,CCDS13201.1,c.3569G>A,p.R1190K,POSSIBLE ONCOGENIC 
344723097,PD8939a,20,31024131,gcagtcccaa,-,D,0,102,19.60784314,50,1,ASXL1,CCDS13201.1,c.3616_3625delGCAGTCCCAA,p.A1206fs*8,ONCOGENIC 207995805,PD6488a,20,31024150,c,-,D,0,109,35.02304147,217,3,ASXL1,CCDS13201.1,c.3635delC,p.L1213fs*4,ONCOGENIC 58147329,PD6057a,20,31024503,C,T,Sub,0,63,15.15,33,na,ASXL1,CCDS13201.1,c.3988C>T,p.P1330S,ONCOGENIC 58179052,PD6784a,20,31024584,G,A,Sub,0,66,16.71,395,na,ASXL1,CCDS13201.1,c.4069G>A,p.A1357T,POSSIBLE ONCOGENIC 208556990,PD6977a,20,31024636,-,G,I,0,62,34.69387755,294,6,ASXL1,CCDS13201.1,c.4121_4122insG,p.P1377fs*3,ONCOGENIC 208210957,PD6914a,20,31024642,-,G,I,0,59,28.74493927,246,6,ASXL1,CCDS13201.1,c.4127_4128insG,p.P1377fs*3,ONCOGENIC 58194312,PD6881a,20,37378687,C,T,Sub,0,61,11.73,307,na,ACTR5,CCDS13308.1,c.410C>T,p.A137V,UNKNOWN 58071603,PD6110a,20,37394938,G,A,Sub,0,246,61.6,500,na,ACTR5,CCDS13308.1,c.1351G>A,p.A451T,UNKNOWN 58079206,PD6822a,20,57415501,G,A,Sub,0,53,51.65,91,na,GNAS,CCDS13471.1,c.340G>A,p.E114K,UNKNOWN 208294722,PD6062a,20,57415506,cgagtccgaaat,-,D,0,63,38.02816901,51,1,GNAS,CCDS13471.1,c.345_356delCGAGTCCGAAAT,p.I119_E122delIESE,POSSIBLE ONCOGENIC 208035228,PD6297a,20,57415543,gagcc<14>ccact,-,D,0,49,26.19047619,68,0,GNAS,CCDS13471.1,c.382_405del24,p.E128_T135delEPETAPTT,POSSIBLE ONCOGENIC 58139533,PD6898a,20,57415573,G,A,Sub,0,29,47.69,65,na,GNAS,CCDS13471.1,c.412G>A,p.E138K,UNKNOWN 58092576,PD7016a,20,57415747,C,T,Sub,0,15,29.03,31,na,GNAS,CCDS13471.1,c.586C>T,p.P196S,UNKNOWN 58104715,PD6916a,20,57415865,C,T,Sub,0,39,50,16,na,GNAS,CCDS13471.1,c.704C>T,p.S235F,UNKNOWN 58106543,PD6088a,20,57428604,G,A,Sub,0,45,12.31,65,na,GNAS,CCDS46622.1,c.284G>A,p.S95N,UNKNOWN 58137098,PD6092a,20,57428930,G,A,Sub,0,43,88.89,27,na,GNAS,CCDS46622.1,c.610G>A,p.A204T,UNKNOWN 58095434,PD6100a,20,57429072,G,A,Sub,0,23,40,10,na,GNAS,CCDS46622.1,c.752G>A,p.G251D,UNKNOWN 58173466,PD6975a,20,57429251,A,C,Sub,0,11,52.46,61,na,GNAS,CCDS46622.1,c.931A>C,p.I311L,UNKNOWN 
58164049,PD6943a,20,57430121,C,G,Sub,0,30,57.78,45,na,GNAS,CCDS46622.1,c.1801C>G,p.R601G,UNKNOWN 58114472,PD6287a,20,57430171,G,A,Sub,0,29,45.16,93,na,GNAS,ENST00000306120,c.1661G>A,p.G554E,UNKNOWN 58079066,PD6523a,20,57430193,A,G,Sub,0,30,31.08,74,na,GNAS,CCDS46622.1,c.1873A>G,p.S625G,UNKNOWN 58187334,PD6307a,20,57430206,A,C,Sub,0,25,43.06,72,na,GNAS,CCDS46622.1,c.1886A>C,p.K629T,UNKNOWN 58092578,PD7016a,20,57430266,G,A,Sub,0,25,16.67,30,na,GNAS,CCDS46622.1,c.1946G>A,p.R649H,UNKNOWN 58162448,PD6317a,20,57474021,G,A,Sub,0,59,42.56,195,na,GNAS,CCDS46622.1,c.2167G>A,p.A723T,UNKNOWN 58137970,PD6803a,20,57478740,C,T,Sub,0,145,10.53,133,na,GNAS,CCDS46622.1,c.2255C>T,p.A752V,UNKNOWN 58099560,PD7008a,20,57484421,G,A,Sub,0,103,30.2,500,na,GNAS,CCDS46622.1,c.2531G>A,p.R844H,ONCOGENIC 58145446,PD6855a,20,57484421,G,A,Sub,0,103,39.95,383,na,GNAS,CCDS46622.1,c.2531G>A,p.R844H,ONCOGENIC 58157649,PD6271a,20,57485805,C,T,Sub,0,70,47.03,236,na,GNAS,CCDS46622.1,c.3035C>T,p.T1012I,UNKNOWN 208528337,PD6191a,21,36164657,-,TACA,I,,1,34.83146067,89,1,RUNX1,CCDS13639.1,c.1217_1218insTGTA,p.Y407fs*194,ONCOGENIC 208307858,PD6156a,21,36164691,g,-,D,,1,54.38596491,57,3,RUNX1,CCDS13639.1,c.1184delC,p.P395fs*199,ONCOGENIC 344039441,PD7389a,21,36164763,-,A,I,0,4,47.72727273,44,1,RUNX1,CCDS13639.1,c.1111_1112insT,p.M371fs*229,ONCOGENIC 208092244,PD6857a,21,36164763,-,A,I,,4,32.40740741,108,1,RUNX1,CCDS13639.1,c.1111_1112insT,p.M371fs*229,ONCOGENIC 208184406,PD6513a,21,36164772,atgccgat,-,D,0,6,16.90140845,67,1,RUNX1,CCDS13639.1,c.1096_1103delATCGGCAT,p.I366fs*231,ONCOGENIC 208535627,PD5746a,21,36164798,-,G,I,0,11,12.5,88,2,RUNX1,CCDS13639.1,c.1076_1077insC,p.V360fs*240,ONCOGENIC 208235515,PD6181a,21,36164843,-,G,I,0,12,51.28205128,78,5,RUNX1,CCDS13639.1,c.1031_1032insC,p.R346fs*254,ONCOGENIC 346913100,PD8730a,21,36164843,-,G,I,0,13,20.83333333,24,5,RUNX1,CCDS13639.1,c.1031_1032insC,p.R346fs*254,ONCOGENIC 
347424788,PD7387a,21,36164867,-,G,I,0,10,34.61538462,26,4,RUNX1,CCDS13639.1,c.1007_1008insC,p.A338fs*262,ONCOGENIC 346882716,PD8731a,21,36164869,-,A,I,0,9,45.45454545,33,2,RUNX1,CCDS13639.1,c.1005_1006insT,p.A338fs*262,ONCOGENIC 208324933,PD6266a,21,36164885,-,GCTG,I,0,8,35.0877193,49,1,RUNX1,CCDS13639.1,c.989_990insCAGC,p.D332fs*269,ONCOGENIC 58113943,PD6990a,21,36171607,G,A,Sub,0,213,17.31,104,na,RUNX1,CCDS13639.1,c.958C>T,p.R320*,ONCOGENIC 208184422,PD6513a,21,36171653,-,A,I,0,207,8.695652174,138,1,RUNX1,CCDS13639.1,c.911_912insT,p.G305fs*295,ONCOGENIC 208222302,PD6280a,21,36171664,gc,-,D,0,194,36.23188406,138,1,RUNX1,CCDS13639.1,c.900_901delGC,p.P301fs*298,ONCOGENIC 209686981,PD6256a,21,36171668,-,T,I,0,189,43.80165289,121,2,RUNX1,CCDS13639.1,c.896_897insA,p.T300fs*300,ONCOGENIC 344018290,PD9663a,21,36171686,a,-,D,0,150,22.4137931,116,1,RUNX1,CCDS13639.1,c.879delT,p.P294fs*17,ONCOGENIC 58100556,PD6829a,21,36171700,C,A,Sub,0,148,32.56,172,na,RUNX1,CCDS13639.1,c.865G>T,p.G289*,ONCOGENIC 58201703,PD6894a,21,36193974,C,T,Sub,0,46,32.73,110,na,RUNX1,CCDS46646.1,c.744G>A,p.W248*,ONCOGENIC 58101414,PD6164a,21,36206728,G,A,Sub,0,49,50,54,na,RUNX1,CCDS13639.1,c.784C>T,p.Q262*,ONCOGENIC 58095420,PD6100a,21,36206761,C,T,Sub,0,35,12.28,171,na,RUNX1,CCDS13639.1,c.751G>A,p.A251T,ONCOGENIC 208490061,PD7037a,21,36206776,-,G,I,0,30,42.25352113,142,5,RUNX1,CCDS13639.1,c.735_736insC,p.T246fs*15,ONCOGENIC 58182592,PD6171a,21,36206845,C,A,Sub,0,17,32.56,43,na,RUNX1,CCDS13639.1,c.667G>T,p.E223*,ONCOGENIC 208450238,PD5735a,21,36206874,-,G,I,0,10,23.33333333,120,1,RUNX1,CCDS13639.1,c.637_638insC,p.Q213fs*15,ONCOGENIC 58067713,PD6253a,21,36231773,C,T,Sub,0,175,50.25,197,na,RUNX1,CCDS13639.1,c.611G>A,p.R204Q,ONCOGENIC 58085727,PD6179a,21,36231773,C,A,Sub,0,175,38.04,184,na,RUNX1,CCDS13639.1,c.611G>T,p.R204L,ONCOGENIC 58202917,PD6946a,21,36231773,C,T,Sub,0,175,45.22,314,na,RUNX1,CCDS13639.1,c.611G>A,p.R204Q,ONCOGENIC 
58204210,PD6484a,21,36231773,C,T,Sub,0,175,41.1,309,na,RUNX1,CCDS13639.1,c.611G>A,p.R204Q,ONCOGENIC 58155057,PD6821a,21,36231774,G,A,Sub,0,184,37.26,365,na,RUNX1,CCDS13639.1,c.610C>T,p.R204*,ONCOGENIC 58107932,PD6274a,21,36231782,C,T,Sub,0,203,21.9,274,na,RUNX1,CCDS13639.1,c.602G>A,p.R201Q,ONCOGENIC 58199804,PD6953a,21,36231797,G,A,Sub,0.41,243,33.8,500,na,RUNX1,ENST00000457086,c.418-2C>T,p.?,ONCOGENIC 58079428,PD6098a,21,36231809,G,A,Sub,0,262,17.48,206,na,RUNX1,CCDS13639.1,c.575C>T,p.A192V,ONCOGENIC 208336286,PD6991a,21,36231844,-,C,I,0,264,14.08450704,284,0,RUNX1,CCDS13639.1,c.539_540insG,p.F180fs*33,POSSIBLE ONCOGENIC 66951326,PD7391a,21,36252855,T,G,Sub,0,51,15.54,148,na,RUNX1,CCDS13639.1,c.507A>C,p.R169S,ONCOGENIC 58157218,PD6279a,21,36252865,C,T,Sub,0,56,43.88,237,na,RUNX1,CCDS13639.1,c.497G>A,p.R166Q,ONCOGENIC 58172689,PD6155a,21,36252866,G,A,Sub,0,57,11.15,287,na,RUNX1,CCDS13639.1,c.496C>T,p.R166*,ONCOGENIC 58189672,PD6147a,21,36252866,G,A,Sub,0,57,48.17,218,na,RUNX1,CCDS13639.1,c.496C>T,p.R166*,ONCOGENIC 58196737,PD6258a,21,36252866,G,A,Sub,0,57,45.55,281,na,RUNX1,CCDS13639.1,c.496C>T,p.R166*,ONCOGENIC 58099386,PD6928a,21,36252876,C,G,Sub,0,59,50.8,500,na,RUNX1,CCDS13639.1,c.486G>C,p.R162S,ONCOGENIC 58171550,PD5765a,21,36252877,C,T,Sub,0,58,10.08,357,na,RUNX1,CCDS13639.1,c.485G>A,p.R162K,ONCOGENIC 58093398,PD5714a,21,36252878,T,C,Sub,0,59,45.76,389,na,RUNX1,CCDS13639.1,c.484A>G,p.R162G,ONCOGENIC 58113945,PD6990a,21,36252880,A,G,Sub,0,59,10.89,303,na,RUNX1,CCDS13639.1,c.482T>C,p.L161P,ONCOGENIC 58101912,PD6329a,21,36252884,C,A,Sub,0,63,50.58,431,na,RUNX1,CCDS13639.1,c.478G>T,p.D160Y,ONCOGENIC 208373394,PD6312a,21,36252888,A,-,D,0,66,32.25806452,372,3,RUNX1,CCDS13639.1,c.474delT,p.F158fs*18,ONCOGENIC 58148591,PD6963a,21,36252940,G,A,Sub,0,73,62.8,500,na,RUNX1,CCDS13639.1,c.422C>T,p.S141L,ONCOGENIC 58165573,PD7116a,21,36252940,G,A,Sub,0,73,37.57,354,na,RUNX1,CCDS13639.1,c.422C>T,p.S141L,ONCOGENIC 
58178762,PD6481a,21,36252940,G,A,Sub,0,73,13.44,439,na,RUNX1,CCDS13639.1,c.422C>T,p.S141L,ONCOGENIC 68092364,PD8729a,21,36252940,G,A,Sub,0,73,27.84,176,na,RUNX1,CCDS13639.1,c.422C>T,p.S141L,ONCOGENIC 58179554,PD5773a,21,36252945,G,C,Sub,0,68,47.6,500,na,RUNX1,CCDS13639.1,c.417C>G,p.N139K,POSSIBLE ONCOGENIC 58072931,PD6985a,21,36259153,G,A,Sub,0,36,37.5,8,na,RUNX1,CCDS13639.1,c.338C>T,p.P113L,ONCOGENIC 58098446,PD6783a,21,36259153,G,T,Sub,0,36,26.47,34,na,RUNX1,CCDS13639.1,c.338C>A,p.P113H,ONCOGENIC 66786084,PD7384a,21,36259156,A,T,Sub,0,35,58.33,12,na,RUNX1,CCDS13639.1,c.335T>A,p.L112Q,ONCOGENIC 58128568,PD5724a,21,36259171,C,G,Sub,0,40,31.15,61,na,RUNX1,CCDS13639.1,c.320G>C,p.R107P,ONCOGENIC 58161964,PD7026a,21,36259171,C,T,Sub,0,40,24.44,45,na,RUNX1,CCDS13639.1,c.320G>A,p.R107H,ONCOGENIC 58198163,PD7001a,21,36259171,C,T,Sub,0,40,35.71,28,na,RUNX1,CCDS13639.1,c.320G>A,p.R107H,ONCOGENIC 58114446,PD6287a,21,36259172,G,A,Sub,0,42,43.48,46,na,RUNX1,CCDS13639.1,c.319C>T,p.R107C,ONCOGENIC 58152473,PD5748a,21,36259172,G,A,Sub,0,42,33.33,48,na,RUNX1,CCDS13639.1,c.319C>T,p.R107C,ONCOGENIC 58184694,PD6254a,21,36259176,G,C,Sub,0,42,30.43,23,na,RUNX1,CCDS13639.1,c.315C>G,p.H105Q,ONCOGENIC 58093208,PD6318a,21,36259177,T,A,Sub,0,42,15.15,33,na,RUNX1,CCDS13639.1,c.314A>T,p.H105L,ONCOGENIC 58107393,PD6188a,21,36259192,G,A,Sub,0,38,53.7,54,na,RUNX1,CCDS13639.1,c.299C>T,p.S100F,ONCOGENIC 208336413,PD6948a,21,36259199,-,CAGAG,I,0,43,13.51351351,35,0,RUNX1,CCDS13639.1,c.291_292insCTCTG,p.V101fs*23,ONCOGENIC 209616883,PD6278a,21,36259233,-,AC,I,0,22,33.33333333,24,0,RUNX1,CCDS13639.1,c.257_258insGT,p.G87fs*36,ONCOGENIC 209593112,PD6277a,21,36259239,-,G,I,0,22,28.57142857,28,2,RUNX1,CCDS13639.1,c.251_252insC,p.H85fs*53,ONCOGENIC 58157219,PD6279a,21,36421141,C,T,Sub,0,16,46.3,432,na,RUNX1,CCDS13639.1,c.56G>A,p.R19K,POSSIBLE ONCOGENIC 66876062,PD8735a,21,39755579,C,T,Sub,0,153,10.91,55,na,ERG,CCDS46648.1,c.1207G>A,p.A403T,UNKNOWN 
58113755,PD6810a,21,39755697,G,C,Sub,0,93,39.88,168,na,ERG,CCDS46648.1,c.1089C>G,p.S363R,UNKNOWN 58101210,PD6133a,21,39762946,C,T,Sub,0,66,11.05,353,na,ERG,CCDS46648.1,c.911G>A,p.G304E,UNKNOWN 58194901,PD6905a,21,39762956,G,A,Sub,0,73,10.03,299,na,ERG,CCDS46648.1,c.901C>T,p.Q301*,UNKNOWN 207954552,PD6985a,21,39764346,-,G,I,0,8,16.66666667,36,6,ERG,CCDS46648.1,c.786_787insC,p.R263fs*29,UNKNOWN 208023520,PD5722a,21,39764363,-,GGTA,I,,6,40.58577406,239,1,ERG,CCDS46648.1,c.769_770insTACC,p.Y259fs*3,UNKNOWN 67016504,PD7386a,21,39774502,C,T,Sub,0,19,54.55,77,na,ERG,CCDS46648.1,c.671G>A,p.R224Q,UNKNOWN 58080196,PD7002a,21,39775530,C,T,Sub,0,94,50.39,387,na,ERG,CCDS46648.1,c.511G>A,p.D171N,UNKNOWN 58091741,PD6782a,21,39947603,G,A,Sub,0,20,17.07,82,na,ERG,CCDS46648.1,c.22C>T,p.P8S,UNKNOWN 58073174,PD6941a,21,44513236,A,T,Sub,0,26,45.08,122,na,U2AF1,CCDS13694.1,c.699T>A,p.D233E,POSSIBLE ONCOGENIC 58078424,PD6515a,21,44514777,T,G,Sub,0.65,155,34.03,191,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58080902,PD6251a,21,44514777,T,G,Sub,0.65,155,45.96,235,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58091695,PD6506a,21,44514777,T,G,Sub,0.65,155,48.29,263,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58094268,PD6853a,21,44514777,T,G,Sub,0.65,155,46.37,179,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58101027,PD6539a,21,44514777,T,G,Sub,0.65,155,38.49,252,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58104310,PD6240a,21,44514777,T,G,Sub,0.65,155,36.17,282,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58104990,PD6283a,21,44514777,T,C,Sub,0,155,53.06,245,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58106539,PD6088a,21,44514777,T,G,Sub,0,155,11.59,233,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58107395,PD6188a,21,44514777,T,G,Sub,0.65,155,46.46,198,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58112889,PD6191a,21,44514777,T,G,Sub,0.65,155,48.98,147,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 
58114731,PD6093a,21,44514777,T,G,Sub,0,155,39.37,254,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58130221,PD5756a,21,44514777,T,G,Sub,0,155,45.26,232,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58132682,PD6160a,21,44514777,T,C,Sub,0,155,46.38,69,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58136450,PD6246a,21,44514777,T,G,Sub,0.65,155,37.66,231,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58143320,PD5726a,21,44514777,T,C,Sub,0,155,50.42,240,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58155412,PD6241a,21,44514777,T,G,Sub,0.65,155,28.92,249,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58158860,PD6173a,21,44514777,T,C,Sub,0,155,42.86,126,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58162804,PD6950a,21,44514777,T,G,Sub,0.65,155,26.19,294,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58167999,PD6988a,21,44514777,T,G,Sub,0.65,155,12,200,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58177618,PD5732a,21,44514777,T,G,Sub,0,155,24.48,339,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58179638,PD6780a,21,44514777,T,C,Sub,0,155,63.27,49,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58180654,PD6474a,21,44514777,T,C,Sub,0,155,42.2,282,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58182228,PD6502a,21,44514777,T,G,Sub,0.65,155,41.2,216,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58192366,PD6534a,21,44514777,T,G,Sub,0.65,155,26.64,229,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58199805,PD6953a,21,44514777,T,G,Sub,0.65,155,20.76,342,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58202746,PD6909a,21,44514777,T,G,Sub,0.65,155,12.78,227,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 58204211,PD6484a,21,44514777,T,G,Sub,0.65,155,46.32,190,na,U2AF1,CCDS13694.1,c.470A>C,p.Q157P,ONCOGENIC 66782288,PD7385a,21,44514777,T,C,Sub,0,155,41.3,92,na,U2AF1,CCDS13694.1,c.470A>G,p.Q157R,ONCOGENIC 58106917,PD6986a,21,44514780,C,T,Sub,0,149,23.32,223,na,U2AF1,CCDS13694.1,c.467G>A,p.R156H,ONCOGENIC 
66831447,PD7372a,21,44514780,C,T,Sub,0,149,41.94,93,na,U2AF1,CCDS13694.1,c.467G>A,p.R156H,ONCOGENIC 58072295,PD6926a,21,44524456,G,A,Sub,0,79,34.35,329,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58075393,PD6969a,21,44524456,G,T,Sub,0,79,41.19,318,na,U2AF1,CCDS13694.1,c.101C>A,p.S34Y,ONCOGENIC 58077603,PD6053a,21,44524456,G,A,Sub,0,79,46.93,326,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58080735,PD7117a,21,44524456,G,A,Sub,0,79,42.54,355,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58110273,PD6266a,21,44524456,G,A,Sub,0,79,44.83,377,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58116268,PD6263a,21,44524456,G,A,Sub,0,79,39.09,307,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58125867,PD6242a,21,44524456,G,A,Sub,0,79,47.61,376,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58165741,PD6496a,21,44524456,G,A,Sub,0,79,49.34,379,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58168705,PD6195a,21,44524456,G,A,Sub,0,79,23.91,138,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58185117,PD6485a,21,44524456,G,A,Sub,0,79,45.95,309,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58188079,PD6903a,21,44524456,G,A,Sub,0,79,10.76,446,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58189506,PD5737a,21,44524456,G,A,Sub,0,79,39.71,408,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58189673,PD6147a,21,44524456,G,A,Sub,0,79,55.09,167,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58196897,PD5728a,21,44524456,G,A,Sub,0,79,13.95,337,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 66782289,PD7385a,21,44524456,G,A,Sub,0,79,44.76,105,na,U2AF1,CCDS13694.1,c.101C>T,p.S34F,ONCOGENIC 58133904,PD6058a,22,28146963,C,T,Sub,0,21,36.45,107,na,MN1,CCDS42998.1,c.3903G>A,p.W1301*,UNKNOWN 58112081,PD7010a,22,28192871,C,T,Sub,0,96,11.43,70,na,MN1,CCDS42998.1,c.3661G>A,p.A1221T,UNKNOWN 58114089,PD6990a,22,28192975,G,C,Sub,0,24,50,86,na,MN1,CCDS42998.1,c.3557C>G,p.A1186G,UNKNOWN 66960456,PD8734a,22,28193250,G,C,Sub,0,20,53.66,41,na,MN1,CCDS42998.1,c.3282C>G,p.H1094Q,UNKNOWN 
58076377,PD6795a,22,28193318,C,G,Sub,0,35,44.35,115,na,MN1,CCDS42998.1,c.3214G>C,p.A1072P,UNKNOWN 208219895,PD7017a,22,28193573,-,C,I,0,9,13.46153846,52,5,MN1,CCDS42998.1,c.2958_2959insG,p.S989fs*27,UNKNOWN 208015466,PD6110a,22,28193689,-,T,I,0,8,39.39393939,33,5,MN1,CCDS42998.1,c.2842_2843insA,p.R948fs*13,UNKNOWN 58071782,PD6110a,22,28193920,G,A,Sub,0,36,23.81,21,na,MN1,CCDS42998.1,c.2612C>T,p.A871V,UNKNOWN 58151161,PD6529a,22,28193933,C,T,Sub,0,41,45.57,79,na,MN1,CCDS42998.1,c.2599G>A,p.G867S,UNKNOWN 58169664,PD7073a,22,28195036,G,A,Sub,0,15,40.78,103,na,MN1,CCDS42998.1,c.1496C>T,p.T499I,UNKNOWN 66960457,PD8734a,22,28195054,C,G,Sub,0,14,44.23,52,na,MN1,CCDS42998.1,c.1478G>C,p.G493A,UNKNOWN 208015467,PD6110a,22,28195190,-,G,I,0,18,23.07692308,39,6,MN1,CCDS42998.1,c.1341_1342insC,p.Y450fs*35,UNKNOWN 208452026,PD6819a,22,28195603,tgc,-,D,0,4,22.22222222,43,6,MN1,CCDS42998.1,c.927_929delGCA,p.Q309delQ,UNKNOWN 58163067,PD7077a,22,30733025,C,T,Sub,0,78,13.57,221,na,SF3A1,CCDS13875.1,c.2096G>A,p.R699H,UNKNOWN 66837991,PD8739a,22,30733176,A,C,Sub,0,25,66.67,12,na,SF3A1,CCDS13875.1,c.1952-7T>G,p.?,UNKNOWN 66763520,PD8936a,22,30740919,C,T,Sub,0,65,17.39,23,na,SF3A1,CCDS13875.1,c.651+3G>A,p.?,UNKNOWN 58071784,PD6110a,22,30742471,C,T,Sub,0,82,18.22,258,na,SF3A1,CCDS13875.1,c.223G>A,p.E75K,UNKNOWN 58117145,PD6185a,22,30742471,C,T,Sub,0,82,13.93,201,na,SF3A1,CCDS13875.1,c.223G>A,p.E75K,UNKNOWN 58136261,PD6236a,22,41489046,C,T,Sub,0,15,11.54,78,na,EP300,CCDS14010.1,c.38C>T,p.A13V,UNKNOWN 58160475,PD7045a,22,41513408,G,A,Sub,0,83,10.2,500,na,EP300,CCDS14010.1,c.312G>A,p.M104I,UNKNOWN 66876142,PD8735a,22,41513490,A,T,Sub,0,64,50,254,na,EP300,CCDS14010.1,c.394A>T,p.T132S,UNKNOWN 58102120,PD6329a,22,41513629,T,A,Sub,0,75,52.8,500,na,EP300,CCDS14010.1,c.533T>A,p.L178*,ONCOGENIC 66971261,PD7379a,22,41521886,C,T,Sub,0,181,10.28,107,na,EP300,CCDS14010.1,c.748C>T,p.P250S,UNKNOWN 58204929,PD6082a,22,41521902,C,T,Sub,0,189,48.71,464,na,EP300,CCDS14010.1,c.764C>T,p.S255L,UNKNOWN 
58181521,PD6816a,22,41522037,C,T,Sub,0,222,10.91,220,na,EP300,CCDS14010.1,c.899C>T,p.P300L,UNKNOWN 58165818,PD6496a,22,41523665,G,A,Sub,0,42,50,224,na,EP300,CCDS14010.1,c.1081G>A,p.V361M,UNKNOWN 58101385,PD6133a,22,41527551,C,T,Sub,0,124,52.71,387,na,EP300,CCDS14010.1,c.1442C>T,p.P481L,UNKNOWN 58138487,PD6918a,22,41527551,C,A,Sub,0,124,53.52,284,na,EP300,CCDS14010.1,c.1442C>A,p.P481Q,UNKNOWN 58133754,PD6163a,22,41533706,A,G,Sub,0,132,53.44,393,na,EP300,CCDS14010.1,c.1672A>G,p.T558A,UNKNOWN 209548892,PD7100a,22,41536189,-,A,I,0,217,21.84684685,444,1,EP300,CCDS14010.1,c.1806_1807insA,p.R603fs*14,ONCOGENIC 58179616,PD5773a,22,41537071,T,C,Sub,0,75,45.88,170,na,EP300,CCDS14010.1,c.1898T>C,p.L633P,UNKNOWN 58071776,PD6110a,22,41545097,C,T,Sub,0,297,14.7,313,na,EP300,CCDS14010.1,c.2297C>T,p.P766L,UNKNOWN 58173440,PD6879a,22,41545159,G,A,Sub,0.54,185,43.52,409,na,EP300,CCDS14010.1,c.2359G>A,p.G787S,UNKNOWN 58079401,PD6822a,22,41545841,G,A,Sub,0,80,10.03,309,na,EP300,CCDS14010.1,c.2456G>A,p.C819Y,UNKNOWN 58091910,PD6782a,22,41545897,C,A,Sub,0,52,38.2,500,na,EP300,CCDS14010.1,c.2512C>A,p.R838S,UNKNOWN 66783259,PD7371a,22,41546030,C,G,Sub,0,160,53.13,160,na,EP300,CCDS14010.1,c.2645C>G,p.P882R,UNKNOWN 58087369,PD6282a,22,41546053,A,C,Sub,0.7,143,43.44,343,na,EP300,CCDS14010.1,c.2668A>C,p.T890P,UNKNOWN 66878371,PD8934a,22,41546146,G,A,Sub,0,45,31.25,48,na,EP300,CCDS14010.1,c.2761G>A,p.A921T,UNKNOWN 58145424,PD5725a,22,41548339,C,T,Sub,0,13,33.33,183,na,EP300,CCDS14010.1,c.3127C>T,p.Q1043*,ONCOGENIC 58099946,PD6272a,22,41551112,A,G,Sub,0,313,48.34,391,na,EP300,CCDS14010.1,c.3256A>G,p.I1086V,UNKNOWN 58095473,PD6100a,22,41554480,C,T,Sub,0,36,10.06,159,na,EP300,CCDS14010.1,c.3566C>T,p.A1189V,UNKNOWN 66948348,PD7373a,22,41556657,G,A,Sub,0,97,47.52,101,na,EP300,CCDS14010.1,c.3602G>A,p.C1201Y,UNKNOWN 66763514,PD8936a,22,41556690,G,A,Sub,0,112,15.94,69,na,EP300,CCDS14010.1,c.3635G>A,p.S1212N,UNKNOWN 
58109480,PD6114a,22,41556714,C,G,Sub,0,115,43.96,455,na,EP300,CCDS14010.1,c.3659C>G,p.S1220C,UNKNOWN 58191734,PD6781a,22,41560116,A,T,Sub,0,232,51.33,300,na,EP300,CCDS14010.1,c.3788A>T,p.E1263V,UNKNOWN 58179262,PD6784a,22,41564801,G,A,Sub,0,182,13.2,500,na,EP300,CCDS14010.1,c.4102G>A,p.G1368S,UNKNOWN 58095474,PD6100a,22,41564844,G,A,Sub,0,254,11.66,223,na,EP300,CCDS14010.1,c.4145G>A,p.G1382D,UNKNOWN 209726369,PD6898a,22,41565517,at,-,D,0,54,28.2115869,392,2,EP300,CCDS14010.1,c.4183_4184delAT,p.S1396fs*4,ONCOGENIC 58119754,PD6218a,22,41565575,A,G,Sub,0,55,19.56,409,na,EP300,CCDS14010.1,c.4241A>G,p.Y1414C,UNKNOWN 58179687,PD6780a,22,41566439,C,A,Sub,0,76,12.5,96,na,EP300,CCDS14010.1,c.4316C>A,p.P1439Q,UNKNOWN 58125269,PD6194a,22,41568635,C,T,Sub,0,188,16.25,240,na,EP300,CCDS14010.1,c.4585C>T,p.R1529*,ONCOGENIC 58138309,PD6823a,22,41568650,A,G,Sub,0,179,44.79,384,na,EP300,CCDS14010.1,c.4600A>G,p.S1534G,UNKNOWN 58112077,PD7010a,22,41569723,G,A,Sub,0,32,32.91,237,na,EP300,CCDS14010.1,c.4714G>A,p.G1572R,UNKNOWN 58109312,PD6077a,22,41573350,C,T,Sub,0,47,48.4,500,na,EP300,CCDS14010.1,c.5635C>T,p.P1879S,UNKNOWN 58181525,PD6816a,22,41573981,C,G,Sub,0,38,17.6,500,na,EP300,CCDS14010.1,c.6266C>G,p.A2089G,UNKNOWN 58111220,PD6930a,22,41574238,A,C,Sub,0,185,44.95,465,na,EP300,CCDS14010.1,c.6523A>C,p.M2175L,UNKNOWN 58129798,PD6974a,22,41574238,A,C,Sub,0,185,48,500,na,EP300,CCDS14010.1,c.6523A>C,p.M2175L,UNKNOWN 207953256,PD6494a,22,41574341,accagttccagc,-,D,0,53,35.90733591,178,1,EP300,CCDS14010.1,c.6626_6637delACCAGTTCCAGC,p.N2209_Q2213>K,ONCOGENIC 208116011,PD6544a,22,41574341,accagttccagc,-,D,0,53,35.22167488,277,1,EP300,CCDS14010.1,c.6626_6637delACCAGTTCCAGC,p.N2209_Q2213>K,ONCOGENIC 207942264,PD5724a,22,41574511,cag,-,D,0,45,32.98969072,431,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208067275,PD6250a,22,41574511,cag,-,D,0,45,36.72839506,278,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 
208182458,PD5779a,22,41574511,cag,-,D,0,45,37.64172336,385,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208183479,PD6306a,22,41574511,cag,-,D,0,45,37.20930233,189,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208212969,PD6310a,22,41574511,cag,-,D,0,45,35.84337349,289,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208224179,PD6514a,22,41574511,cag,-,D,0,45,35.4005168,336,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208408364,PD6987a,22,41574511,cag,-,D,0,45,32.0441989,303,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208497932,PD6259a,22,41574511,cag,-,D,0,45,39.71428571,297,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 208526005,PD6296a,22,41574511,cag,-,D,0,45,37.90087464,292,1,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 207929096,PD7120a,22,41574511,cag,-,D,0,45,21.15384615,364,na,EP300,CCDS14010.1,c.6796_6798delCAG,p.Q2266delQ,UNKNOWN 58086483,PD7017a,22,41574581,C,T,Sub,0,48,15.6,500,na,EP300,CCDS14010.1,c.6866C>T,p.A2289V,UNKNOWN 66858227,PD8935a,22,41574776,C,T,Sub,0,76,10.53,95,na,EP300,CCDS14010.1,c.7061C>T,p.A2354V,UNKNOWN 58151485,PD6954a,22,41574829,A,G,Sub,0,70,50.65,308,na,EP300,CCDS14010.1,c.7114A>G,p.M2372V,UNKNOWN 66878374,PD8934a,22,41574841,C,A,Sub,0,66,8.86,158,na,EP300,CCDS14010.1,c.7126C>A,p.L2376I,UNKNOWN 208411880,PD6843a,X,15809103,c,-,D,0,149,84.4765343,275,1,ZRSR2,CCDS14172.1,c.88delC,p.R30fs*8,ONCOGENIC 207944956,PD6206a,X,15809111,-,CTGA,I,0,145,70.39800995,325,0,ZRSR2,CCDS14172.1,c.96_97insCTGA,p.E33fs*23,ONCOGENIC 58178774,PD6481a,X,15809136,G,C,Sub,0,107,89.26,270,na,ZRSR2,CCDS14172.1,c.121G>C,p.G41R,UNKNOWN 68196030,PD8730a,X,15818015,G,A,Sub,0,88,100,12,na,ZRSR2,CCDS14172.1,c.142G>A,p.E48K,UNKNOWN 58177261,PD6947a,X,15818018,G,C,Sub,0,88,44.98,289,na,ZRSR2,CCDS14172.1,c.145G>C,p.E49Q,UNKNOWN 58154735,PD6808a,X,15821832,G,A,Sub,0,11,69.05,84,na,ZRSR2,CCDS14172.1,c.225G>A,p.W75*,ONCOGENIC 
68093012,PD8728a,X,15821860,G,A,Sub,0,14,28,25,na,ZRSR2,CCDS14172.1,c.253G>A,p.E85K,UNKNOWN 58085800,PD6238a,X,15822249,C,T,Sub,0,163,65.82,79,na,ZRSR2,CCDS14172.1,c.328C>T,p.Q110*,ONCOGENIC 207995109,PD6872a,X,15822289,a,-,D,0,181,70.87378641,102,1,ZRSR2,CCDS14172.1,c.368delA,p.E123fs*42,ONCOGENIC 66848149,PD7377a,X,15822318,G,T,Sub,0,207,97.3,37,na,ZRSR2,CCDS14172.1,c.397G>T,p.E133*,ONCOGENIC 68195819,PD8731a,X,15827426,G,T,Sub,0,257,96.39,83,na,ZRSR2,CCDS14172.1,c.542G>T,p.C181F,ONCOGENIC 58193066,PD6245a,X,15827427,C,A,Sub,0,258,88.89,135,na,ZRSR2,CCDS14172.1,c.543C>A,p.C181*,ONCOGENIC 58114748,PD6093a,X,15827438,A,G,Sub,0,247,84.58,214,na,ZRSR2,CCDS14172.1,c.554A>G,p.D185G,POSSIBLE ONCOGENIC 58170980,PD5786a,X,15833805,C,G,Sub,0,107,48.94,94,na,ZRSR2,CCDS14172.1,c.563C>G,p.S188*,ONCOGENIC 58090323,PD6162a,X,15833813,C,T,Sub,0,107,16.67,108,na,ZRSR2,CCDS14172.1,c.571C>T,p.H191Y,ONCOGENIC 58165550,PD6787a,X,15833945,C,T,Sub,0,139,81.67,120,na,ZRSR2,CCDS14172.1,c.703C>T,p.Q235*,ONCOGENIC 66858220,PD8935a,X,15833958,T,A,Sub,0,142,16.67,60,na,ZRSR2,CCDS14172.1,c.716T>A,p.F239Y,ONCOGENIC 344759847,PD7364a,X,15833998,-,A,I,0,139,7.936507937,63,3,ZRSR2,CCDS14172.1,c.756_757insA,p.V253fs*36,ONCOGENIC 58077356,PD6937a,X,15838340,T,C,Sub,0,165,25.55,274,na,ZRSR2,CCDS14172.1,c.838T>C,p.C280R,ONCOGENIC 58092648,PD7016a,X,15838345,A,T,Sub,0,171,22.77,202,na,ZRSR2,CCDS14172.1,c.843A>T,p.Q281H,POSSIBLE ONCOGENIC 208519792,PD6252a,X,15838352,-,T,I,0,197,65.51724138,203,3,ZRSR2,CCDS14172.1,c.850_851insT,p.S285fs*4,ONCOGENIC 208184900,PD6513a,X,15838361,t,-,D,0,211,85.71428571,208,3,ZRSR2,CCDS14172.1,c.859delT,p.F287fs*18,ONCOGENIC 58077586,PD6303a,X,15838370,C,T,Sub,0,212,77.88,217,na,ZRSR2,CCDS14172.1,c.868C>T,p.R290*,ONCOGENIC 58181699,PD6510a,X,15838385,C,T,Sub,0,216,88.06,201,na,ZRSR2,CCDS14172.1,c.883C>T,p.R295*,ONCOGENIC 208304048,PD6537a,X,15838434,t,-,D,0,159,76.23762376,201,3,ZRSR2,CCDS14172.1,c.932delT,p.C312fs*>177,ONCOGENIC 
58163527,PD6544a,X,15838436,T,C,Sub,0,150,81.31,198,na,ZRSR2,CCDS14172.1,c.934T>C,p.C312R,POSSIBLE ONCOGENIC 58074487,PD6101a,X,15840871,C,T,Sub,0,106,61.96,92,na,ZRSR2,CCDS14172.1,c.955C>T,p.Q319*,ONCOGENIC 58099610,PD7008a,X,15840936,G,A,Sub,0,107,90.07,151,na,ZRSR2,CCDS14172.1,c.1020G>A,p.W340*,ONCOGENIC 58191739,PD6781a,X,15840936,G,A,Sub,0,107,76.67,120,na,ZRSR2,CCDS14172.1,c.1020G>A,p.W340*,ONCOGENIC 208500046,PD6848a,X,15841010,a,-,D,0,83,60.18957346,208,1,ZRSR2,CCDS14172.1,c.1094delA,p.E365fs*>124,ONCOGENIC 58121245,PD6791a,X,15841035,C,G,Sub,0,68,93.33,120,na,ZRSR2,CCDS14172.1,c.1119C>G,p.Y373*,ONCOGENIC 208388979,PD6072a,X,15841093,g,-,D,0,61,80.92485549,173,4,ZRSR2,CCDS14172.1,c.1177delG,p.E394fs*>95,ONCOGENIC 208328909,PD7033a,X,15841259,-,GAGCCGGAGCCG,I,0,18,47.22222222,22,0,ZRSR2,CCDS14172.1,c.1343_1344insGAGCCGGAGCCG,p.R448_R449insSRSR,ONCOGENIC 208385214,PD6783a,X,39913178,-,G,I,0,94,25.92592593,81,6,BCOR,CCDS48093.1,c.4936_4937insC,p.L1646fs*6,ONCOGENIC 66769883,PD8738a,X,39913184,G,A,Sub,0,84,15.63,64,na,BCOR,CCDS48093.1,c.4931C>T,p.T1644I,UNKNOWN 208020303,PD6233a,X,39913253,g,-,D,0,53,58.18181818,110,6,BCOR,CCDS48093.1,c.4862delC,p.P1621fs*53,ONCOGENIC 208344775,PD6195a,X,39913510,a,-,D,0,9,50,32,1,BCOR,CCDS48093.1,c.4818delT,p.C1606fs*12,ONCOGENIC 208103535,PD7117a,X,39913510,A,-,D,0,9,74.46808511,47,na,BCOR,CCDS48093.1,c.4818delT,p.C1606fs*12,ONCOGENIC 208147576,PD7020a,X,39913527,aga,-,D,0,12,84.44444444,76,1,BCOR,CCDS48093.1,c.4799_4801delTCT,p.F1600delF,ONCOGENIC 66848150,PD7377a,X,39914732,C,T,Sub,0,169,14.29,49,na,BCOR,CCDS48093.1,c.4630G>A,p.E1544K,UNKNOWN 58094774,PD5718a,X,39914753,C,T,Sub,0,167,23.85,260,na,BCOR,CCDS48093.1,c.4609G>A,p.A1537T,UNKNOWN 58118329,PD6081a,X,39916421,G,A,Sub,0,149,15.38,117,na,BCOR,CCDS48093.1,c.4582C>T,p.Q1528*,ONCOGENIC 66962476,PD9663a,X,39921415,G,A,Sub,0,12,43.75,16,na,BCOR,CCDS48093.1,c.4405C>T,p.R1469W,UNKNOWN 
58132948,PD5720a,X,39921636,C,T,Sub,0,21,44.64,56,na,BCOR,CCDS48093.1,c.4184G>A,p.R1395Q,UNKNOWN 58114093,PD6990a,X,39922048,C,T,Sub,0,87,36.8,125,na,BCOR,CCDS48093.1,c.4124G>A,p.R1375Q,UNKNOWN 58138124,PD6803a,X,39922073,G,C,Sub,0,77,26.92,78,na,BCOR,CCDS48093.1,c.4099C>G,p.H1367D,UNKNOWN 208450690,PD6242a,X,39922216,gcaggcggcc,-,D,0,26,85,31,1,BCOR,CCDS48093.1,c.3947_3956delGGCCGCCTGC,p.R1316fs*50,ONCOGENIC 208512311,PD6157a,X,39923025,T,C,DI,0,248,21.11111111,86,0,BCOR,CCDS48093.1,-,-,ONCOGENIC 58182549,PD6809a,X,39923059,G,A,Sub,0,180,81.19,101,na,BCOR,CCDS48093.1,c.3649C>T,p.R1217*,ONCOGENIC 207982451,PD5768a,X,39923086,-,T,I,0,156,41.93548387,124,6,BCOR,CCDS48093.1,c.3621_3622insA,p.Q1208fs*8,ONCOGENIC 66786199,PD7384a,X,39923140,C,T,Sub,0,91,13.33,30,na,BCOR,CCDS48093.1,c.3568G>A,p.E1190K,UNKNOWN 58138328,PD6823a,X,39923726,G,A,Sub,0,60,20.83,24,na,BCOR,CCDS48093.1,c.3365C>T,p.S1122L,UNKNOWN 208192728,PD6968a,X,39930936,-,GAAG,I,0,110,15.78947368,126,0,BCOR,CCDS48093.1,c.3004_3005insCTTC,p.G1002fs*17,ONCOGENIC 58181513,PD6816a,X,39931699,G,A,Sub,0,62,11.11,63,na,BCOR,CCDS48093.1,c.2900C>T,p.A967V,UNKNOWN 58117498,PD6120a,X,39932081,G,A,Sub,0,131,44.74,114,na,BCOR,CCDS48093.1,c.2518C>T,p.P840S,UNKNOWN 208464099,PD6824a,X,39932084,-,G,I,0,143,12.28070175,114,6,BCOR,CCDS48093.1,c.2514_2515insC,p.K839fs*5,ONCOGENIC 209559148,PD6057a,X,39932084,-,G,I,0,148,16.21621622,37,6,BCOR,CCDS48093.1,c.2514_2515insC,p.K839fs*5,ONCOGENIC 208458910,PD6287a,X,39932467,g,-,D,0,96,47.5,40,2,BCOR,CCDS48093.1,c.2132delC,p.P711fs*4,ONCOGENIC 58068198,PD6142a,X,39932612,G,T,Sub,0,55,43.51,131,na,BCOR,CCDS48093.1,c.1987C>A,p.P663T,UNKNOWN 58112347,PD7040a,X,39932713,T,C,Sub,0,96,69.92,123,na,BCOR,CCDS48093.1,c.1886A>G,p.N629S,UNKNOWN 58161874,PD6917a,X,39933164,C,T,Sub,0,124,97.08,137,na,BCOR,CCDS48093.1,c.1435G>A,p.G479R,UNKNOWN 208390522,PD6905a,X,39933219,-,T,I,0,135,16.72597865,281,5,BCOR,CCDS48093.1,c.1379_1380insA,p.M461fs*21,ONCOGENIC 
58195050,PD6905a,X,39933337,C,A,Sub,0,35,40.34,176,na,BCOR,CCDS48093.1,c.1262G>T,p.G421V,UNKNOWN 208261383,PD6841a,X,39933593,-,G,I,0,44,23.07692308,26,6,BCOR,CCDS48093.1,c.1005_1006insC,p.S336fs*45,ONCOGENIC 58152450,PD5746a,X,39933665,G,A,Sub,0,51,18.18,77,na,BCOR,CCDS48093.1,c.934C>T,p.Q312*,ONCOGENIC 66786200,PD7384a,X,39933929,G,A,Sub,0,143,59.38,32,na,BCOR,CCDS48093.1,c.670C>T,p.Q224*,ONCOGENIC 208481455,PD6999a,X,39933935,g,-,D,0,143,82.03125,127,1,BCOR,CCDS48093.1,c.664delC,p.L222fs*44,ONCOGENIC 58194027,PD7110a,X,39934411,C,T,Sub,0,52,48.68,76,na,BCOR,CCDS48093.1,c.188G>A,p.R63K,UNKNOWN 58095463,PD6100a,X,44820536,G,A,Sub,0,89,15.47,181,na,KDM6A,CCDS14265.1,c.233G>A,p.R78H,UNKNOWN 58092650,PD7016a,X,44896915,C,T,Sub,0,93,12.57,175,na,KDM6A,CCDS14265.1,c.635C>T,p.A212V,UNKNOWN 58112072,PD7010a,X,44896915,C,T,Sub,0,93,12.99,231,na,KDM6A,CCDS14265.1,c.635C>T,p.A212V,UNKNOWN 58081079,PD6251a,X,44918265,T,C,Sub,0,134,100,126,na,KDM6A,CCDS14265.1,c.890T>C,p.I297T,UNKNOWN 58095466,PD6100a,X,44919382,C,T,Sub,0,80,14.2,345,na,KDM6A,CCDS14265.1,c.1310C>T,p.A437V,UNKNOWN 58109969,PD6175a,X,44920653,C,T,Sub,0,81,21.33,300,na,KDM6A,CCDS14265.1,c.1414C>T,p.Q472*,ONCOGENIC 58079607,PD6098a,X,44922998,A,T,Sub,0,48,11.93,109,na,KDM6A,CCDS14265.1,c.1859A>T,p.N620I,UNKNOWN 68093014,PD8728a,X,44928837,G,A,Sub,0,118,10.71,224,na,KDM6A,CCDS14265.1,c.1937G>A,p.G646D,UNKNOWN 58086473,PD7017a,X,44929028,C,T,Sub,0,84,25.57,176,na,KDM6A,CCDS14265.1,c.2128C>T,p.Q710*,ONCOGENIC 58156393,PD6813a,X,44929164,C,T,Sub,0,68,47.8,500,na,KDM6A,CCDS14265.1,c.2264C>T,p.T755M,UNKNOWN 58151472,PD6954a,X,44938595,G,A,Sub,0,130,11.08,397,na,KDM6A,CCDS14265.1,c.3143G>A,p.R1048K,UNKNOWN 58118333,PD6081a,X,44949095,G,A,Sub,0,124,13.8,500,na,KDM6A,CCDS14265.1,c.3656G>A,p.W1219*,ONCOGENIC 67026176,PD8740a,X,44950058,A,G,Sub,0,240,100,218,na,KDM6A,CCDS14265.1,c.3827A>G,p.N1276S,UNKNOWN 68196228,PD8645a,X,44966713,G,A,Sub,0,181,14.86,175,na,KDM6A,CCDS14265.1,c.3937G>A,p.A1313T,UNKNOWN 
66858223,PD8935a,X,44969472,A,G,Sub,0,212,5.88,221,na,KDM6A,CCDS14265.1,c.4154A>G,p.Q1385R,UNKNOWN 58160577,PD7045a,X,48650422,G,A,Sub,0,106,28,25,na,GATA1,CCDS14305.1,c.392G>A,p.S131N,UNKNOWN 58079374,PD6822a,X,48650448,C,T,Sub,0,109,10,50,na,GATA1,CCDS14305.1,c.418C>T,p.R140W,UNKNOWN 66887288,PD7382a,X,48650564,C,A,Sub,0,92,47.59,187,na,GATA1,CCDS14305.1,c.534C>A,p.S178R,UNKNOWN 66997178,PD9660a,X,48650880,G,A,Sub,0,66,48.04,179,na,GATA1,CCDS14305.1,c.744+5G>A,p.?,UNKNOWN 58115136,PD6982a,X,48652357,G,T,Sub,0,17,12.2,41,na,GATA1,CCDS14305.1,c.1028G>T,p.S343I,UNKNOWN 208203853,PD6081a,X,48652402,-,C,I,0,32,25,24,6,GATA1,CCDS14305.1,c.1073_1074insC,p.G359fs*19,UNKNOWN 209546581,PD6140a,X,76777761,t,-,D,0,96,9.102244389,802,2,ATRX,CCDS14434.1,c.6955delA,p.M2319fs*0,ONCOGENIC 58105246,PD6944a,X,76814180,G,T,Sub,0,48,90.42,240,na,ATRX,CCDS14434.1,c.6464G>A,p.G2155E,UNKNOWN 58094771,PD5718a,X,76849245,C,T,Sub,0,46,10,100,na,ATRX,CCDS14434.1,c.6031G>A,p.A2011T,UNKNOWN 66878384,PD8934a,X,76849313,G,T,Sub,0,42,45.29,393,na,ATRX,CCDS14434.1,c.5963C>A,p.A1988D,UNKNOWN 66876149,PD8735a,X,76855214,C,T,Sub,0,30,6.31,222,na,ATRX,CCDS14434.1,c.5773G>A,p.D1925N,UNKNOWN 58145413,PD5725a,X,76889058,A,T,Sub,0,111,92.47,93,na,ATRX,CCDS14434.1,c.4952T>A,p.L1651H,UNKNOWN 58116005,PD7072a,X,76890119,A,T,Sub,0,87,11.35,423,na,ATRX,CCDS14434.1,c.4775T>A,p.L1592H,UNKNOWN 58130859,PD6824a,X,76937081,C,A,Sub,0,304,13.68,190,na,ATRX,CCDS14434.1,c.3667G>T,p.E1223*,ONCOGENIC 58073997,PD6891a,X,76937102,T,C,Sub,0,262,49.2,500,na,ATRX,CCDS14434.1,c.3646A>G,p.I1216V,UNKNOWN 58108570,PD7000a,X,76937683,C,T,Sub,0,359,49.4,500,na,ATRX,CCDS14434.1,c.3065G>A,p.R1022Q,UNKNOWN 58091991,PD6782a,X,76937978,G,T,Sub,0,299,13.2,500,na,ATRX,CCDS14434.1,c.2770C>A,p.L924I,UNKNOWN 58197362,PD6979a,X,76938264,C,G,Sub,0,424,99.58,473,na,ATRX,CCDS14434.1,c.2484G>C,p.M828I,UNKNOWN 66878385,PD8934a,X,76938281,T,A,Sub,0,373,5.5,109,na,ATRX,CCDS14434.1,c.2467A>T,p.K823*,ONCOGENIC 
66931138,PD7381a,X,76939475,T,G,Sub,0,243,100,131,na,ATRX,CCDS14434.1,c.1273A>C,p.K425Q,UNKNOWN 58080540,PD6545a,X,76939589,T,C,Sub,0,221,97.44,273,na,ATRX,CCDS14434.1,c.1159A>G,p.T387A,UNKNOWN 58179206,PD6784a,X,76939658,C,T,Sub,0,156,35.8,500,na,ATRX,CCDS14434.1,c.1090G>A,p.A364T,UNKNOWN 58187953,PD6895a,X,76939691,T,A,Sub,0,159,98.59,284,na,ATRX,CCDS14434.1,c.1057A>T,p.I353F,UNKNOWN 58138122,PD6803a,X,76949366,G,A,Sub,0.24,417,34.64,280,na,ATRX,CCDS14434.1,c.431C>T,p.P144L,UNKNOWN 58172306,PD6128a,X,76952181,A,G,Sub,0,190,100,189,na,ATRX,CCDS14434.1,c.254T>C,p.I85T,UNKNOWN 58202954,PD6946a,X,123176484,G,T,Sub,0,121,88.34,163,na,STAG2,CCDS43990.1,c.451G>T,p.E151*,ONCOGENIC 58152550,PD5748a,X,123179197,C,T,Sub,0,67,73.04,115,na,STAG2,CCDS43990.1,c.646C>T,p.R216*,ONCOGENIC 58184944,PD6276a,X,123179197,C,T,Sub,0,67,47.54,183,na,STAG2,CCDS43990.1,c.646C>T,p.R216*,ONCOGENIC 343751563,PD7372a,X,123179209,-,AGCA,I,0,70,37.03703704,78,1,STAG2,CCDS43990.1,c.658_659insAGCA,p.T220fs*20,ONCOGENIC 58087829,PD6509a,X,123181206,A,G,Sub,0,146,51.57,318,na,STAG2,CCDS43990.1,c.670A>G,p.M224V,UNKNOWN 58136435,PD6962a,X,123181311,C,T,Sub,0,103,17.78,225,na,STAG2,CCDS43990.1,c.775C>T,p.R259*,ONCOGENIC 208336336,PD6991a,X,123185197,at,-,D,0,148,9.756097561,286,2,STAG2,CCDS43990.1,c.1149_1150delAT,p.Y384fs*0,ONCOGENIC 58098609,PD6783a,X,123185213,C,T,Sub,0,138,84.29,140,na,STAG2,CCDS43990.1,c.1165C>T,p.Q389*,ONCOGENIC 58138323,PD6823a,X,123191793,T,G,Sub,0,61,25.12,207,na,STAG2,CCDS43990.1,c.1382T>G,p.V461G,UNKNOWN 208387048,PD6155a,X,123195109,-,GGGGA,I,0,271,16.38795987,257,0,STAG2,CCDS43990.1,c.1452_1453insGGGGA,p.W485fs*10,ONCOGENIC 58078385,PD5747a,X,123196764,A,G,Sub,0,140,41.25,257,na,STAG2,CCDS43990.1,c.1651A>G,p.K551E,UNKNOWN 58095468,PD6100a,X,123196795,C,T,Sub,0,140,14.29,28,na,STAG2,CCDS43990.1,c.1682C>T,p.T561I,UNKNOWN 208211077,PD6914a,X,123196814,-,T,I,0,149,35.12396694,242,5,STAG2,CCDS43990.1,c.1701_1702insT,p.A568fs*20,ONCOGENIC 
58130852,PD6824a,X,123196822,C,T,Sub,0,139,29.46,112,na,STAG2,CCDS43990.1,c.1709C>T,p.A570V,UNKNOWN 58072424,PD6926a,X,123197772,T,A,Sub,0,189,10.09,347,na,STAG2,CCDS43990.1,c.1896T>A,p.C632*,ONCOGENIC 208191819,PD6152a,X,123197783,-,A,I,0,173,64.97005988,334,1,STAG2,CCDS43990.1,c.1907_1908insA,p.Y636fs*0,ONCOGENIC 58085742,PD6179a,X,123199755,T,A,Sub,0,54,27.33,172,na,STAG2,CCDS43990.1,c.2055T>A,p.Y685*,ONCOGENIC 58079037,PD6504a,X,123200039,C,G,Sub,0,125,94.44,54,na,STAG2,CCDS43990.1,c.2111C>G,p.S704*,ONCOGENIC 58154227,PD7036a,X,123202473,C,A,Sub,0,35,15.11,139,na,STAG2,CCDS43990.1,c.2325C>A,p.Y775*,ONCOGENIC 209729172,PD6247a,X,123205012,tg,-,D,0,122,24.05063291,153,3,STAG2,CCDS43990.1,c.2372_2373delTG,p.C792fs*0,ONCOGENIC 66951399,PD7391a,X,123205069,T,G,Sub,0,149,49.59,244,na,STAG2,CCDS43990.1,c.2429T>G,p.L810*,ONCOGENIC 58200069,PD5736a,X,123210203,C,G,Sub,0,160,43.93,346,na,STAG2,CCDS43990.1,c.2555C>G,p.A852G,UNKNOWN 67008719,PD7388a,X,123211879,G,A,Sub,0,27,11.35,141,na,STAG2,CCDS43990.1,c.2746G>A,p.A916T,UNKNOWN 208111638,PD6812a,X,123211886,-,A,I,,28,50.57915058,518,0,STAG2,CCDS43990.1,c.2753_2754insA,p.L919fs*10,ONCOGENIC 208564013,PD7027a,X,123215246,-,A,I,0,267,78.75,238,1,STAG2,CCDS43990.1,c.2792_2793insA,p.Q932fs*6,ONCOGENIC 208191808,PD6152a,X,123215338,g,-,D,0,287,7.279693487,261,1,STAG2,CCDS43990.1,c.2884delG,p.D962fs*3,ONCOGENIC 58075518,PD6969a,X,123217296,G,T,Sub,0,186,42.67,150,na,STAG2,CCDS43990.1,c.2950G>T,p.E984*,ONCOGENIC 58115756,PD6278a,X,123217380,C,T,Sub,0,247,49.39,245,na,STAG2,CCDS43990.1,c.3034C>T,p.R1012*,ONCOGENIC 58130350,PD5756a,X,123217380,C,T,Sub,0,247,49.15,118,na,STAG2,CCDS43990.1,c.3034C>T,p.R1012*,ONCOGENIC 58160026,PD6998a,X,123217380,C,T,Sub,0,247,91.2,125,na,STAG2,CCDS43990.1,c.3034C>T,p.R1012*,ONCOGENIC 208329127,PD7033a,X,123217399,-,T,I,0,252,10.19417476,173,1,STAG2,CCDS43990.1,c.3053_3053+1insT,p.Y1019fs*35,ONCOGENIC 
207982403,PD5768a,X,123220431,-,TG,I,0,120,25.3968254,314,1,STAG2,CCDS43990.1,c.3088_3089insTG,p.S1031fs*12,ONCOGENIC 58136437,PD6962a,X,123220440,C,T,Sub,0,121,65.45,165,na,STAG2,CCDS43990.1,c.3097C>T,p.R1033*,ONCOGENIC 58106035,PD6182a,X,123220476,C,T,Sub,0,118,48.4,500,na,STAG2,CCDS43990.1,c.3133C>T,p.R1045*,ONCOGENIC 209600884,PD6253a,X,123220544,-,G,I,0,116,31.44104803,456,5,STAG2,CCDS43990.1,c.3201_3202insG,p.S1068fs*6,ONCOGENIC 66786197,PD7384a,X,123224542,T,G,Sub,0,166,71.3,108,na,STAG2,CCDS43990.1,c.3395T>G,p.L1132*,ONCOGENIC 66878386,PD8934a,X,123224717,T,A,Sub,0,82,18.18,66,na,STAG2,CCDS43990.1,c.3481T>A,p.W1161R,UNKNOWN 58094066,PD6779a,X,123227989,G,A,Sub,0,159,10.99,464,na,STAG2,CCDS43990.1,c.3700G>A,p.D1234N,UNKNOWN 66960572,PD8734a,X,133511650,G,C,Sub,0,100,52.2,205,na,PHF6,CCDS14639.1,c.3G>C,p.M1I,ONCOGENIC 208317202,PD6240a,X,133511708,a,-,D,0,120,54.62962963,324,2,PHF6,CCDS14639.1,c.61delA,p.K21fs*12,ONCOGENIC 208551076,PD6785a,X,133511716,-,A,I,0,129,91.06280193,413,1,PHF6,CCDS14639.1,c.69_70insA,p.R24fs*12,ONCOGENIC 58193065,PD6245a,X,133511739,T,G,Sub,0,143,50.65,231,na,PHF6,CCDS14639.1,c.92T>G,p.L31*,ONCOGENIC 58125266,PD6194a,X,133511777,A,T,Sub,0,171,51.59,314,na,PHF6,CCDS14639.1,c.130A>T,p.K44*,ONCOGENIC 345149684,PD7376a,X,133527598,-,A,I,0,142,41.59292035,113,1,PHF6,CCDS14639.1,c.308_309insA,p.Y103fs*1,ONCOGENIC 58204342,PD6484a,X,133527982,G,A,Sub,0,207,84.9,192,na,PHF6,CCDS14639.1,c.418G>A,p.A140T,UNKNOWN 58125818,PD6992a,X,133547559,C,T,Sub,0,120,46.04,202,na,PHF6,CCDS14639.1,c.457C>T,p.P153S,UNKNOWN 208536704,PD6201a,X,133547571,a,-,D,0,127,44.44444444,144,4,PHF6,CCDS14639.1,c.469delA,p.S158fs*60,ONCOGENIC 58158037,PD6536a,X,133547863,C,G,Sub,0,280,31.78,365,na,PHF6,CCDS14639.1,c.596C>G,p.S199C,UNKNOWN 58073120,PD6985a,X,133547910,G,C,Sub,0,266,13.6,125,na,PHF6,CCDS14639.1,c.643T>C,p.C215R,UNKNOWN 66801910,PD7376a,X,133547940,C,T,Sub,0.33,300,6.9,145,na,PHF6,CCDS14639.1,c.673C>T,p.R225*,ONCOGENIC 
58160727,PD5715a,X,133547941,G,A,Sub,0,309,10.93,430,na,PHF6,CCDS14639.1,c.674G>A,p.R225Q,UNKNOWN 58113050,PD6191a,X,133547986,A,G,Sub,0,367,96.7,212,na,PHF6,CCDS14639.1,c.719A>G,p.Y240C,UNKNOWN 58091728,PD6506a,X,133549137,G,A,Sub,0,128,84.21,152,na,PHF6,CCDS14639.1,c.821G>A,p.R274Q,UNKNOWN 58107559,PD6188a,X,133549149,T,A,Sub,0,132,89.89,178,na,PHF6,CCDS14639.1,c.833T>A,p.M278K,UNKNOWN 58092244,PD6890a,X,133551229,A,G,Sub,0,89,10.79,241,na,PHF6,CCDS14639.1,c.865A>G,p.T289A,UNKNOWN 58142287,PD6201a,X,133551305,T,C,Sub,0,103,12,125,na,PHF6,CCDS14639.1,c.941T>C,p.I314T,UNKNOWN 66801911,PD7376a,X,133551305,T,C,Sub,0,103,21.25,80,na,PHF6,CCDS14639.1,c.941T>C,p.I314T,UNKNOWN 208093735,PD6816a,X,133551317,c,-,D,,107,5.421686747,166,1,PHF6,CCDS14639.1,c.953delC,p.S318fs*33,ONCOGENIC 58192476,PD6534a,X,133551319,C,T,Sub,0,101,43.38,136,na,PHF6,CCDS14639.1,c.955C>T,p.R319*,ONCOGENIC \ No newline at end of file diff --git a/_articles/RJ-2024-002/data/mds.paper.clin.txt b/_articles/RJ-2024-002/data/mds.paper.clin.txt new file mode 100644 index 0000000000..6530a8ffa3 --- /dev/null +++ b/_articles/RJ-2024-002/data/mds.paper.clin.txt @@ -0,0 +1 @@ +PDID Gender 0:no mut|1:seq|2:removedbyqc|3:failed center AGE WHO category DATE OF SAMPLE DATE OF DIAGNOSIS DATE LAST FU OUTCOME DATE AML PROGRESSION AML PROGRESSION KARYOTYPE CYTOGENETIC RISK CYTO_chr3 CYTO_del5_5q CYTO_del7_7q CYTO_tri8 CYTO_del11 CYTO_del12 CYTO_chr17 CYTO_tri19 CYTO_del20q CYTO_delY CYTO_other CYTO_complex SEQ_tri8 SEQ_del5q SEQ_mono7_7q SEQ_del11q SEQ_del12p SEQ_i17q SEQ_tri19 SEQ_del20q SEQ_other IPSS norm WPSS TRANSFUSION DEPENDENCY SERUM FERRITIN PB CYTOPENIA HB ANC PLT % BM BLASTS M/E RATIO % RING SIDEROBLASTS Karyotype score (for IPSS) Bone Marrow Score (for IPSS) WHO score PB CYTOPENIA score (for IPSS) PD7090a 0 1 1 73 RCMD 08/09/2003 27/06/1998 19/05/2008 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 80 1 13.1 2 43 3 na 0 na na na na PD7364a 0 1 1 66 RAEB 1 06/10/2003 26/06/2003 
12/05/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 533 1 14.6 2 128 9.5 na 0 na na na na PD7365a 0 1 1 74 RA 03/11/2003 24/01/2003 12/03/2008 1 0 "45,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 low 0 220 1 12.5 na 65 6 na 0 na na na na PD7366a 0 1 1 77 RCMD 17/02/2005 30/06/2003 01/09/2006 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 low 0 498 1 10.5 0.7 227 1 na 0 na na na na PD7378a 1 1 1 47 RCMD-RS 18/04/2005 07/07/1999 21/05/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 3200 0 11.1 3.7 182 1 na 32 na na na na PD7112a 0 1 1 51 RARS 20/06/2006 30/06/1991 13/07/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 2998 1 10.7 0.97 144 1 na 32 na na na na PD7098a 1 1 1 71 RCMD-RS 17/10/2005 10/05/2004 10/07/2012 0 0 "47,XX,+8(2)" Int 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 328 1 10.2 1.7 320 4 na 29 na na na na PD7379a 0 1 1 57 RAEB 2 24/05/2004 07/05/2004 09/03/2005 1 0 "42-44XY,der5,der7,-13,der16,der17,20q-,-22" High 0 1 1 0 0 0 1 0 1 0 1 1 na na na na na na na na na high 1 340 1 11.2 5.9 340 12 na 0 na na na na PD7368a 1 0 1 85 RA 26/05/2004 14/05/2004 29/04/2011 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 1 9.4 2.8 224 1 na 0 na na na na PD7104a 1 1 1 59 RARS 31/05/2010 30/06/1994 21/03/2011 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 1363 0 10.3 5.5 303 2 na 33 na na na na PD7380a 0 1 1 48 RCMD-RS 18/10/2004 19/11/2004 01/08/2005 1 0 complex High 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 1 725 1 9.8 1.5 22 4 na 33 na na na na PD7369a 0 1 1 63 RAEB 1 22/11/2004 01/01/2001 15/05/2009 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 na 1 10 0.08 92 10 na 0 na na na na PD7370a 1 1 1 56 CMML 29/11/2004 01/09/2004 23/09/2006 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 0 na 0 11.4 na 336 6 na 0 na na na na PD7371a 1 1 1 79 CMML 16/12/2004 17/06/2002 04/05/2007 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 1 11.9 na 273 4 na 0 na na na na PD7372a 0 1 1 76 RAEB 2 12/01/2005 12/01/2005 17/05/2012 1 0 "46,XY(20),43-45,XY(4)" Low 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 1 381 1 9.9 0.21 381 13.5 na 0 na na na na PD7374a 1 1 1 66 5q- 27/04/2005 17/03/2005 05/07/2012 0 0 "46,XX,del(5)(q13q33)[9]45,XX,del(5)(q13q33)[3]" Low 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 1 9.5 6.1 305 4 na 0 na na na na PD7375a 1 1 1 77 RAEB 1 02/05/2005 05/04/2005 07/03/2008 1 0 "46,XX[17],44-45,XX[8]" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 73 1 13.1 1.06 100 6 na 0 na na na na PD7111a 0 1 1 74 RCMD 10/12/2007 04/05/2005 24/03/2009 1 0 "46-47,XY,+19[2]46,XY[16]44-45,XY[7]" Int 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 1 1787 1 9.1 5.4 213 4 na 4 na na na na PD7381a 0 1 1 81 RCMD-RS 08/06/2005 23/05/2005 16/06/2008 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 837 1 9.8 5.2 191 0.5 na 15 na na na na PD7376a 0 1 1 77 RCMD 04/07/2005 25/05/2005 11/07/2006 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 1 10.1 1.4 183 0 na 0 na na na na PD7073a 1 1 1 89 RCMD-RS 30/05/2005 11/04/2004 14/05/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 2760 1 8.8 1.01 272 4 na 15 na na na na PD7071a 0 1 1 69 RARS 29/06/2005 13/06/2005 22/11/2010 1 0 "46,XY,(15),42-45XY(7)" Low 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 low 0 833 0 10.8 2.6 349 2 na 15 na na na na PD7088a 1 1 1 76 RCMD-RS 09/06/2008 20/06/2005 21/10/2009 1 0 "46,XX,del5(1)" Low 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 3804 1 9.2 6.3 195 5 na 0 na na na na PD7072a 1 1 1 69 RCMD-RS 05/09/2005 26/08/2005 16/08/2007 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 349 1 8.9 3.9 355 4.5 na 15 na na na na PD7074a 0 1 1 73 RCMD-RS 10/10/2005 29/10/2003 10/10/2006 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 737 0 10.8 2.6 238 1 na 39 na na na na PD7377a 0 1 1 67 CMML 11/10/2005 19/01/2005 
24/06/2009 1 0 "45,XY" Low 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 0 168 1 11.5 3.9 68 16 na 0 na na na na PD7099a 1 1 1 80 RCMD-RS 23/11/2005 10/10/2005 30/04/2012 0 0 "46,XX,9qh+(21)42-45,XX,9hq+[21]" Int 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 int-1 0 230 0 10.6 4.4 419 2 na 0 na na na na PD7079a 0 1 1 77 RCMD-RS 31/01/2008 08/10/2001 01/03/2009 0 NA "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 3470 1 9.1 3 35 3 na 15 na na na na PD7084a 1 1 1 54 RARS 16/02/2009 01/09/2002 14/06/2012 0 0 "46,XX[23],45,XX[3]" Low 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 low 0 356 1 9.2 2.8 444 4.5 na 12 na na na na PD7382a 1 1 1 32 RAEB 2 23/08/2006 18/08/2006 02/04/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 na 1 7.1 2.2 76 12 na 0 na na na na PD7383a 1 1 1 67 5q- 02/10/2007 01/01/2006 13/06/2011 0 0 "46,XX,del(5q)/9[25]" Low 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 210 1 12.1 1.5 85 2 na 0 na na na na PD7384a 0 1 1 75 RAEB 2 08/01/2007 01/03/2006 30/11/2008 1 08/07/2008 1 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 na 1 10.8 na 19 10 na 0 na na na na PD7385a 0 1 1 76 RAEB 2 31/05/2007 01/01/2006 14/06/2010 1 0 "46,XY[17],42-45,XY[8]" Low 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 1 2102 1 10.2 0.9 87 12.5 na 0 na na na na PD7075a 1 1 1 74 RARS-T 24/08/2009 06/12/2001 04/07/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 904 0 10.6 5.2 1478 4 na 6 na na na na PD7386a 0 1 1 67 RAEB 2 04/10/2007 06/08/2007 18/06/2010 1 15/01/2010 1 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 0 276 0 10.4 0.8 336 12 na 2 na na na na PD7387a 0 1 1 58 CMML 07/10/2007 10/06/2007 24/04/2010 1 12/11/2009 1 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 0 31 1 11.1 15.2 45 10.5 na 0 na na na na PD7388a 0 1 1 35 RAEB 2 08/10/2007 28/09/2007 24/05/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 539 1 11 8 36 15.5 na 0 na na na na 
PD7076a 0 1 1 67 RARS 15/10/2007 09/02/2007 04/07/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 540 0 10.6 2.4 395 7.5 na 15 na na na na PD7389a 1 1 1 69 RAEB 2 14/01/2008 23/02/2006 06/07/2009 1 19/05/2009 1 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 high 1 1699 0 11.1 1.7 10 14 na 0 na na na na PD7390a 1 1 1 72 RAEB 1 01/02/2008 17/01/2008 04/03/2009 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na 0 10.2 6.2 133 7.5 na 39 na na na na PD7087a 0 1 1 76 RARS 04/08/2008 18/06/2008 13/07/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 777 1 10.1 na 362 3.5 na 15 na na na na PD7077a 1 1 1 60 RCMD-RS 18/11/2008 17/09/2008 25/06/2012 0 0 "46,XX[26]" Low 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low 0 301 0 10.9 5.6 426 4 na 22 na na na na PD7081a 1 1 1 69 RCMD-RS 20/04/2009 06/12/2008 27/06/2012 0 0 "46,XX[25]" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 219 1 11.3 3 366 5.5 na 40 na na na na PD7085a 0 1 1 70 RCMD-RS 02/03/2009 02/02/2009 05/07/2012 0 0 "46,XY(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 519 0 10.8 2.4 348 3 na 32 na na na na PD7083a 0 1 1 71 RCMD-RS 23/02/2009 23/02/2009 09/07/2012 0 0 "46,XY(31)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 394 1 10.2 0.8 211 4 na 27 na na na na PD7080a 1 1 1 74 RARS 12/05/2009 27/01/2009 25/06/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 408 0 10.6 2.7 296 2.5 na 15 na na na na PD7082a 1 1 1 70 RARS 25/08/2009 21/07/2009 12/06/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 316 0 10.9 6.2 271 1 na 21 na na na na PD7107a 0 0 1 69 RCMD-RS 19/08/2009 19/08/2008 07/03/2012 0 0 "46,XY,del20(7),46,XY(18)" Low 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 low 0 154 0 11 6.3 267 2 na 5 na na na na PD7089a 1 1 1 83 RARS 20/08/2009 27/10/2008 21/06/2010 1 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 1 6.9 3.5 358 1 na 15 na na na na PD7091a 0 1 1 60 RARS-T 
21/09/2009 01/03/2008 16/07/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 1495 1 9.2 5.1 1093 1.5 na 16 na na na na PD7092a 0 1 1 43 RARS-T 19/10/2009 22/02/2006 04/07/2012 0 0 "46,XY,t(2;13)" Low 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 low 1 378 1 9.6 4.3 878 1 na 10 na na na na PD7093a 1 1 1 79 RCMD-RS 16/11/2009 20/02/2007 07/03/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 1 9.1 na 407 4 na 36 na na na na PD7109a 0 1 1 64 RCMD-RS 15/02/2010 29/09/2009 17/04/2012 0 0 "46,XY(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 233 na 13.2 2.6 296 1.2 na 42 na na na na PD7094a 1 0 1 20 RARS 01/12/2009 01/12/2009 na na NA "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na 0 8 na na na na na na na na na PD7095a 1 1 1 80 RARS 07/01/2010 01/01/2008 25/05/2011 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 240 1 9.9 3.5 329 1.5 na 24 na na na na PD7096a 1 0 1 78 RCMD-RS 23/02/2010 27/01/2010 15/03/2010 0 0 "46,XX(26)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 10.4 4.9 379 2 na 29 na na na na PD7097a 0 1 1 69 RCMD-RS 15/02/2010 05/08/2009 19/03/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 1 8.8 5.8 497 1.5 na 26 na na na na PD7100a 0 1 1 85 RARS 28/04/2010 01/01/2009 08/10/2011 1 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 2143 1 8.9 1.7 251 2 na 3.5 na na na na PD7102a 0 1 1 79 RARS 08/06/2010 19/05/2010 21/05/2012 0 0 "45,X-Y(22)/46,XY(6)" Low 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low 1 1420 0 11.5 3.4 152 3 na 17 na na na na PD7103a 0 1 1 64 RCMD-RS 08/06/2010 24/05/2010 17/08/2011 1 0 "45,XY,add(1)(p?),-5+8,der(13;14)(q10:q10),-18+21(19)/46,XY(8)" High 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 int-2 0 677 1 8.6 0.8 145 1 na 32 na na na na PD7105a 1 1 1 83 RARS 06/09/2010 01/07/2010 15/05/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 1961 1 9.3 5.3 263 2 na 33 na na na na PD7106a 0 1 1 78 RCMD-RS 
10/08/2010 03/08/2010 11/07/2012 0 0 "46,XY" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 542 1 9.7 2.9 180 3 na 39 na na na na PD7108a 1 1 1 75 RARS 26/10/2010 18/10/2010 03/05/2012 0 0 "56,XX(26)" Low 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 low 0 341 1 10.7 1.4 312 1.5 na 15 na na na na PD7110a 1 1 1 78 RARS 01/02/2011 14/12/2010 13/02/2012 0 0 "46,XX(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 148 0 10.3 3 304 2 na 21 na na na na PD8648a 0 1 1 na RARS na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na 1 na na na na na na na na na na PD8649a 1 0 1 na RARS na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na 0 na na na na na na na na na na PD8645a 1 1 1 81 RCMD-RS 04/10/2011 05/09/2011 20/06/2012 0 0 "46,XX" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 405 0 10.7 3.6 321 1.5 na 85 na na na na PD8646a 0 1 1 70 RARS 30/10/2011 10/10/2011 20/06/2012 0 0 "46,XY[30]" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 11.7 5.8 177 1.5 na 85 na na na na PD8647a 0 1 1 77 RCMD-RS 15/11/2011 19/10/2011 16/07/2012 0 0 "46,XY(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 1283 1 9.4 3.3 216 0.5 na 15 na na na na PD9663a 1 1 1 77 RCMD-RS 16/11/2011 23/08/2011 06/07/2012 0 0 "46,XX(26)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 257 1 9.9 2 205 3 na 15 na na na na PD9660a 0 1 1 94 RCMD-RS 24/01/2012 19/12/2011 17/07/2012 0 0 "46,XY(29)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 381 1 10.2 1 160 3 na 15 na na na na PD9711a 1 1 1 64 RARS 08/02/2012 09/01/2012 13/06/2012 0 0 "46,XX(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 390 0 7.7 2.7 385 2 na 15 na na na na PD9659a 1 1 1 64 RARS 27/02/2012 09/01/2012 27/06/2012 0 0 "46,XX(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 10.2 4.4 309 1.5 na 15 na na na na PD9662a 0 1 1 56 RCMD-RS 16/01/2012 27/12/2011 18/07/2012 0 0 "46,XY(25)" Low 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
int-1 0 na 1 10.9 0.3 150 4 na 17 na na na na PD9661a 1 1 1 82 RARS-T 29/02/2012 18/01/2012 13/07/2012 0 0 "46,XX,del(5)(q13q33)[15]/45,idem-7[2]/46,XX[5]" High 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 2084 1 8.7 na 221 6 na 45 na na na na PD6183a 1 1 2 51 RAEB 09/12/2003 01/07/1998 01/12/2010 0 NA "46, XX, del(5)(q14;q34) [20] / 46, XX [5]" 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 9 2.1 349 na na na na na na na PD6184a 1 1 2 42 RAEB 09/12/2003 01/02/2000 02/12/2010 0 NA "46, XX, del(5)(q14;q34)" 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 6.4 7.4 1042 na na na na na na na PD6173a 1 1 2 61 RA 27/01/2004 01/10/2003 01/12/2005 0 NA "46, XX, del(5)(q14;q34) [21], inv9(q11;q12)" 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 na na na 10.7 2.4 169 na na na na na na na PD6174a 1 0 2 34 5q- 11/12/2003 01/01/1997 11/12/2010 0 NA "46, XX, del(5)(q13;q33) [18]" 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na na na na na na na na PD6175a 1 1 2 76 RA 14/01/2004 01/11/2001 01/02/2011 0 NA "46, XX, t(1;3)(p33:p14), del(5)(q14;q34)[21] / 46, XX [4]" 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 na na na 9.4 1.2 331 na na na na na na na PD6185a 1 1 2 50 RAEB 15/01/2004 01/06/1989 03/12/2010 0 NA "46, XX, del(5)(q14;q34)" 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 10.9 1.3 445 na na na na na na na PD6178a 0 1 2 na 5q- 02/12/1999 na na na NA 5q- na na 1 na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6176a 0 1 2 71 RA 08/02/2001 01/01/2001 19/09/2004 1 NA "46,Y, der(X)t(X;12)(p22;q21), del(5)(q14-15;q33-34), der(12)del(12)(p11p13)t(X;12)(p22;q21)[7]/ 46, XY" 2 0 1 0 0 0 1 0 0 0 0 0 1 na na na na na na na na na int-2 na na na 9.1 1.1 286 na na na na na na na PD6186a 1 1 2 77 RAEB 20/04/2001 na 04/11/2005 1 NA "46, XX [13] / 46, XX, del(5)9q22q33) [3] / 47, XX, +8 [2]" 1 0 1 0 1 0 0 0 0 0 0 0 0 na na na na na na na 
na na na na na na na na na na na na na na na na PD6193a 0 1 2 65 RCMD 15/02/2007 17/03/2005 01/05/2011 0 0 "46, XY" 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 1 na na 9.9 0.9 248 na na na na na na na PD6177a 0 1 2 87 RA 22/03/2007 22/03/2007 25/07/2008 1 0 "45, X, -Y" 0 0 0 0 0 0 0 0 0 0 1 0 0 na na na na na na na na na int-1 1 na na 9.6 2.4 91 na na na na na na na PD6198a 0 0 2 61 RCMD-RS 29/03/2007 12/02/2004 18/02/2010 1 0 "46, XY" 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na Low 1 na na 7.9 4.4 244 na na na na na na na PD6196a 0 1 2 75 RCMD-RS 27/04/2007 14/01/1998 10/05/2009 1 0 "46, XY, +8 [5] / 46, XY [15]" 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na na 9.2 3.5 149 na na na na na na na PD8728a 1 1 2 67 RCMD 27/04/2007 23/07/2004 05/04/2012 0 0 46XX [20] na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 1 na na 11.5 2.1 103 na na na na na na na PD8729a 0 1 2 69 RAEB 11/05/2007 25/02/2003 15/09/2008 1 0 46XY [20] na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 1 na na 10.4 2.1 61 na na na na na na na PD6188a 0 1 2 68 RAEB 07/06/2007 20/09/2004 08/09/2008 1 18/08/2005 1 "46, XY" 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 1 na na 9.1 1.5 11 na na na na na na na PD6189a 1 1 2 87 RAEB 11/08/2008 11/08/2008 28/07/2010 1 27/07/2010 1 "46, XX" 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 0 na na 10.7 1.1 109 na na na na na na na PD8730a 0 1 2 68 RAEB 2 15/09/2008 18/02/2008 01/11/2010 1 16/06/2010 1 "47XY, +8 [6/46] XY[54]" na 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 1 na na 12.9 3.2 47 na na na na na na na PD7116a 0 1 2 64 RCMD 02/10/2008 25/11/2004 08/09/2011 1 0 "46, XY" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 1 na na 11.1 5.4 45 na na na na na na na PD6194a 0 1 2 77 RCMD 07/01/2009 01/02/2003 01/05/2011 0 0 "45, X, -Y [20]" 0 0 0 0 0 0 0 0 0 0 1 0 0 na na na na na na na na na Low 0 na na 12.1 1.3 124 na na na na na na na PD6195a 0 1 2 na RCMD 29/01/2009 na 
04/05/2009 1 0 del 20q 0 0 0 0 0 0 0 0 0 1 0 0 0 na na na na na na na na na int-1 na na na 9.6 2.9 85 na na na na na na na PD7117a 0 1 2 na RCMD na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6197a 0 1 2 83 RCMD-RS 26/02/2009 26/02/2009 15/08/2010 1 06/07/2010 1 "46, XY [20]" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 1 na na 10.6 1.9 205 na na na na na na na PD8731a 0 1 2 79 RAEB 26/03/2009 01/12/2008 29/07/2009 1 0 TRISOMY 8 na 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 0 na na 9.9 9.4 30 na na na na na na na PD6170a 0 0 2 na MDS-U 23/04/2009 na 06/06/2009 1 0 "46, XY" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low na na na 17.6 3.2 111 na na na na na na na PD6190a 0 1 2 63 RAEB 22/02/2010 04/09/2006 01/05/2011 0 0 "46,XY,t(18;20)(q2;q11,2)[6]/47,idem,+8[10]/46,XY[4]" 1 0 0 0 1 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 1 na na 10.2 2 109 na na na na na na na PD8732a 0 1 2 64 RCMD-RS 01/03/2010 08/02/2007 19/04/2012 0 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 1 na na 9 9.9 554 na na na na na na na PD8733a 1 1 2 69 RCMD-RS 13/10/2010 13/10/2010 29/02/2012 0 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 0 na na 11.7 5.3 390 na na na na na na na PD7118a 1 1 2 86 RCMD 05/01/2011 05/01/2011 05/04/2012 0 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 1 na na 13.2 2.8 115 na na na na na na na PD8734a 1 1 2 61 RCMD 05/01/2011 05/01/2011 25/02/2011 1 14/02/2011 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 na na 5.9 na 11 na na na na na na na PD8735a 0 1 2 80 CMML 18/01/2011 18/01/2011 28/12/2011 0 NA "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na Low 0 na na 12.4 5.5 245 na na na na na na na PD7119a 1 1 2 57 RCMD 18/01/2011 11/07/2006 16/02/2012 0 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Low 0 na na 12.2 2.9 401 na na na na na na na PD7120a 1 1 2 60 RAEB 
09/02/2011 09/02/2011 16/02/2012 0 0 "46,XX,del(5)(q13q33)[17]46,XX[3]" na 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 0 na na 11.2 0.7 300 na na na na na na na PD8736a 0 1 2 66 RCMD 22/02/2011 01/09/2010 15/06/2011 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na na 6 1.3 35 na na na na na na na PD8737a 0 1 2 77 CMML 02/03/2011 10/08/2010 06/03/2012 0 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 na na 12.3 na 52 na na na na na na na PD8738a 1 1 2 68 CMML 10/03/2011 21/09/2010 05/04/2012 0 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 na na 11.8 0.7 432 na na na na na na na PD8739a 0 1 2 62 RAEB 31/03/2011 19/01/2011 22/01/2012 0 30/03/2011 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 na na 10.8 0.6 45 na na na na na na na PD8740a 0 1 2 82 RAEB 2 05/04/2011 16/11/2010 17/11/2011 0 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 na na 8.6 0.8 78 na na na na na na na PD6199a 0 2 2 na CMML 02/10/1987 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6200a 0 2 2 na CMML na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6201a 0 1 2 na CMML 16/06/1988 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6187a 1 1 2 na RAEB 24/08/1988 01/08/1988 01/06/1989 1 NA "47, XX, -5 (-12,-13,-16,+1 +mar) / 48, XX, -5 (-12,-13,-16,+1,+mar)" 2 0 1 0 0 0 1 0 0 0 1 1 1 na na na na na na na na na int-2 na na na 10.3 0.5 78 na na na na na na na PD6191a 0 1 2 64 RAEB 2 24/11/1986 na na 1 NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6202a 0 1 2 61 CMML 02/02/1988 na na 1 NA "46, XY" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na na na na na na na na PD6180a 1 3 2 na RAEB 
na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6179a 0 1 2 na RAEB 12/02/1989 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6182a 1 1 2 76 RAEB 28/02/1989 na na 1 NA "46, XX" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na na na na na na na na PD6181a 1 1 2 87 RAEB 04/05/1989 na na 1 NA "46, XX" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na na na na na na na na PD6192a 1 1 2 83 RAEB 2 07/06/1989 na na 1 NA "46, XX, del(5)(q15;q33)" na 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na na na na na na na na PD6171a 1 1 2 87 RA 26/01/1987 22/01/1987 26/07/1988 1 NA "46, XX" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6203a 0 1 2 na CMML na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6336a 0 2 2 na RARS na na na na na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6951a 0 1 3 65 RA 01/08/2007 12/07/2007 26/08/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 very low 0 116 1 9.9 1.7101 132 1 2.45 0 na na na na PD7030a 0 1 3 na AML-MDS 12/02/2009 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 33 na na na na PD6911a 1 0 3 na AML-MDS 06/07/2007 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6778a 0 0 3 na AML-MDS 21/06/2005 na na na NA na na na na na na na na na na na na na na 0 0 1 0 0 0 0 0 0 na na na na na na na 38 9 0 na na na na PD6952a 0 0 3 na AML-MDS 13/07/2007 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD7034a 0 1 3 na RA 27/04/2005 na na na NA Normal 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low na na na na na na 0 2.7 0 na na na na PD6868a 1 2 3 na AML-MDS 16/12/2010 na na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6858a 0 1 3 na AML-MDS 30/11/2010 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na na na na na na 25 1.86 0 na na na na PD6904a 0 1 3 na AML-MDS 06/10/2006 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na 45 2.85 2 na na na na PD6861a 1 1 3 na AML-MDS 10/07/2007 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6991a 1 1 3 73 RAEB 2 16/04/2008 24/08/2007 03/12/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 1326 1 7.5 0.6222 99 18 1.5 7 na na na na PD6827a 1 1 3 na RA 11/10/2004 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low na na na na na na 0 2.33 5 na na na na PD6908a 0 0 3 na RA 29/11/2005 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na 0 1.86 6 na na na na PD6145a 1 1 3 51 RARS 05/09/2007 09/02/2007 05/10/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 308 0 10.3 5 425 2 1.5 65 na na na na PD6146a 1 1 3 67 RA 11/09/2007 06/04/2005 07/10/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 209 0 10.1 5.52 198 2 1.5 1 na na na na PD6147a 0 1 3 72 RAEB 2 24/10/2007 13/09/2007 19/11/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 1 4560 1 8.4 1.296 24 12 5.67 na na na na na PD6148a 1 1 3 na RARS 15/09/2010 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low na na na na na na 0 na 50 na na na na PD7012a 1 0 3 70 RAEB 1 05/11/2007 27/06/2007 30/11/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 80 1 10.1 1.741 98 9 na 0 na na na na PD6149a 1 1 3 86 RAEB 
15/11/2007 10/10/2007 19/11/2007 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 0 458 0 9.6 2.471 701 6 na na na na na na PD6150a 0 1 3 75 RCMD 12/11/2007 05/11/2007 02/08/2008 1 0 "trisomy 8, t(3;6)" 1 1 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 839 1 8.1 1.235 42 3 na na na na na na PD6884a 1 1 3 81 RAEB 1 17/12/2007 25/10/2007 27/03/2009 0 0 t(3;3) e t(4;6) 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 0 128 0 8.8 3.6 496 6 na 60 na na na na PD6151a 0 1 3 79 RCMD-RS 04/12/2007 15/10/2005 10/01/2008 0 0 "trisomy 8, t(2;?)(p;?)" 1 0 0 0 1 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 high 1 3250 1 9.2 1.96 38 4 2.33 40 na na na na PD6152a 0 1 3 70 RCMD 29/07/2008 29/07/2008 20/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 169 1 8.9 1.606 294 4 1.17 0 na na na na PD6945a 1 1 3 51 CMML 25/01/2008 14/09/2007 18/12/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 37 0 10.9 3.069 65 3 na 0 na na na na PD6989a 0 1 3 65 RARS 22/04/2008 01/02/2008 13/05/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 179 0 12.4 2.445 190 3 na 70 na na na na PD6887a 1 1 3 60 RAEB 1 29/05/2008 08/08/2007 10/07/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 108 0 11.8 0.66 130 6 4 0 na na na na PD6840a 1 1 3 80 RARS-T 11/07/2008 24/04/2008 21/07/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 10.1 3.185 919 2 2.33 30 na na na na PD6153a 0 1 3 65 RAEB 2 17/07/2008 16/07/2008 16/08/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 516 0 10 1.43 186 15 1.5 60 na na na na PD7043a 0 1 3 43 RCMD 21/02/2008 15/11/2006 06/06/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 990 1 6.7 1.6 40 2 0.33 9 na na na na PD6154a 1 0 3 77 RA 23/07/2008 08/05/2008 12/08/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 
12 0 12.7 8.3853 279 0 3 0 na na na na PD6950a 0 1 3 71 AML-MDS 01/09/2009 13/01/1995 15/10/2009 0 24/09/2009 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 381 1 10.9 0.87 12.5 23 2.57 0 na na na na PD6155a 0 1 3 44 RAEB 15/09/2009 30/07/2009 03/02/2010 1 0 "5q-, 17p-, 1 mar" 2 0 1 0 0 0 0 1 0 0 0 0 0 na na na na na na na na na int-2 high 0 1830 1 10.6 0.93 32.7 7 4.56 17 na na na na PD6886a 1 0 3 74 RA 18/01/2006 18/01/2006 29/07/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 very low 0 167 1 10.4 1.512 42.9 3 2.57 0 na na na na PD6156a 0 1 3 59 RCMD 20/10/2008 20/10/2008 16/06/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 na 1 11.8 0.822 96 4 1.78 0 na na na na PD6796a 0 1 3 57 RCMD-RS 17/07/2009 28/04/2004 20/08/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 1290 0 7.3 2.361 164 2 0.92 50 na na na na PD6890a 1 1 3 43 RCMD 11/11/2008 15/09/2008 09/07/2009 0 0 monosomy 7 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-2 high 1 na 1 8.3 0.736 82 3 9 0 na na na na PD6889a 1 1 3 69 CMML 09/07/2009 17/04/2008 13/08/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 9 0 10.8 2.516 91 4 9 0 na na na na PD6956a 0 1 3 70 RCMD 11/05/2009 11/05/2009 18/06/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 571 1 10.2 0.4706 43.3 3 0.85 0 na na na na PD6157a 1 1 3 72 RAEB 2 26/06/2009 26/06/2009 08/10/2009 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 0 409 1 9.7 0.414 55 11 1.13 0 na na na na PD6880a 0 1 3 67 RAEB 1 07/10/2009 06/10/2009 17/01/2011 1 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 high 1 1320 1 4.98 1.325 307 5 0.82 0 na na na na PD6931a 0 1 3 62 RCMD-RS 22/10/2010 04/11/2009 16/12/2010 0 0 Y- 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 336 1 9.7 1.73 233 1 3.55 79 na na na na PD6964a 1 1 3 66 RARS 07/10/2009 15/07/2009 26/11/2009 0 0 riarr 4q 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 
0 0 0 0 0 0 int-1 low 0 431 0 10.6 2.95 434 1 1.7 70 na na na na PD6906a 0 1 3 59 AML-MDS 16/10/2008 15/05/2008 12/12/2008 0 0 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 na 1 na 1 8 0.585 6 65 na 61 na na na na PD6907a 0 1 3 72 RAEB 1 11/09/2008 02/07/2008 09/04/2009 1 0 12p- 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 191 1 9.2 3.9648 93.5 6 1 0 na na na na PD6158a 0 1 3 72 RARS 30/10/2009 30/10/2009 27/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 951 0 9.45 6.231 225 1 2.03 34 na na na na PD6159a 0 1 3 78 RAEB 13/11/2008 13/11/2008 27/11/2008 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 435 1 8.5 0.853 47 9 1.5 0 na na na na PD6160a 0 1 3 78 RAEB 2 13/11/2008 22/10/2008 20/11/2008 0 0 "t(X;3), riarr 17p" 1 1 0 0 0 0 0 1 0 0 0 0 0 na na na na na na na na na high very high 1 543 1 8.3 4.456 63 18 1.27 0 na na na na PD6915a 1 0 3 70 RAEB 1 30/10/2009 15/08/2009 10/12/2009 0 0 "5q-, 11q-" 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-2 high 0 281 1 10.5 1.323 84.4 5 7.33 0 na na na na PD6930a 0 1 3 65 RAEB 2 03/03/2009 15/09/2008 16/04/2009 0 0 monosomy 7 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 high very high 1 873 1 8 0.66 48 11 0.67 33 na na na na PD6161a 1 1 3 58 RA 11/12/2009 26/08/2009 14/01/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 na 0 12.6 3.036 155 3 5.67 0 na na na na PD6859a 0 0 3 45 RCMD 21/12/2009 30/06/2006 21/01/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 972 1 12.6 1.38 18 3 0.64 5 na na na na PD8939a 0 1 3 na RAEB 2 18/09/2007 09/05/2007 14/07/2009 1 19/03/2008 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6916a 1 1 3 52 RA 18/02/2010 23/09/2009 24/01/2011 0 0 "3q-, 5q-, riarr6p, riarr8q" 2 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 int-1 intermediate 0 na 0 8.9 1.632 381 2 9 3 na na na na PD6895a 0 1 3 62 5q- 
29/06/2006 29/06/2006 10/02/2011 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 947 0 12.4 1.624 173 3 1 0 na na na na PD6162a 0 1 3 68 RAEB 17/11/2008 09/04/2008 12/01/2010 0 0 12p- 1 0 0 0 0 0 1 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 1310 0 na na na 6 2.33 68 na na na na PD6988a 0 1 3 72 RAEB 2 19/03/2008 14/06/2007 17/04/2008 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na very high 1 613 1 7.4 0.8 1383 13 na 5 na na na na PD6922a 0 1 3 75 AML-MDS 10/10/2008 18/11/2004 16/10/2008 0 0 na na na na na na na na na na na na na na 0 1 0 0 1 1 0 0 0 na 1 2278 0 6.9 1.925 394 23 na 0 na na na na PD6929a 0 1 3 71 RCMD 06/04/2009 06/04/2009 14/05/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 209 0 10.4 1.897 129 2 na 0 na na na na PD6813a 1 1 3 78 RA 01/02/2010 04/12/2009 01/04/2010 0 0 46 XX na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 204 na 8.9 3.75 197 2 2.7 2 0 0 0 0 PD6819a 0 1 3 64 CMML 10/02/2009 15/09/2008 19/03/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 12.4 2.89 79 3 na 0 na na na na PD6920a 0 1 3 72 RCMD 21/10/2010 08/03/2010 20/12/2010 1 0 riarr 7 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 1 2850 0 7.5 8.8 315 0 4.56 9 na na na na PD6163a 0 1 3 56 RCMD-RS 07/08/2007 15/07/2005 17/03/2008 0 10/03/2008 1 "trisomy 8, 21" 1 0 0 0 1 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 high 1 11300 0 6.9 2.854 252 0 0.67 91 na na na na PD6829a 0 1 3 60 RAEB 1 17/03/2010 22/02/2010 22/04/2010 0 0 20q- 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 int-1 high 1 1930 1 7.9 1.291 97 6 2.33 78 na na na na PD6876a 1 1 3 68 CMML 21/07/2009 10/09/2007 27/08/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 224 0 13 2.5 90 3 5.67 0 na na na na PD6864a 0 1 3 42 RCMD 31/08/2009 23/01/2008 08/10/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 24 0 10.5 1.815 38.5 4 0.96 0 na na na na PD6896a 0 1 3 56 RCMD 04/03/2010 
30/06/2007 02/04/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 480 1 7.6 4.999 82.9 2 3 0 na na na na PD6164a 0 1 3 61 RAEB 29/01/2009 15/10/2008 21/03/2009 0 17/03/2009 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 0 na 1 9.27 1.457 92 7 na na na na na na PD7016a 1 1 3 78 RCMD 23/10/2008 18/09/2008 13/05/2010 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 1100 1 6.2 1.155 121 4 3.55 7 na na na na PD7044a 1 1 3 58 RAEB 2 20/06/2008 20/06/2008 04/02/2009 0 17/11/2008 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-2 high 0 na 1 9.3 0.506 69 12 0.33 52 na na na na PD6165a 0 1 3 60 RARS 30/04/2010 26/01/2010 03/06/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 very low 0 266 1 9.4 1.56 486 1 0.64 78 na na na na PD6166a 0 1 3 44 RA 13/05/2010 05/02/2008 22/06/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 628 0 13.2 3.243 352 1 1.7 0 na na na na PD6919a 0 1 3 58 MDSMPN 17/06/2010 07/05/2008 22/07/2010 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 180 0 11.5 12.5 35 7 9 0 na na na na PD6953a 0 1 3 67 CMML 07/07/2010 07/07/2010 16/08/2010 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 632 0 11.1 7.068 86 1 0.19 0 na na na na PD6902a 0 2 3 76 MDSMPN 20/11/2008 22/10/2008 01/02/2011 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 0 289 0 11.3 10.894 141 0 2.33 0 na na na na PD6167a 0 2 3 77 RAEB 12/11/2008 12/11/2008 24/11/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 na 1 7.5 1.584 20 6 2.33 0 na na na na PD6866a 1 0 3 55 5q- 21/07/2010 21/07/2010 25/11/2010 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 740 0 8.94 1.851 266 2 7.33 0 na na na na PD6807a 0 1 3 68 CMML 26/03/2009 26/03/2009 23/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 66 0 15.6 14.43 233 4 
2.03 0 na na na na PD6168a 0 2 3 56 RAEB 2 23/06/2010 27/07/2009 06/07/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 166 1 10.1 0.567 46.8 19 1 0 na na na na PD6932a 0 1 3 73 RCMD 26/08/2010 11/05/2010 16/12/2010 0 0 "trisomy 8, 7p-, 2mar" 2 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 int-2 high 1 828 1 6.2 3.825 61 4 2.85 0 na na na na PD6961a 1 1 3 57 RARS 06/09/2004 30/03/2001 27/10/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 850 0 10.6 3.5 243 1 0.89 94 na na na na PD6843a 0 1 3 52 RCMD 24/08/2010 03/05/2010 30/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 1220 0 8 2.5 500 1 9 0 na na na na PD7013a 1 2 3 33 RCMD 11/08/2005 21/03/2005 10/08/2005 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 534 1 7 0.176 98 2 2.23 0 na na na na PD6984a 1 1 3 48 RA 28/10/2005 02/12/2003 23/11/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 116 0 12.7 0.558 275 2 1.5 2 na na na na PD6898a 0 1 3 66 RCMD 03/06/2010 11/10/2007 15/07/2010 0 0 riarr 2p e 7p 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 0 95 0 12.6 2.581 7 3 4.26 9 na na na na PD7009a 0 1 3 75 RCMD-RS 18/10/2007 18/10/2007 15/11/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 387 0 11 0.3514 154 2 4 18 na na na na PD6795a 0 1 3 42 CMML 21/08/2008 03/08/2007 13/10/2008 0 0 46 XY na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na 418 na 9.3 15.75 267 2 2.33 0 na na na na PD6933a 1 1 3 74 RCMD 19/09/2008 25/05/2007 20/10/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 na 1 7 1.3 5 1 5.25 0 na na na na PD6879a 0 1 3 60 RCMD 07/11/2008 30/06/1996 25/11/2008 0 0 riarr 6 e 16 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 254 1 11 0.109 96.6 3 2.33 0 na na na na PD7037a 0 1 3 44 RCMD 29/05/2008 15/02/2008 30/10/2008 1 0 na na na na na na na na na na na na na na 0 0 1 0 0 0 0 0 0 
na 1 na 1 6.1 0.312 65 2 na 0 na na na na PD6888a 1 1 3 51 RCMD 28/08/2007 30/06/2004 15/10/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 1552 1 7.6 3 16 0 0.67 0 na na na na PD6940a 0 1 3 52 RAEB 1 16/07/2010 16/04/2010 06/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 223 1 9.6 0.945 365 7 1.38 38 na na na na PD6842a 0 1 3 80 CMML 07/10/2008 21/02/2007 07/09/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 547 0 12.8 3.058 87 1 6.14 0 na na na na PD6891a 1 1 3 83 RARS-T 22/09/2010 21/09/2010 04/11/2010 0 0 11q- 1 0 0 0 0 1 0 0 0 0 0 0 0 na na na na na na na na na int-1 1 219 0 7.08 2.59 939 4 1.78 32 na na na na PD6857a 1 1 3 64 CMML 30/07/2008 15/06/2008 09/09/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 418 0 10.8 9.516 60 3 5.67 0 na na na na PD6893a 1 2 3 55 RARS-T 05/11/2004 24/07/2001 01/07/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low 0 324 0 10 2.5 500 2 0.67 81 na na na na PD6937a 0 1 3 68 RAEB 2 27/03/2009 10/01/2008 17/06/2010 0 0 riarr 2p 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 high 0 209 0 11.9 0.76 138 12 0.67 0 na na na na PD6928a 0 1 3 35 AML-MDS 23/12/2009 01/12/2009 27/09/2010 0 0 "t(1;7)(p10;q10), trisomy 9, 21" 2 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 high 1 na 1 8.23 1.08 78 21 2.85 0 na na na na PD6924a 1 1 3 53 RCMD 29/07/2010 30/06/2009 02/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 292 0 7 2 100 3 2.7 0 na na na na PD7024a 1 1 3 57 RARS 02/11/2008 30/06/1998 22/12/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 2710 0 7.5 2.5 150 1 1 70 na na na na PD6903a 0 1 3 46 RCMD 02/11/2010 26/05/2009 21/02/2011 0 0 20q- 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 772 1 10.8 1.5 90 3 3.35 3 na na na na PD6169a 0 1 3 47 RCMD 19/11/2010 11/03/2009 19/11/2010 0 0 na na na na na na na na na na na na na na na na na na na na na na na 
low low 0 65 0 13.5 2.632 45.1 2 3.35 0 na na na na PD6885a 1 0 3 77 RCMD 16/11/2010 16/11/2010 04/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 70 0 11.6 0.815 149 4 2.13 0 na na na na PD6848a 0 1 3 68 CMML 23/12/2008 23/12/2008 16/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 746 0 10.4 2.124 342 1 5.67 0 na na na na PD6999a 0 1 3 36 RAEB 2 23/05/2008 10/03/2008 07/08/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 281 0 7.5 2.83 254 15 na 48 na na na na PD6818a 1 1 3 73 RA 16/12/2010 15/12/2010 03/02/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 133 0 11 7.635 159 3 2.45 0 na na na na PD6992a 1 1 3 82 RCMD-RS 16/10/2008 16/10/2008 16/02/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 494 1 7.5 1.435 136 0 1 50 na na na na PD6833a 1 1 3 36 RAEB 2 25/01/2011 25/01/2011 17/02/2011 0 0 na na na na na na na na na na na na na na 1 0 0 0 0 0 0 0 0 na 0 na 0 12.3 4.478 275 13 5.67 0 na na na na PD6839a 0 1 3 62 5q- 20/01/2011 20/01/2011 24/02/2011 0 0 5q- (q13;q31) 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 197 0 9.4 2.032 228 1 4.88 21 na na na na PD6877a 0 1 3 76 RARS 19/11/2010 28/09/2010 20/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 356 0 10.2 2.085 163 2 4.56 25 na na na na PD6990a 1 1 3 75 RARS-T 04/02/2011 23/07/2010 07/03/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 1040 1 9.08 1.05 600 3 4 15 na na na na PD7028a 0 1 3 81 RAEB 2 20/02/2008 31/01/2008 28/05/2008 0 0 "trisomy 11, monosomy 7" 2 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 high very high 1 401 1 8 1.15 46 12 1.5 0 na na na na PD6871a 1 1 3 58 RCMD 24/01/2011 10/06/2008 15/03/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 1060 1 7.7 2.272 86 0 0.64 0 na na na na PD6936a 1 0 3 62 AML-MDS 07/06/2007 05/06/2007 26/07/2007 0 0 t(9;11) 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 
0 0 0 0 0 0 high 1 na 1 7.7 1.25 17 21 na 0 na na na na PD6841a 0 1 3 na RAEB 1 10/12/2009 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na 7 4.88 0 na na na na PD6979a 1 1 3 na MDSMPN 18/01/2011 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 8 4.56 0 na na na na PD6974a 0 1 3 na AML-MDS 27/01/2011 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6894a 1 1 3 na RAEB 1 10/11/2010 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 0 na na na na PD6986a 1 1 3 na RAEB 2 10/03/2011 na na na NA na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na 17 10.11 0 na na na na PD6836a 0 1 3 na AML-MDS 09/07/2009 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6849a 1 1 3 na AML-MDS 30/11/2010 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD7000a 1 1 3 na RARS 16/11/2010 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 95 na na na na PD6910a 0 2 3 na RCMD 25/11/2008 na na na NA riarr 18p 1 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 na na na na na na 4 10.11 0 na na na na PD6958a 0 1 3 na AML-MDS 31/12/2010 na na na NA na na na na na na na na na na na na na na 0 0 1 0 0 0 0 0 0 na na na na na na na na na 5 na na na na PD6882a 0 0 3 na AML-MDS 26/11/2010 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6783a 0 1 3 na AML-MDS 10/08/2010 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 0 na na na na PD6803a 0 1 3 63 RCMD 15/06/2005 19/03/2004 15/06/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low low 0 1040 
0 14.5 1.28 50 3 2.45 2 na na na na PD6832a 1 0 3 na RA 23/12/2004 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na 2 1.04 0 na na na na PD6875a 0 1 3 na RARS 18/02/2011 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 84 na na na na PD6987a 0 1 3 na RCMD-RS 26/10/2010 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na 62 na na na na PD6853a 0 1 3 na AML-MDS 17/11/2010 na na na NA na na na na na na na na na na na na na na 0 0 1 1 0 0 0 0 0 na na na na na na na 21 9 25 na na na na PD6901a 1 2 3 78 RCMD 23/06/2005 11/04/2005 01/11/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 181 1 9.2 0.7936 40 4 3.55 0 na na na na PD7042a 1 2 3 61 RA 12/06/2003 11/01/2001 01/07/2007 0 0 "5q-, 1 doppio minuto" 1 0 1 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 low 0 496 0 10.4 3.538 378 4 2.13 8 na na na na PD6053a 1 1 3 72 RAEB 05/07/2002 05/07/2002 05/05/2003 1 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 1 na 1 368 1 8 1.083 83 8 0.67 0 na na na na PD6054a 1 1 3 69 RARS 03/07/2003 27/11/2002 02/02/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 na 0 11 2.76 285 1 0.54 44 na na na na PD6997a 1 1 3 72 RA 09/04/2008 09/01/2003 16/12/2010 0 0 "12p-, 5q-" 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 low 0 283 0 8.2 2.16 292 3 4.56 0 na na na na PD6830a 0 1 3 61 AML-MDS 31/05/2004 15/12/1997 31/05/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 na 0 8.6 2.5 150 21 0.85 45 na na na na PD6822a 1 1 3 86 RCMD 24/12/2002 19/12/2002 23/06/2004 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 25 1 8 1.52 91 2 0.56 0 na na na na PD6852a 1 2 3 49 RCMD-RS 23/06/2010 26/03/2008 25/07/2010 0 0 riarr 7p 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 1 1000 0 7 2.5 300 3 3 86 na na na na PD6870a 1 1 3 59 AML-MDS 
24/04/2008 02/11/1999 02/06/2008 1 0 "5q-, iso Xp" 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 high 1 746 1 7.8 0.381 18 21 0.92 3 na na na na PD6790a 1 2 3 49 5q- 16/02/2004 09/10/2000 06/01/2009 0 20/11/2008 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low low 1 1240 0 7.4 2.756 264 2 6.69 26 na na na na PD6055a 1 0 3 52 RAEB 2 04/03/2003 04/03/2003 04/04/2003 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 91 1 9 1.7 109 11 7.33 0 na na na na PD6949a 1 1 3 77 RAEB 1 25/06/2006 01/06/1996 25/07/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 int-1 high 1 3367 0 7.9 1.568 219 6 2.57 0 na na na na PD6056a 0 2 3 62 RAEB 2 09/09/2003 15/05/2000 15/12/2009 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 na 0 na na na 12 6.14 0 na na na na PD6926a 1 1 3 51 RCMD 04/10/2002 04/10/2002 22/01/2003 1 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 178 1 9 0.04 119 4 0.35 0 na na na na PD6057a 1 1 3 68 RA 01/04/2004 15/10/1999 01/01/2009 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low low 1 na 0 na na na 2 7.33 1 na na na na PD6785a 0 1 3 49 RAEB 2 27/09/2007 21/01/2002 28/04/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 256 0 7.5 7.17 48 14 1.56 0 na na na na PD6058a 1 1 3 76 RCMD 20/05/2003 13/06/2001 01/02/2005 1 23/12/2004 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 na 1 8 0.68 190 1 5.67 0 na na na na PD6780a 1 1 3 57 RCMD-RS 13/05/2004 09/03/2001 11/05/2005 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 1 802 1 7.9 0.306 39 1 4 40 na na na na PD6059a 0 1 3 84 RCMD-RS 06/05/2008 15/07/2005 17/11/2008 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 na 0 na na na 1 1 15 na na na na PD6975a 0 1 3 69 RAEB 2 23/04/2008 27/05/2002 19/01/2010 1 0 na na na na na na na na na na na na na na 0 1 0 1 0 0 0 0 0 na very high 1 3840 1 6.7 1.02 185 15 9 5 na na 
na na PD6856a 1 2 3 42 RA na 13/01/2003 17/09/2003 0 normal na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 1 116 1 8 2880 41000 1 0.7 0 na na na na PD6994a 0 1 3 62 5q- 09/01/2007 04/07/2001 28/09/2006 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 591 0 11 1.75 361 4 1.44 69 na na na na PD6060a 1 2 3 54 RARS 11/02/2004 23/07/1980 26/04/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 na 0 na na na 2 4.88 42 na na na na PD6808a 0 1 3 67 CMML 05/05/2004 29/10/2002 12/09/2005 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 154 0 11.8 1.32 132 5 2.57 0 na na na na PD6815a 0 1 3 19 RCMD 19/11/2008 15/03/2000 09/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 1157 1 7.5 1.2 24 2 2.33 0 na na na na PD6061a 1 2 3 64 RCMD 22/06/2004 23/12/2002 18/02/2010 0 0 7q- 2 0 0 1 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 0 37 0 11.2 1.472 121 2 3.55 7 na na na na PD6798a 0 1 3 63 5q- 31/05/2004 16/10/2003 17/01/2010 1 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 137.5 0 10.3 3.9 99 1 1.33 75 na na na na PD7038a 1 1 3 77 5q- 03/03/2003 23/11/1998 06/07/2003 1 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 low 1 928 1 6.2 1.391 257 4 1.7 0 na na na na PD6062a 1 1 3 78 RCMD-RS 20/05/2003 20/05/2003 29/03/2010 0 0 12p- 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 287 0 8.1 2.298 604 0 0.61 36 na na na na PD6063a 1 1 3 54 RA 20/02/2003 04/04/2001 20/01/2005 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 na 0 na na na 1 na 0 na na na na PD7001a 1 1 3 65 RARS 09/11/2005 09/11/2005 14/02/2006 0 03/02/2006 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 261 0 10 5.299 219 3 6.69 81 na na na na PD6064a 0 1 3 53 RCMD 06/07/2004 15/10/2002 15/12/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 na 0 7.7 5.65 573 4 1 0 na na na na PD6892a 1 0 3 35 RCMD 
02/10/2002 01/10/2000 21/01/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 61 1 11.8 1.496 79 3 3.35 0 na na na na PD6065a 1 1 3 66 RARS-T 22/04/2004 26/03/2003 30/05/2010 0 0 12p- 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 na 0 9.9 3.477 651 3 1.5 34 na na na na PD6066a 1 0 3 46 RA 18/04/2002 15/07/2000 18/04/2002 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 1370 0 9.4 2 268 2 0.49 8 na na na na PD6067a 1 2 3 67 RCMD-RS 13/02/2004 21/07/2003 06/04/2005 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 na 0 na na na 0 1.08 79 na na na na PD6792a 1 1 3 40 RCMD 10/09/2003 01/01/2000 27/01/2004 0 0 riarr 10p e X 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 na 0 11.4 1.144 147 1 na 0 na na na na PD6068a 1 2 3 64 RARS 08/05/2003 08/05/2003 21/01/2007 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 167 0 8.1 2.685 435 4 1.38 68 na na na na PD6069a 1 0 3 26 RA 25/03/2003 15/04/1981 25/03/2003 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 na 0 na na na 1 na 0 na na na na PD6070a 1 1 3 62 RCMD-RS 17/07/2003 01/03/2003 01/08/2007 0 0 "monosomy 3, 1 mar" 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 102 1 9.9 0.783 330 1 3.17 30 na na na na PD6071a 1 2 3 66 RAEB 2 20/04/2005 05/01/2005 13/09/2006 1 20/12/2005 1 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na high very high 1 728 1 7.3 0.264 75 16 9 na na na na na PD6905a 1 1 3 33 RARS-T 20/06/2003 30/06/1989 22/07/2003 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low 0 360 0 10.1 2.5 800 1 na 30 na na na na PD7014a 1 2 3 84 RA 19/04/2004 04/07/2003 01/04/2007 0 0 del16q na 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 low 0 2910 na 11.3 1037 63000 1 1.33 0 na na na na PD6072a 0 1 3 68 RAEB 19/07/2004 29/01/2003 02/12/2004 1 0 anomalies cr 7 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 very high 1 
na 1 na na na 5 3.35 10 na na na na PD6050a 1 1 3 75 RAEB 2 28/12/2004 16/12/2004 10/06/2005 1 13/05/2006 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na very high 1 536 1 8.7 0.174 21 18 2.85 na na na na na PD6073a 1 2 3 73 RAEB 11/05/2004 12/05/2003 22/07/2008 0 24/10/2006 1 riarr 1p 1 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 high 0 na 0 12.1 0.96 136 9 1.33 0 na na na na PD6797a 0 1 3 67 AML-MDS 10/05/2004 09/07/2003 27/09/2004 1 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 0 na 1 8.4 1.465 61 54 2.85 0 na na na na PD6074a 1 1 3 60 RARS 30/04/2008 21/03/2003 31/12/2010 0 0 20q- 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 1 na 1 9.6 1.019 139 4 0.96 50 na na na na PD6971a 0 0 3 57 RARS 02/08/2007 10/04/1996 14/12/2006 0 0 t(3;9) 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 na 0 9.8 2.352 196 4 2.45 60 na na na na PD6820a 1 0 3 39 AML-MDS 04/01/2005 01/12/2004 15/01/2005 1 15/03/2005 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 65 0 8.9 1.833 166 27 2.03 12 na na na na PD6075a 0 1 3 66 RA 13/05/2004 22/05/2003 16/01/2008 1 01/08/2007 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 na 0 8.6 6.24 244 4 9 na na na na na PD6793a 1 1 3 72 CMML 04/02/2004 16/09/2003 07/05/2004 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 123 0 7.9 13.44 134 4 na 0 na na na na PD6076a 0 2 3 82 RCMD-RS 12/05/2004 12/05/2004 12/05/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 1 na 1 na na na 1 0.54 92 na na na na PD6960a 1 1 3 71 5q- 24/08/2006 30/06/2003 09/05/2007 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 very low 0 500 1 8.8 0.9968 228 4 4.88 52 na na na na PD6969a 1 1 3 73 RAEB 2 26/04/2007 06/04/2007 29/05/2007 0 0 46 XX na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 345 na 7.6 1.34 15 12 2.03 0 0 1.5 3 1 PD6077a 1 1 3 37 RCMD 05/07/2004 15/07/2003 05/05/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 low low 0 22 0 10.7 2.6 77 1 6.69 0 na na na na PD6078a 1 2 3 45 RAEB 26/02/2004 01/07/2003 28/04/2004 0 28/04/2004 1 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 na na na na na na na na na int-2 high 0 na 1 9.2 0.608 202 7 5.67 12 na na na na PD6079a 0 1 3 30 RAEB 2 09/02/2009 09/02/2009 22/05/2009 0 0 t(3;21) 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 184 0 12 2.73 35 11 na 15 na na na na PD6934a 1 1 3 53 RAEB 2 03/03/2004 03/03/2004 01/06/2004 0 10/05/2004 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 169 0 10 1.8816 81 18 0.75 66 na na na na PD6806a 0 1 3 57 RA 12/03/2004 11/03/2004 12/03/2004 0 0 "Y-, der(11q), der(1)" 2 0 0 0 0 1 0 0 0 0 1 0 1 na na na na na na na na na int-1 intermediate 0 na 0 10 2.5 150 1 4.88 0 na na na na PD7045a 0 1 3 36 RAEB 2 07/04/2004 06/04/2004 17/05/2004 0 0 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 high very high 0 na 0 10 2 100 11 3.17 54 na na na na PD6811a 0 2 3 83 RCMD 07/04/2004 05/02/2004 01/03/2005 1 0 t(1;6) 1 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 high 1 na 1 9.4 1.617 222 1 1.7 0 na na na na PD6823a 0 1 3 68 AML-MDS 12/07/2004 19/03/2004 30/06/2008 1 0 riarr 11p 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 na 0 173 1 11.4 1.0982 90 41 3.35 1 na na na na PD6080a 0 1 3 82 RCMD 06/06/2005 06/06/2005 30/06/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 447 0 10 1.0504 186 2 na 0 na na na na PD6786a 0 1 3 81 RCMD 27/04/2004 27/04/2004 24/05/2004 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 1405 0 9.7 4.05 705 3 9 0 na na na na PD6824a 1 1 3 55 RCMD 23/09/2004 11/12/2000 17/07/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 393 1 9.2 0.924 49 0 na 0 na na na na PD6081a 1 1 3 80 RCMD 30/03/2004 17/01/2003 30/03/2004 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 0 na 1 11.1 1.325 60 2 3.17 7 na na na na PD6837a 1 1 3 65 5q- 09/11/2004 20/10/1999 
15/08/2005 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 low 1 2758 1 7.4 0.851 266 2 9 0 na na na na PD6831a 1 1 3 55 RAEB 1 07/04/2005 15/03/2005 07/04/2005 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 intermediate 0 65 0 10.6 1.9 334 6 4.56 11 na na na na PD6935a 0 2 3 na RAEB 2 19/04/2004 17/01/2004 01/07/2004 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 2910 na 10.5 270 23000 na 9 0 na na na na PD6082a 0 1 3 71 RCMD 20/04/2004 15/12/2002 12/05/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 326 1 13.1 0.925 77 4 1.38 0 na na na na PD6083a 1 1 3 82 RARS 14/04/2004 14/04/2004 01/08/2007 0 0 "12p-, riarr 11q" 1 0 0 0 0 1 1 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 118 1 8.6 1.6146 128 4 1.38 15 na na na na PD6084a 1 1 3 61 RCMD-RS 10/05/2004 20/06/2003 20/07/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 7 0 10.1 2.83 444 3 3.55 15 na na na na PD7004a 0 2 3 54 RCMD 22/04/2004 21/03/2003 22/04/2004 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 0 2910 na 10.5 270 23000 12 9 0 na na na na PD6944a 1 1 3 61 RCMD 14/04/2004 15/06/1998 15/06/2004 0 0 "45 XX,-2,del(5q)" na 0 1 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-1 intermediate 0 113 na 9.8 2.65 111 0 6.69 0 0.5 0 1 0 PD6085a 0 1 3 76 RARS 13/05/2004 13/05/2004 01/08/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 950 0 9 3.762 302 1 3 78 na na na na PD6086a 0 1 3 72 RA 30/06/2004 05/10/2000 29/11/2004 0 0 Y- 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 int-1 low 1 na 1 na na na 3 1.17 na na na na na PD6912a 0 2 3 75 AML-MDS 01/06/2004 22/01/2004 01/08/2004 1 0 11q- 1 0 0 0 0 1 0 0 0 0 0 0 0 na na na na na na na na na na 0 281 0 8.6 0.3276 167 40 5.67 0 na na na na PD6087a 0 0 3 82 RAEB 07/05/2004 15/10/2003 26/11/2004 1 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 na 1 na na na 5 na na na na na na PD6834a 0 1 3 58 RCMD 
09/09/2004 07/01/2002 18/11/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 304 0 15.3 1.944 174 4 4.56 0 na na na na PD6781a 0 1 3 65 RCMD 21/04/2004 06/04/2004 31/12/2006 0 0 t(7;11) 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 0 na 0 12.7 12.616 87 4 2.85 0 na na na na PD6821a 0 1 3 78 RCMD 03/05/2004 19/12/2003 15/05/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 555 1 8.2 2.64 72 3 2.13 0 na na na na PD7007a 1 2 3 89 RAEB 2 16/06/2005 30/06/2004 17/01/2006 1 21/12/2005 1 anomalies cr 7 2 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na high very high 0 91.7 0 10.3 1.3156 104 11 1.22 22 na na na na PD6867a 0 2 3 63 AML-MDS 15/06/2004 07/01/2004 15/06/2004 0 0 normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na 0 na 1 10.1 920 45000 46 1.33 0 na na na na PD6088a 0 1 3 64 RAEB 2 24/06/2004 15/02/2004 15/07/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 1700 1 8.3 0.479 19 13 2.23 22 na na na na PD6089a 0 0 3 69 RA 28/06/2004 28/06/2004 25/10/2007 0 0 7q- 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 26 0 13.3 6.107 225 1 2.45 0 na na na na PD6090a 0 1 3 62 RCMD-RS 24/06/2004 24/06/2004 14/02/2010 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 1360 0 7.5 3.9196 106 3 3.76 29 na na na na PD6787a 0 1 3 72 CMML 01/09/2004 15/09/1998 05/05/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 394 1 9.2 5.5513 63 7 4.88 0 na na na na PD6091a 0 1 3 70 RARS 06/07/2004 06/07/2004 19/08/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 1450 0 10.4 2.37 227 4 2.03 80 na na na na PD6092a 1 1 3 76 RARS 20/07/2004 15/11/2003 25/06/2008 0 0 20q- 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 low very low 0 564 0 9.1 2.468 166 1 0.61 49 na na na na PD6093a 0 1 3 79 RCMD 06/09/2005 30/06/2000 17/04/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 na 0 na na na 4 6.69 2 na na 
na na PD6094a 0 1 3 63 RARS 16/04/2008 10/05/2004 10/06/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 na 0 8.5 6.426 154 1 1 60 na na na na PD6810a 1 1 3 60 5q- 19/07/2004 15/06/2001 07/06/2004 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 161 0 10 2.5 150 1 3.76 0 na na na na PD6788a 0 1 3 57 RAEB 14/09/2004 15/09/2003 08/10/2004 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 high 1 3000 0 8.3 1.848 103 8 5.67 0 na na na na PD6812a 1 1 3 73 CMML 18/10/2004 19/04/2004 14/12/2004 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 1 693 1 6.2 1.302 121 6 9 0 na na na na PD6863a 0 2 3 68 RAEB 1 17/09/2004 01/07/2003 17/09/2004 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 1 2090 1 7.3 0.48 4 6 1.04 0 na na na na PD6095a 0 1 3 50 RAEB 2 13/09/2004 15/05/2004 04/02/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 na 1 11.9 1.905 85 12 9 0 na na na na PD6825a 1 1 3 66 RCMD 23/09/2004 15/11/2003 23/09/2004 0 0 anomalies cr 7 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 high 0 67 0 11.3 1.988 45 3 1.94 0 na na na na PD7033a 0 1 3 64 RAEB 1 01/10/2004 15/08/2004 01/10/2004 0 0 "5q-, 1 mar" 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 high 0 423 1 10.5 1.617 95 6 3 0 na na na na PD6096a 0 2 3 70 RCMD-RS 20/09/2004 20/09/2004 19/10/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low intermediate 1 762 0 8 5.538 430 1 1.38 84 na na na na PD6097a 1 1 3 56 RAEB 2 15/12/2004 06/08/2004 15/03/2006 1 17/10/2005 1 t(1q;10q) 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 high very high 1 627 1 na na na 11 1.94 4 na na na na PD6098a 0 1 3 61 RAEB 17/12/2004 17/12/2004 10/01/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 4340 0 8.3 2.101 465 6 3 60 na na na na PD7031a 0 1 3 71 RCMD 16/12/2004 15/11/2004 05/01/2006 1 19/12/2005 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 
672 1 8.6 0.442 148 3 1.5 1 na na na na PD7035a 1 2 3 33 RAEB 2 16/08/2004 15/10/2003 08/02/2011 1 24/09/2004 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 8 1 9.9 0.139 84 19 3 0 na na na na PD6782a 0 1 3 66 RA 11/05/2004 15/04/2004 11/03/2005 1 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 1450 1 7.3 0.816 24 4 0.2 0 na na na na PD6099a 0 1 3 61 RCMD 02/02/2006 02/12/2003 12/11/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 4698 1 9.2 1.492 40 3 9 6 na na na na PD6100a 1 1 3 84 RARS 23/11/2004 04/11/2004 24/11/2004 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 327 0 9.5 4.5346 36 1 6.69 47 na na na na PD6101a 0 1 3 72 RAEB 07/10/2004 07/10/2004 04/03/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 526 0 10.3 1.284 187 7 5.25 1 na na na na PD6102a 0 1 3 63 RA 21/10/2004 15/09/2004 08/07/2007 1 06/10/2006 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 858 0 7.9 2.33 189 4 9 na na na na na PD6816a 1 1 3 61 RARS-T 03/11/2004 03/11/2004 21/05/2010 0 0 12q- 1 0 0 0 0 0 1 0 0 0 0 0 0 na na na na na na na na na int-1 0 700 0 10.7 4.617 787 1 1.44 72 na na na na PD6968a 1 1 3 74 AML-MDS 21/07/2005 15/10/2004 01/12/2005 1 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-2 1 5200 1 7.5 0.596 54 21 9 6 na na na na PD6103a 1 2 3 62 RARS 19/11/2004 09/10/2003 23/12/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 404 0 8.3 2.925 254 1 0.85 68 na na na na PD6814a 0 1 3 75 RCMD 21/07/2004 15/01/2004 12/08/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 1 1810 1 9.2 0.7296 24 1 4.56 0 na na na na PD6104a 0 2 3 63 RCMD-RS 10/02/2004 07/06/2001 10/02/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low low 0 315 0 8.4 4.144 509 1 1.94 62 na na na na PD6835a 1 1 3 46 RARS-T 20/10/2004 20/10/2004 18/12/2008 0 0 Normal 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 502 0 10.1 4.9042 543 3 1.44 58 na na na na PD6784a 1 1 3 82 CMML 15/11/2004 15/11/2004 15/11/2004 0 0 "12p-, 20q-" 1 0 0 0 0 0 1 0 0 1 0 0 0 na na na na na na na na na int-2 0 1070 1 9.1 8.5171 17 7 1.78 0 na na na na PD7021a 1 1 3 89 RARS-T 24/10/2005 11/10/2005 20/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 243 0 10.2 3.8 456 2 1.33 87 na na na na PD6794a 0 2 3 63 RARS-T 05/11/2004 30/06/1999 23/07/2010 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low 1 2770 0 8.5 2.45 500 1 1.63 91 na na na na PD6105a 1 1 3 31 RAEB 04/01/2005 27/12/2004 14/04/2005 0 0 "5q-, 3q-" 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 1 379 1 5.6 12.594 90 7 0.72 45 na na na na PD6106a 0 1 3 66 RAEB 27/01/2005 01/11/2004 23/03/2005 0 23/03/2005 1 anomalies cr 7 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 high 0 1140 1 9.5 1.86 90 5 9 0 na na na na PD6817a 1 0 3 77 5q- 15/02/2005 30/06/2003 15/02/2005 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 442 0 10.1 1.4 192 0 na 0 na na na na PD6801a 1 2 3 56 5q- 13/12/2004 26/09/1994 14/09/2009 0 23/06/2009 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 1 1030 1 8.7 1.088 380 1 3.76 33 na na na na PD6107a 1 1 3 79 RARS 30/04/2008 01/09/2001 01/08/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 na 0 8.2 3 250 1 0.92 70 na na na na PD6802a 0 1 3 57 RARS-T 17/02/2005 12/08/1999 17/03/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 1044 0 9.4 2.5 500 2 1.22 86 na na na na PD6108a 1 2 3 74 RA 25/02/2005 01/11/2004 01/04/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 1343 0 10.8 4.2 252 4 2.13 0 na na na na PD6921a 1 1 3 79 RA 01/08/2006 18/05/2005 01/08/2006 0 0 46 XX na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 32 na 9.1 3.13 721 1 5.67 0 0 0 0 0 PD6109a 0 2 3 28 RAEB 2 07/04/2004 13/02/2004 15/07/2004 1 
06/07/2004 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 1 na 1 9.6 1.83 34 15 9 0 na na na na PD6942a 0 1 3 78 RAEB 2 12/05/2010 30/03/2004 14/06/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 135 1 9.98 0.675 37 12 0.67 0 na na na na PD6110a 1 1 3 74 RA 19/02/2007 06/02/2004 15/11/2007 0 0 na na na na na na na na na na na na na na na na na na na na na na na low low 1 106 0 9.8 6.171 521 0 0.75 0 na na na na PD7039a 0 2 3 56 RAEB 2 08/03/2005 01/10/2004 01/01/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 1 1050 1 4.5 0.1708 43 15 2.85 0 na na na na PD6977a 0 1 3 71 RAEB 2 15/03/2005 03/11/2004 15/07/2009 0 08/11/2007 1 46 XY na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 236 na 3.2 0.81 142 18 na 0 0 1.5 3 0 PD6111a 1 2 3 47 RCMD 14/03/2005 15/06/1995 12/04/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 low 0 35 1 8.1 6.912 38 1 2.33 na na na na na PD6804a 1 0 3 75 RCMD 01/03/2005 01/03/2005 12/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 136 0 11.3 6.104 73 1 9 0 na na na na PD6112a 1 1 3 57 RAEB 28/04/2005 24/09/1998 18/05/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 154 0 10.6 5.678 145 5 3.55 0 na na na na PD7006a 0 1 3 58 RAEB 1 29/04/2005 30/06/1998 18/05/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 2622 1 11.4 0.4 42 8 5.25 0 na na na na PD6878a 0 1 3 65 RARS-T 08/06/2006 06/05/2005 14/01/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 598 0 11.2 3.022 461 2 3.35 20 na na na na PD6995a 0 2 3 54 RCMD 07/06/2005 07/06/2005 07/06/2007 0 25/01/2007 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 1 1160 1 8.8 0.378 45 3 0.56 5 na na na na PD6113a 1 1 3 64 RAEB 2 21/10/2005 22/09/2005 23/12/2005 0 16/12/2005 1 2q- 1 0 0 0 0 0 0 0 0 0 0 1 0 na na na na na na na na na int-2 high 0 349 
0 12.1 0.13 117 19 1.04 26 na na na na PD6865a 0 1 3 53 RA 27/09/2005 05/02/2005 01/04/2007 0 0 9q- 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 low 0 223 0 12.5 2.142 220 2 4.26 0 na na na na PD7027a 0 1 3 73 AML-MDS 29/07/2005 12/08/2003 30/03/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 1 1770 0 7.9 1.2 164 35 4.56 0 na na na na PD6114a 0 1 3 87 RCMD 04/10/2005 04/10/2005 09/11/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 na 1 9 1.5 90 2 5.67 0 na na na na PD6923a 0 1 3 67 RCMD 16/11/2005 15/05/1997 19/01/2005 0 0 "5q-, 11q-" 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 int-1 intermediate 0 21 1 12.8 0.925 25 3 3.35 0 na na na na PD6978a 0 0 3 66 RCMD 28/10/2005 30/07/2005 19/11/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 329 1 10.7 1.527 25 1 3.17 0 na na na na PD6925a 1 1 3 76 RCMD 01/09/2004 01/07/2004 01/03/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 224 1 8.8 0.4906 42 4 2.57 1 na na na na PD8934a 0 1 3 na RAEB 1 14/11/2005 15/05/2005 01/12/2005 0 0 na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6791a 0 1 3 76 RCMD 09/06/2004 09/06/2004 09/07/2004 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 na 0 10.5 1.504 188 3 4 0 na na na na PD6917a 0 1 3 72 RAEB 2 24/11/2005 07/10/2005 16/12/2005 0 0 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 high very high 1 1100 1 8.9 1.747 16 15 0.67 10 na na na na PD6779a 1 1 3 53 AML-MDS 15/06/2005 30/06/1998 15/07/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 1 na 0 7.4 2.5 113 21 2.13 76 na na na na PD6872a 0 1 3 60 RCMD 07/12/2005 23/11/2005 23/10/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 615 1 12.9 0.999 97 3 4.26 6 na na na na PD6115a 0 1 3 69 RCMD 25/01/2010 16/12/2005 18/02/2010 0 0 na na na na na na na na na na na na na na 0 1 0 0 0 0 0 0 0 na 1 1930 1 8.09 2.552 95.1 
0 4 0 na na na na PD7017a 1 1 3 65 RAEB 1 08/06/2006 19/12/2005 08/06/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 high 1 615 1 7.1 0.4 22 7 0.39 20 na na na na PD6983a 0 1 3 42 RAEB 1 29/11/2005 19/11/2005 11/08/2006 0 27/03/2006 1 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 266 1 12.6 1.675 66 6 8.09 1 na na na na PD6116a 1 1 3 30 RCMD 24/11/2005 26/10/2005 23/02/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 307 1 7.3 1.507 30 0 0.64 5 na na na na PD8936a 0 1 3 na RARS 05/12/2005 30/06/2001 10/07/2006 0 0 na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD6846a 1 1 3 54 RCMD-RS 21/01/2005 21/01/2005 21/09/2008 0 0 11q- 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 30 0 13.9 1.05 126 4 2.7 17 na na na na PD6914a 0 1 3 48 RAEB 2 12/05/2005 12/05/2005 26/11/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 221 1 14.4 1.762 80 19 1.04 0 na na na na PD6805a 0 1 3 74 RCMD 07/10/2005 06/09/2005 27/09/2006 0 0 "trisomy 8, 5q-" 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 high 1 181 0 7.2 1.9824 297 3 5.25 0 na na na na PD7010a 0 1 3 71 AML-MDS 07/06/2005 08/10/2003 27/03/2006 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 0 na 0 11 0.227 93 32 1.63 0 na na na na PD6117a 0 1 3 71 RAEB 2 24/11/2005 24/11/2005 16/12/2005 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 486 1 9.3 0.95 60 17 2.33 0 na na na na PD6800a 0 1 3 50 RA 28/10/2004 15/12/2002 15/09/2005 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 296 0 6.9 2.145 323 2 0.92 8 na na na na PD6118a 0 2 3 60 RCMD-RS 22/09/2004 15/02/2003 24/07/2007 0 04/06/2007 1 11q- 1 0 0 0 0 1 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 0 434 0 9.5 2.995 218 3 2.85 39 na na na na PD6119a 0 1 3 59 RCMD-RS 29/09/2004 09/09/2004 22/05/2007 1 0 
17p- 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 int-1 intermediate 1 821 1 7 1.72 275 3 3.17 36 na na na na PD6120a 1 1 3 69 RCMD-RS 21/02/2006 21/02/2006 15/03/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 260 0 11.1 3.259 102 3 0.67 88 na na na na PD7029a 1 1 3 60 5q- 10/02/2006 15/02/2002 19/06/2009 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low very low 0 na 0 11.1 3.1616 536 3 3.35 68 na na na na PD8935a 0 1 3 na RAEB 1 02/02/2006 15/04/2005 02/03/2006 0 0 na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD8938a 1 1 3 na RA 26/01/2006 10/02/2005 09/05/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6918a 1 1 3 42 RARS 05/09/2007 07/05/1991 14/10/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 1076 0 11.2 2.652 305 2 1.5 60 na na na na PD6850a 1 1 3 74 RCMD-RS 10/06/2009 08/09/2005 09/07/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 882 0 8.9 4.347 143 2 na 30 na na na na PD6981a 1 1 3 71 RA 15/02/2006 04/07/1997 22/02/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 very low 0 na 1 10.8 1.47 70 1 5.67 6 na na na na PD6838a 0 0 3 71 CMML 05/04/2005 06/06/2002 15/08/2005 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 int-1 1 na 1 7.2 0.504 69 4 8.09 0 na na na na PD6862a 1 1 3 78 RCMD 13/10/2004 27/09/2004 26/11/2004 1 01/11/2004 1 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 int-2 high 1 1800 1 7.2 1.035 183 3 0.23 5 na na na na PD6826a 0 1 3 68 RARS 22/12/2004 09/07/2003 06/04/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 1280 0 7.6 2.5 150 2 1.38 78 na na na na PD6959a 0 1 3 54 RCMD-RS 23/01/2007 28/01/2004 15/02/2010 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 756 1 8.4 1.005 305 0 0.96 89 na na na na PD6799a 1 1 3 54 RCMD 17/06/2004 16/06/2004 23/11/2006 0 0 Normal 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 35 1 9.5 2.47 72 0 1 0 na na na na PD7005a 0 2 3 65 AML-MDS 28/08/2008 16/12/2004 05/11/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 na 1 13.7 0.89 85 22 4.56 0 na na na na PD6789a 0 1 3 82 CMML 19/05/2004 19/05/2004 08/06/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 221 0 11.7 2.791 48 4 3.76 0 na na na na PD6121a 1 1 3 57 RAEB 05/09/2005 05/09/2005 14/12/2006 1 16/08/2006 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 high 1 3990 1 8.4 0.826 316 7 4.56 0 na na na na PD7040a 1 1 3 60 AML-MDS 04/05/2006 15/03/2006 12/07/2006 0 0 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 high 0 649 1 8.7 0.414 46 21 1.78 42 na na na na PD6946a 0 1 3 61 CMML 17/03/2006 15/09/2005 17/03/2006 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 int-1 0 179 1 8.7 10.695 63 4 2.33 0 na na na na PD6948a 0 1 3 74 RARS 09/05/2006 09/05/2006 27/06/2008 0 25/06/2008 1 "7q-, 20q-" 2 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 int-1 intermediate 0 213 0 8.7 2.725 251 4 1.13 66 na na na na PD6970a 0 1 3 55 CMML 11/04/2006 04/04/2006 15/06/2007 0 0 46 XY na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 0 397 na 14.6 1.02 86 4 4.26 72 na na na na PD6122a 0 1 3 68 RCMD-RS 24/07/2008 14/04/2006 13/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 933 0 10.1 6.146 401 3 0.54 75 na na na na PD6123a 1 2 3 74 RA 10/06/2004 27/02/2004 28/06/2006 0 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 10 0 9.2 4.158 198 1 0.92 0 na na na na PD7036a 0 1 3 72 RAEB 2 21/12/2005 28/04/2005 12/07/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 615 0 12 1.2987 160 12 2.85 0 na na na na PD6854a 0 1 3 56 AML-MDS 10/09/2004 01/08/2004 26/04/2005 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 1 310 1 5.4 0.95 45 63 4.56 0 na na na na PD6873a 0 1 3 49 RARS-T 12/07/2006 07/07/2006 31/08/2006 0 0 Normal 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 2010 0 6 7.686 943 3 3.76 80 na na na na PD6899a 1 1 3 na RA 18/11/2006 30/06/2000 07/12/2010 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6982a 1 1 3 55 RCMD-RS 20/07/2006 09/11/2005 16/08/2006 0 0 "9q-, monosomy 17 and 18, 1 mar" 2 0 0 0 0 0 0 1 0 0 0 1 1 na na na na na na na na na int-2 high 1 1900 1 7 2 36 2 0.37 77 na na na na PD6881a 1 1 3 17 RARS-T 31/03/2003 31/03/2003 24/03/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low 0 65 0 12.3 4.9 977 0 5.67 24 na na na na PD6955a 0 1 3 64 RAEB 1 13/06/2006 21/03/2006 12/07/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 high 1 2350 1 6.5 0.45 24 8 7.33 15 na na na na PD7022a 0 1 3 63 RARS 15/05/2006 14/10/2004 20/06/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 1 3150 1 7.9 1.031 114 1 1.78 63 na na na na PD6828a 0 1 3 70 RARS 07/09/2004 07/09/2004 04/08/2006 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 1830 1 7.1 1.777 221 1 0.67 78 na na na na PD6124a 0 1 3 62 RCMD-RS 07/10/2004 01/10/2003 17/12/2009 0 01/10/2009 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 na 1 na na na 1 3.17 16 na na na na PD6855a 1 1 3 70 RA 06/03/2006 06/03/2006 13/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 162 0 13.5 7.756 129 4 2.57 0 na na na na PD6125a 0 1 3 63 RCMD 16/12/2005 25/07/2005 16/02/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 214 1 10.4 1.157 85 3 2.57 0 na na na na PD7015a 0 0 3 79 RAEB 1 25/09/2006 14/12/2004 01/08/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 intermediate 0 688 1 9.6 0.44 89 5 2.13 0 na na na na PD6126a 0 1 3 62 RCMD-RS 19/06/2006 15/02/2006 12/07/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 627 1 7.9 1.109 234 4 6.69 30 na na na na PD6874a 1 2 3 69 RA 30/03/2006 22/12/2005 
26/03/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 12 0 9.8 2.88 363 2 1.94 0 na na na na PD6962a 0 1 3 72 RCMD 29/09/2006 03/10/2005 19/10/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 1160 1 7.5 1.1895 38 4 2.33 0 na na na na PD7032a 1 1 3 50 AML-MDS 10/03/2006 15/10/2005 27/04/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 1 16 1 6.8 0.378 148 21 1.86 0 na na na na PD6913a 0 1 3 65 CMML 01/04/2009 22/11/2005 04/02/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 233 1 9.8 1.12 83 2 4.56 0 na na na na PD7003a 0 0 3 65 RCMD 13/03/2007 14/12/2006 20/03/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 1030 0 8.7 2.8512 203 3 4 0 na na na na PD7041a 0 0 3 84 RARS 13/03/2007 12/03/2007 27/06/2007 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 208 0 10.1 3.1878 202 0 1.86 15 na na na na PD6127a 1 0 3 na RCMD 08/02/2007 21/06/2006 16/07/2010 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6128a 0 1 3 69 RAEB 2 24/01/2007 24/01/2007 12/02/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 222 0 15.6 2.241 115 11 2.13 0 na na na na PD6900a 0 1 3 53 RCMD 10/10/2005 07/11/2000 25/01/2006 0 0 Y- 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low low 0 237 0 15.8 2.55 63 1 2.7 0 na na na na PD6939a 1 1 3 58 RARS 29/06/2006 29/06/2006 25/11/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 356 0 9 2.64 305 0 0.61 94 na na na na PD6844a 1 1 3 37 RA 04/12/2006 08/06/1999 10/06/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 1 23 1 8.4 0.9 350 4 4 0 na na na na PD6996a 0 0 3 66 RAEB 2 04/11/2005 09/02/2000 25/09/2006 0 25/09/2006 1 "5q-, trisomy 8" 2 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 high very high 1 1135 1 6.2 1.192 121 16 2.85 72 na na na na PD6976a 0 0 3 57 RAEB 1 06/11/2006 15/09/2006 
07/02/2008 0 0 "47 XY,del(5q),+21,t(3;9)" na 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 int-2 very high 1 248 na 8 2.18 179 5 1.33 1 1 0.5 2 0 PD6129a 1 2 3 61 RARS 30/10/2006 30/06/2004 30/11/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low low 1 697 0 7.5 4.718 323 1 0.54 94 na na na na PD6130a 1 1 3 33 RAEB 2 24/01/2008 24/01/2008 10/03/2008 0 0 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-2 high 1 1420 0 9.6 2.751 571 11 1.5 2 na na na na PD6897a 1 1 3 57 AML-MDS 06/10/2010 15/12/2005 14/12/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 551 0 7.9 2.55 446 3 0.18 1 na na na na PD6131a 0 1 3 63 RCMD-RS 14/07/2006 14/06/2006 10/12/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low intermediate 1 663 0 7.4 2.386 365 1 1.5 91 na na na na PD7019a 0 1 3 68 RCMD 06/11/2006 15/03/2006 27/11/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 1 635 1 8 0.264 206 4 2.33 10 na na na na PD6963a 0 1 3 60 RAEB 2 28/02/2007 06/06/2006 22/03/2007 0 0 trisomy 21 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 high high 0 78 1 8.3 1.221 396 13 2.7 24 na na na na PD6943a 0 1 3 59 RCMD-RS 07/03/2007 17/07/2006 22/03/2007 0 0 20q- 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 low low 0 103 0 11.5 0.972 187 2 2.03 16 na na na na PD7002a 1 1 3 66 RARS 17/08/2006 17/08/2006 29/09/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 155 0 11.1 1.6 144 0 1.5 30 na na na na PD6132a 0 1 3 34 RA 25/01/2008 21/08/2006 02/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 3550 0 7.2 4.588 226 2 5.67 0 na na na na PD6927a 1 1 3 59 RARS 04/11/2010 14/09/2005 16/12/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 203 0 10.3 3.65 279 0 0.96 72 na na na na PD6966a 0 0 3 58 RAEB 2 20/12/2005 19/12/2005 24/04/2006 1 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 908 1 8.6 0.35 70 17 na 18 na na na na PD6957a 0 1 3 69 CMML 
28/10/2005 05/10/2005 01/01/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 156 0 11.3 7.803 476 2 9 0 na na na na PD6985a 1 1 3 56 RAEB 2 18/08/2005 29/06/2005 12/01/2006 0 18/12/2005 1 46 XX na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 high 0 608 na 12.3 0.59 21 18 5.25 17 0 1.5 3 1 PD6133a 1 1 3 73 RA 08/09/2004 07/11/1995 01/04/2007 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 na 0 na na na 2 na 0 na na na na PD6134a 0 0 3 39 RAEB 30/11/2010 26/06/2005 21/01/2011 0 0 "5q-, 7q-" 2 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-2 very high 1 na 0 9.4 2.645 347 9 1 0 na na na na PD6845a 1 1 3 50 5q- 28/07/2006 15/01/2001 20/06/2010 0 15/06/2010 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low low 1 558 0 7.9 1.935 305 1 4.26 0 na na na na PD6135a 0 1 3 83 RAEB 2 04/12/2006 15/10/2006 29/12/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 1 0 na 0 319 0 11.1 2.208 88 19 4.88 0 na na na na PD7025a 0 1 3 72 RAEB 2 02/04/2008 08/02/2007 01/07/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 2120 1 9.3 0.4872 79 13 2.33 0 na na na na PD6909a 1 1 3 73 RCMD 28/12/2005 20/12/2005 08/02/2007 1 08/08/2006 1 monosomy 7 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-2 high 1 247 1 7.6 0.898 43 4 2.7 0 na na na na PD8937a 0 1 3 na RCMD 19/12/2006 18/12/2006 28/02/2007 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6851a 0 2 3 64 RCMD 22/11/2006 31/03/2000 07/12/2006 1 0 na na na na na na na na na na na na na na na na na na na na na na na na 1 1500 1 6.7 0.682 100 2 0.33 8 na na na na PD6973a 1 2 3 na RCMD-RS 31/01/2007 na na na NA Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low na na na na na na na na 60 na na na na PD7023a 0 2 3 53 RAEB 1 07/03/2007 15/10/2006 30/05/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 high 1 2720 1 8 1.5 150 5 9 0 na na na na PD6809a 0 1 3 82 AML-MDS 
19/10/2004 19/10/2004 26/04/2005 1 26/04/2005 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 0 610 0 9.7 2.697 100 21 4.56 0 na na na na PD6136a 1 2 3 62 RARS 08/06/2004 30/06/1996 17/09/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low very low 0 na 0 na na na 0 na 15 na na na na PD6965a 0 1 3 70 AML-MDS 27/10/2004 01/09/2004 04/02/2005 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-2 0 1100 1 9.9 0.286 58 25 0.92 5 na na na na PD6051a 1 1 3 75 RA 27/01/2004 10/07/1997 16/05/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 na 0 na na na 1 6.69 0 na na na na PD6998a 0 1 3 65 RAEB 2 28/03/2007 15/04/2005 02/04/2007 0 0 "5q-, trisomy 11" 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-2 very high 1 541 0 6.5 1.974 122 18 5.67 0 na na na na PD6869a 1 1 3 43 AML-MDS 26/04/2007 14/02/2007 30/04/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na 1 2150 1 7.7 2.268 11 45 9 4 na na na na PD7018a 1 1 3 76 RCMD 16/04/2007 10/02/2007 17/05/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 low 0 160 1 12.9 0.422 99 1 1.5 0 na na na na PD6137a 1 1 3 56 RA 03/05/2007 28/03/2007 04/06/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 814 0 10.4 2.39 224 2 1.33 0 na na na na PD7008a 0 1 3 69 RA 04/05/2006 07/04/2003 25/05/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low low 1 500 0 8.4 2.65 284 3 8.09 0 na na na na PD6052a 0 1 3 65 RCMD 20/04/2004 05/05/2000 04/02/2010 0 0 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 int-1 high 0 na 0 na na na 1 1.94 0 na na na na PD6138a 1 0 3 57 RAEB 12/04/2007 15/09/2005 19/05/2008 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 0 364 1 8.8 1.215 77 6 3.17 0 na na na na PD7011a 1 1 3 71 RARS 19/04/2007 20/03/2007 25/05/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 343 0 9.7 4.038 251 4 1.5 83 na na na na PD6139a 1 1 3 70 RAEB 17/03/2008 
07/06/2007 05/05/2009 1 11/09/2008 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 na 0 8.1 1.94 473 7 9 0 na na na na PD6883a 1 1 3 79 RCMD 31/01/2011 01/05/2007 01/08/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 int-1 low 0 411 1 8.8 4.154 78 4 5.25 4 na na na na PD6140a 1 1 3 54 RARS 30/05/2007 09/01/2007 28/12/2009 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 149 0 10.4 2.026 219 0 1.5 40 na na na na PD6980a 0 1 3 61 RAEB 2 20/10/2006 15/09/2005 18/05/2007 0 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 114 0 10.5 5.36 38 10 4.56 8 na na na na PD7020a 0 1 3 74 RCMD-RS 07/04/2006 01/07/2005 11/07/2006 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 int-1 intermediate 1 1814 1 7.6 0.907 62 4 0.33 31 na na na na PD6941a 1 1 3 69 RCMD 18/07/2007 18/10/2006 01/08/2007 0 0 monosomy 7 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-2 high 0 na 1 13 1.4098 86 3 na 1 na na na na PD6141a 0 1 3 61 RCMD-RS 17/11/2008 26/06/2006 26/04/2010 0 0 "9q-, 13q-" 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 intermediate 0 na 0 9.5 2.855 180 4 2.33 94 na na na na PD7026a 0 1 3 63 RAEB 1 12/07/2007 15/10/2006 20/10/2008 0 18/12/2007 1 5q- 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 intermediate 0 600 1 8.9 0.725 91 8 4.88 0 na na na na PD6947a 1 1 3 77 RAEB 2 04/08/2004 01/08/2004 10/06/2007 0 04/06/2007 1 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 high 0 na 1 8.7 1.0272 169 11 4.56 0 na na na na PD6954a 1 1 3 57 RAEB 2 14/07/2006 14/07/2006 17/10/2006 0 18/09/2006 1 complex 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 high very high 0 349 1 10 0.601 11 13 6.69 0 na na na na PD6142a 1 1 3 74 RA 13/06/2007 30/06/2005 20/06/2007 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na 1 na 0 na na na 2 na na na na na na PD6143a 0 1 3 80 RAEB 2 05/09/2005 05/09/2005 19/12/2005 1 0 trisomy 8 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 high very high 1 1030 1 
9.9 0.416 52 19 9 0 na na na na PD6972a 0 1 3 53 RAEB 1 16/01/2008 23/03/2007 08/10/2008 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 high 1 3250 1 7.3 0.5292 10 8 3.76 15 na na na na PD6144a 1 1 3 78 RARS 14/08/2007 14/08/2007 13/01/2011 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 802 0 10.3 2.102 391 2 1.5 45 na na na na PD6967a 1 1 3 50 RARS-T 06/07/2007 28/04/2004 06/09/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 566 0 7 3.55 577 2 0.33 80 na na na na PD6993a 1 1 3 68 RARS 25/07/2007 16/02/2006 02/08/2007 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 656 0 8 7.5 345 2 1.5 30 na na na na PD6938a 0 1 3 70 RARS 02/08/2007 02/08/2007 18/11/2010 0 0 Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low very low 0 409 0 8.7 3.207 282 0 1.94 60 na na na na PD7113a 1 1 3 na RA 10/02/2011 09/01/2003 31/08/2012 0 0 na na na na na na na na na na na na na na 0 1 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD7114a 1 1 3 na RA 01/08/2006 18/05/2005 01/08/2006 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6206a 0 1 4 73 RA 10/12/1997 10/12/1997 20/06/2001 1 0 "46, XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.3 0.8 147 1 na 0 na na na na PD6207a 0 2 4 76 RA 02/03/1998 13/02/1998 11/12/2002 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6208a 1 2 4 80 CMML 04/03/1998 04/03/1998 06/04/1998 1 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 9.7 4 313 7 na NA na na na na PD6209a 1 0 4 63 RAEB 1 25/03/1998 25/03/1998 30/04/2007 1 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.4 1.9 227 7 na 0 na na na na PD6210a 0 1 4 72 RA 22/04/1998 22/04/1998 29/09/1999 1 0 "46, XY [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.3 2.5 16 1 na NA na na na na PD6290a 0 0 4 78 RAEB 1 
15/12/1997 15/11/1997 26/03/1999 1 NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na 10.5 0.8 57 1 na 0 na na na na PD6211a 1 1 4 83 CMML 21/05/1997 21/05/1997 06/10/2001 1 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.1 5.7 55 1 na 0 na na na na PD6212a 1 1 4 71 RAEB 1 09/06/1997 09/06/1997 17/09/1997 1 0 "46, X, del(X)(p11.2), add(16)(q12), dic(11;21)(q11;q22.3), +dic(11;21)(q11;q22.3)[10]" na 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 high na 1205 na 9.7 9.8 94 12 na 0 na na na na PD6213a 1 0 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6214a 0 1 4 69 CMML 11/08/1997 11/07/1997 13/02/1999 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na low na na na na na na 0 na NA na na na na PD6215a 1 1 4 92 RAEB 1 08/06/1998 08/06/1998 21/06/1998 1 0 "48-51, XX, +X,der(3;12)(q10;q10),-5,add (6)(p10),+8,+11,+13,+19,+1~3mar[cp15]/46,XX[3]" na 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 high na na na 8.7 0.6 32 11 na 0 na na na na PD6216a 1 0 4 67 RA 29/07/1998 10/04/1998 01/05/2011 0 0 "46, XX[10]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.1 3.7 322 1 na 0 na na na na PD6217a 0 1 4 80 RA 19/08/1998 15/05/1998 29/12/2001 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na na na na 0 na NA na na na na PD6218a 0 1 4 42 RAEB 1 24/11/1998 06/11/1998 20/05/2004 1 0 "47, XY, +r1[2]/49, XY, -7, +22, +r2, +mar 1, mar 2[5]/46, XY[13]" na 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 int-2 na na na 10.5 1.8 117 na na NA na na na na PD6219a 0 1 4 83 RA 02/12/1998 27/10/1998 25/02/1999 1 0 "46, XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11 0.9 410 1 na NA na na na na PD6291a 0 0 4 76 RAEB 1 02/12/1998 11/09/1996 27/07/1999 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 10.5 0.8 70 16 na NA na na na na PD6292a 0 0 4 84 RA 02/12/1998 10/04/1996 26/12/2000 1 0 na na 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6220a 1 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6221a 0 1 4 74 RARS 14/01/1999 14/01/1999 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6222a 1 0 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6293a 1 1 4 58 RA 30/04/1999 21/01/1999 na 0 0 [normal] na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 9 1.6 18 1 na NA na na na na PD6294a 1 0 4 78 RA 29/01/1999 11/02/1998 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 10 6.9 535 1 na NA na na na na PD6295a 1 1 4 68 RARS 29/01/1999 28/06/1991 20/03/2005 1 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.5 0.2 268 1 na NA na na na na PD6296a 1 1 4 77 RARS 03/02/1999 25/07/1997 20/03/2003 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6297a 0 1 4 76 RA 02/02/1999 20/01/1994 22/03/2000 1 0 "45,X,-Y [10]/47,XY,+c[2]/46,XY (8)" na 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.9 3.1 64 1 na NA na na na na PD6298a 1 1 4 78 RARS 02/02/1999 25/05/1990 19/09/2002 1 0 "46,XX[40]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.9 3.3 241 1 na 60 na na na na PD6299a 1 1 4 79 RA 05/02/1999 17/09/1998 23/11/1999 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 10.8 0.8 67 1 na NA na na na na PD6223a 0 1 4 na RAEB na na na na NA na na na na na na na na na na na na na na 0 0 1 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6300a 0 1 4 48 RARS 16/02/1999 04/10/1995 12/12/2003 1 0 "47,XY,+8 (25)" na 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 int-1 na na na 14.2 11 884 1 na NA na na na na PD6301a 0 1 4 88 RA 15/02/1999 30/05/1996 12/10/1999 1 0 na na na na na na na na na na na na na na 0 0 0 0 0 
0 0 0 0 na na na na 10 3.2 194 2 na NA na na na na PD6302a 1 1 4 71 CMML 18/02/1999 20/01/1997 28/08/2001 1 0 "47,XX,+c(9)/46,XX (6)" na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 low na na na 9.4 17 254 2 na NA na na na na PD6224a 0 1 4 74 RARS 22/02/1999 14/07/1995 01/05/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na na na na 0 na NA na na na na PD6303a 0 1 4 69 RA 22/02/1999 28/11/1995 26/10/2003 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 9.9 1.7 253 2 na NA na na na na PD6304a 1 1 4 73 RARS 24/02/1999 30/12/1991 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 10 4.1 384 1 na NA na na na na PD6305a 1 1 4 77 RAEB 1 24/02/1999 29/01/1999 26/08/1999 1 0 "45XX,del (5) (q?), -18 [8] / 44XX, del (5) (q?), -7, -18 [3], 44,XX, del (5) (q?), -7, -18 [3]" na 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 na na na na na na na 0 na NA na na na na PD6306a 0 1 4 43 RA 01/03/1999 01/01/1999 13/12/1999 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6307a 1 1 4 85 RA 02/03/1999 18/06/1996 12/09/2005 1 0 "46, XX, del (20)(q11) [5]/46, XX [5]" na 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 8.9 3.2 195 1 na NA na na na na PD6308a 1 0 4 28 RAEB 1 08/03/1999 22/02/1999 11/04/1999 1 0 "46, XX[10]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 6.5 0.2 16 7 na NA na na na na PD6309a 0 0 4 55 RARS 16/03/1999 24/01/1995 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 na na na na 8.3 0.6 381 2 na NA na na na na PD6225a 0 1 4 71 CMML 18/03/1999 16/03/1999 21/08/2001 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 8.2 3.5 165 3 na NA na na na na PD6226a 1 0 4 52 RAEB 1 22/03/1999 01/08/1996 04/05/2000 1 04/03/2000 1 "47, XX, del(5)(q13q33), +21[3]/45, idem, add(2)(q11), -4, add(7)(q11), add(9)(q34), -17[7]" na 0 1 1 0 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 int-1 na na na 4.9 3.6 519 10 na 0 na na na na PD6310a 0 1 4 72 RA 13/04/1999 12/04/1999 
29/02/2012 0 0 "46, XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.6 0.6 156 1 na NA na na na na PD6311a 1 1 4 78 RARS 19/04/1999 10/07/1998 na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low na na na 8.1 3.6 305 1 na 20 na na na na PD6227a 0 0 4 70 RARS 21/04/1999 20/04/1999 na 0 0 "42-60, XY, del(3)(p21), -5, -7, add(12)(p10), +mar, inc[10]" na 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 int-2 na 609 na 5.3 1 60 na na 20 na na na na PD6313a 0 1 4 66 RARS 25/04/1999 13/06/1991 02/04/2003 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 11.1 3.6 281 1 na NA na na na na PD6228a 0 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 1 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6315a 0 1 4 na RARS 27/04/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6229a 1 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6316a 1 0 4 58 RA 28/04/1999 09/07/1998 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6317a 0 1 4 na RAEB 1 02/05/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6318a 1 1 4 74 RA 11/05/1999 10/12/1996 11/01/2001 1 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 9.4 0.8 35 0 na 0 na na na na PD6230a 0 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6231a 1 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6319a 0 0 4 85 CMML 20/05/1999 24/07/1997 14/07/2004 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 9.3 0.9 212 1 na NA na na na na PD6320a 0 1 4 59 CMML 20/05/1999 10/08/1996 12/01/2001 1 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 low na na na 13 1.4 229 1 na NA na na na na PD6321a 0 0 4 64 RA 26/05/1999 05/11/1996 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6322a 0 1 4 61 RA 27/05/1999 16/05/1986 16/09/2005 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na 10.7 0.9 56 0 na NA na na na na PD6232a 1 1 4 70 RA 02/06/1999 02/06/1999 21/05/2000 1 19/01/2000 1 "46, XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.7 0.9 56 1 na 0 na na na na PD6233a 0 1 4 63 RA 03/06/1999 03/06/1999 22/04/2000 1 0 "46, XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.7 0.4 88 1 na NA na na na na PD6323a 1 0 4 na RAEB 1 02/06/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na 0 na NA na na na na PD6234a 1 1 4 76 RA 09/06/1999 15/06/1994 15/04/2001 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6474a 0 1 4 77 CMML 09/06/1999 09/06/1999 01/01/2000 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6324a 0 1 4 70 RARS 29/06/1999 23/03/1999 29/09/1999 1 0 "45,X,-Y[9]/46,XY [21]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 10.5 2.2 8 3 na NA na na na na PD6325a 1 1 4 75 RAEB 1 29/06/1999 01/04/1999 25/06/2000 1 0 (normal) na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na 9.6 6.2 79 0 na NA na na na na PD6326a 1 0 4 78 RA 12/07/1999 19/08/1998 10/08/1999 1 0 "46, XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na int-1 na na na 8.7 1.5 56 0 na 0 na na na na PD6327a 1 0 4 na 5q- 20/07/1999 na na 0 0 5q- na 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6328a 0 1 4 na RAEB 1 20/07/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6329a 1 1 4 na RA 22/07/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 na na 
na na na na na 0 na NA na na na na PD6330a na 3 4 86 RA 09/08/1999 12/07/1999 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6475a 0 1 4 34 CMML 19/08/1999 19/08/1999 na 0 0 "46, XY [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.8 1.3 25 0 na 0 na na na na PD6476a 0 1 4 85 RA 15/09/1999 15/09/1999 16/08/2005 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 341 na 9.5 5.4 305 0 na 0 na na na na PD6477a 0 1 4 77 CMML 28/10/1999 27/10/1999 05/12/2001 1 25/10/2001 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 8.1 na 188 2 na 0 na na na na PD6235a 1 1 4 70 RAEB 1 04/11/1999 03/11/1999 08/09/2000 1 0 "46,XX,del(5)(q22q32)[2]/46,idem,add(1)(p36),add(18)(p11),-21,+mar[6]/46,XX[2]" na 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 high na na na 8.6 0.4 35 14 na 0 na na na na PD6331a 1 3 4 87 RARS 25/11/1999 17/09/1999 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na na na 0 na NA na na na na PD6332a 0 1 4 na CMML 05/12/1999 na na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6333a 1 1 4 83 RARS 17/01/2000 27/12/1995 03/10/2004 1 0 "46, XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 668 na 8.6 2.4 376 0 na 20 na na na na PD6236a 0 1 4 na RAEB na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6478a 0 1 4 80 RA 15/03/2000 15/03/2000 24/05/2000 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.1 6 49 2 na 7 na na na na PD6479a 1 1 4 77 CMML 22/03/2000 22/03/2000 02/05/2001 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 low na na na 11.4 2.7 131 0 na 0 na na na na PD6237a 1 1 4 72 RA 10/04/2000 10/04/2000 09/05/2005 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11 2.4 322 3 na 0 na na na na PD6480a 1 1 4 85 RAEB 1 19/04/2000 19/04/2000 
15/01/2002 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 13 1.5 88 7 na 0 na na na na PD6481a 0 1 4 74 RAEB 1 24/04/2000 24/04/2000 10/06/2002 1 13/08/2001 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 7.8 2.7 142 10 na 0 na na na na PD6334a 0 1 4 65 CMML 26/04/2000 26/04/2000 10/03/2005 1 0 "45,X,-Y[16]/46,XY[4]" na 0 0 0 0 0 0 0 0 0 1 0 0 na na na na na na na na na int-1 na na na 10 1.8 116 8 na 0 na na na na PD6238a 0 1 4 na RA na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6482a 1 1 4 76 RARS 13/07/2000 13/07/2000 14/04/2006 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.2 1.6 195 1 na 40 na na na na PD6483a 1 1 4 69 RARS 03/08/2000 03/08/2000 15/01/2003 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 937 na 10.6 6.4 254 1 na 30 na na na na PD6239a 0 1 4 na CMML na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6240a 0 1 4 69 RARS 13/12/2000 22/11/2000 07/03/2003 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 int-1 na na na 7 2.4 36 0 na 20 na na na na PD6484a 0 1 4 81 RAEB 2 21/12/2000 21/12/2000 28/12/2002 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 int-2 na na na 8.8 1.8 21 na na 0 na na na na PD6241a 0 1 4 81 RAEB 1 17/01/2001 17/01/2001 07/11/2003 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.8 7.3 181 7 na 0 na na na na PD6242a 0 1 4 73 RAEB 1 05/03/2001 05/03/2001 19/09/2001 1 26/06/2001 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.3 3.3 30 5 na 0 na na na na PD6243a 0 0 4 59 RA 15/03/2001 15/03/2001 20/03/2011 1 0 "46,XY,+1,der(1;7)(q10;p10)[8]/46,XY[2]" na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.9 1.2 155 0 na 0 na na na na PD6244a 1 1 4 69 RA 16/04/2001 16/04/2001 10/09/2003 1 0 
"46,XX,del(20)(q11q13)[20]" na 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 low na na na 12.4 3.8 139 2 na 0 na na na na PD6245a 0 1 4 78 CMML 30/04/2001 30/04/2001 16/05/2002 1 0 "45,X,-Y[20]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.8 0.8 123 2 na 0 na na na na PD6246a 1 1 4 87 5q- 30/05/2001 30/05/2001 18/07/2002 1 0 "46,XX,del(5)(q13q33)[15]/46,XX[5]" na 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low na na na 10.4 3.9 103 2 na 0 na na na na PD6247a 0 1 4 70 RARS 05/06/2001 05/06/2001 26/08/2003 1 30/06/2003 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 1826 na 10.5 0.7 56 4 na 10 na na na na PD6485a 0 1 4 56 RA 16/07/2001 01/10/1999 27/04/2011 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na 0 na na na na PD6486a 0 1 4 56 RA 15/10/2001 26/06/2001 31/12/2004 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.6 3.3 39 4 na 0 na na na na PD6248a 1 0 4 69 RA 24/10/2001 21/12/1998 21/08/2005 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na na na na 0 na 0 na na na na PD6487a 1 0 4 61 RA 05/12/2001 05/12/2001 15/02/2012 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na 0 na na na na PD6249a 0 1 4 82 RARS 12/12/2001 12/12/2001 08/04/2003 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.1 4 288 2 na 20 na na na na PD6250a 1 1 4 85 RARS 17/12/2001 17/12/2001 14/03/2003 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.4 4 381 0 na 30 na na na na PD6251a 0 1 4 53 RAEB 1 04/02/2002 04/02/2002 06/07/2004 1 14/01/2004 1 "46,XY,+1,der(1;7)(q10;p10)[5]/46,XY,+1,der(1;7)(q10;p10)del(7)(p21.3p11.1)[15]" na 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 int-2 na 667 na 9.7 0.5 136 na na 0 na na na na PD6252a 0 1 4 80 CMML 27/02/2002 27/02/2002 07/01/2008 1 0 "45,X,-Y[12]/46,XY[8]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.1 0.4 122 0 na 0 na na na na PD6253a 1 1 4 71 CMML 
27/03/2002 24/03/1997 30/04/2004 1 22/04/2004 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6335a 0 1 4 70 CMML 21/05/2002 06/07/2001 10/06/2002 1 30/05/2002 1 "47,XY,+8[6]/46,XY[14]" na 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-1 na na na 10.5 15.8 52 4 na 0 na na na na PD6254a 0 1 4 86 RA 22/05/2002 22/05/2002 30/07/2004 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 9.3 2.4 91 4 na 0 na na na na PD6255a 0 1 4 67 CMML 24/06/2002 24/06/2002 17/01/2005 1 21/12/2004 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.9 13.4 215 2 na 0 na na na na PD6256a 1 1 4 95 CMML 26/07/2002 25/07/2002 04/09/2002 1 0 "47,XX,+8[7]/46,XX[13]" na 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 9 0 na 2 na 0 na na na na PD6257a 0 0 4 87 RA 26/07/2002 25/07/2002 20/10/2002 1 0 "45,X,-Y[6]/46,XY[14]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.5 2 153 2 na 0 na na na na PD6258a 0 1 4 71 RARS 31/07/2002 31/07/2002 22/12/2003 1 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na 8.8 3.7 92 3 na 20 na na na na PD6259a 0 1 4 77 RA 21/08/2002 21/08/2002 01/12/2002 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.3 2.5 268 1 na 0 na na na na PD6260a 1 1 4 68 RA 26/08/2002 26/08/2002 22/04/2011 0 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.5 4.7 282 0 na 0 na na na na PD6261a 0 1 4 82 CMML 11/09/2002 11/09/2002 29/08/2003 1 29/08/2003 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 12.4 3.5 88 0 na 0 na na na na PD6262a 1 1 4 77 RARS 16/09/2002 16/09/2002 24/04/2008 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.7 4.7 223 0 na 20 na na na na PD6263a 0 1 4 73 RA 30/09/2002 30/09/2002 11/10/2002 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.1 2.9 72 0 na 0 na na na na PD6264a 0 1 4 81 RA 28/10/2002 
28/10/2002 26/04/2005 1 0 "45,X,-Y[10]/46,XY[10]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.6 4.6 419 3 na 0 na na na na PD6488a 1 1 4 82 RA 08/11/2002 08/11/2002 20/01/2003 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 9.8 9 35 1 na 0 na na na na PD6489a 1 0 4 82 RA 13/11/2002 13/11/2002 23/01/2012 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na 10.9 5.8 194 0 na 0 na na na na PD6490a 1 1 4 83 RARS 04/12/2002 04/12/2002 21/06/2009 1 0 "46,XX,inv(2)(p12q21)[20]" na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 low na na na 8.7 3.2 326 0 na 30 na na na na PD6265a 0 1 4 68 RAEB 1 18/12/2002 18/12/2002 04/05/2003 1 0 "46,XY,del(11)(q13)[13]/46,XY[7]" na 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.9 1.4 106 10 na 0 na na na na PD6491a 1 1 4 89 RA 08/01/2003 08/01/2003 10/07/2004 1 07/06/2004 1 46XX[20] na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10 3.6 120 0 na 0 na na na na PD6492a 0 1 4 74 RARS 03/02/2003 30/06/2000 04/12/2004 0 04/12/2004 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6493a 1 0 4 86 RA 03/02/2003 03/02/2003 23/12/2009 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 12.9 0.3 293 1 na 0 na na na na PD6494a 1 1 4 75 RA 11/02/2003 11/02/2003 12/07/2004 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.3 4.4 104 0 na 0 na na na na PD6495a 0 1 4 69 5q- 12/12/2003 12/02/2003 03/12/2010 0 0 "46,XY,del(5)(q14q34)[17]/46,XY[3]" na 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low na 707 na 9.6 2.5 306 1 na 0 na na na na PD6496a 0 1 4 67 RA 18/02/2003 02/08/1996 15/03/2007 1 02/02/2006 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 int-1 na na na na na na 0 na NA na na na na PD6266a 0 1 4 72 RAEB 1 02/04/2003 02/04/2003 23/06/2003 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6267a 0 0 4 82 RA 18/04/2003 18/04/2003 
16/08/2003 1 30/07/2003 1 "45-46,XY,+Y,add(1)(p22),-3,del(5)(q13q35),-13,-15,add(17)(q25),-18,-20,+21, +mar1,+mar2[cp2]/44-48,idem,add(16)(q24),-21[cp5]/83-100,idemx2,+y,-4,-7, -12,-add(17)(q25),-mar1,-mar2[cp3]" na 1 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 int-2 na na na 9.6 0.8 136 na na 0 na na na na PD6268a 0 1 4 60 CMML 08/05/2003 08/05/2003 20/07/2004 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 7.6 8.6 182 2 na 0 na na na na PD6497a 1 1 4 54 RARS 29/05/2003 01/07/2002 06/04/2007 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 8.7 2.3 335 2 na 30 na na na na PD6269a 0 1 4 76 RA 02/06/2003 08/05/1998 15/04/2006 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6270a 0 1 4 82 CMML 17/06/2003 17/06/2003 07/10/2005 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 6 na 6 11.3 97 9 na 0 na na na na PD6271a 1 1 4 83 RAEB 1 18/06/2003 18/06/2003 04/05/2006 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na na na 11.9 0.8 255 na na 0 na na na na PD6272a 0 1 4 78 CMML 30/06/2003 30/06/2003 26/03/2007 1 0 "45,X,-Y[3]/46,XY[17]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na na na 14.1 2.2 134 3 na 0 na na na na PD6498a 0 1 4 45 RARS 03/07/2003 01/01/1993 na 0 0 "46,XY [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 10.2 0.9 107 1 na 30 na na na na PD6273a 0 1 4 67 CMML 09/07/2003 09/07/2003 29/10/2008 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.8 5.3 80 4 na 0 na na na na PD6499a 0 1 4 76 RARS 17/07/2003 01/09/2002 02/08/2008 1 0 "45,X,-Y[20]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.2 3.4 72 2.5 na 80 na na na na PD6274a 1 1 4 88 RAEB 1 03/09/2003 03/09/2003 25/09/2005 1 0 "46,XX,inv dup(20)(pter->q11.2::q13.3->qter::qter->q13.3::q11.2->pter)[10]" na 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 int-2 na na na 5.9 0.8 159 na na 0 na na na na PD6275a 0 1 4 79 
RAEB 1 03/09/2003 03/09/2003 09/01/2005 1 03/10/2003 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6276a 1 1 4 79 CMML 04/09/2003 09/09/2003 04/03/2004 1 0 "47,XX,+8[4]/46,XX[9]" na 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 10.7 na 36 3 na 0 na na na na PD6277a 1 1 4 na CMML na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6278a 1 1 4 87 RAEB 1 24/11/2003 24/11/2003 21/01/2004 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na na na 6.4 na 52 na na 0 na na na na PD6500a 1 1 4 54 RCMD-RS 27/11/2003 01/07/2003 05/06/2004 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 516 na 8.4 0.9 250 4 na 60 na na na na PD6501a 0 1 4 72 RA 28/01/2004 28/01/2004 09/01/2012 0 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.3 4.1 198 3 na 0 na na na na PD6502a 1 1 4 94 RA 30/01/2004 28/01/2004 01/06/2004 1 0 "44-46,XX,-5,del(6)(q21q23),-16,add(17)(p11),dmin[cp4]/43,idem,add(4)(q21), -7,-dmin[cp4]/42-44,idem,add(4)(q21),-7,del(13)(q21q33)[cp5]/86-88,idemx2, -dmin[cp4]/46,XX[3]" na 0 1 1 0 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 int-2 na na na 5.5 0.5 33 na na 0 na na na na PD6503a 0 1 4 84 RA 16/02/2004 16/02/2004 08/06/2004 1 0 "46,XY,del(20)(q11q13)[6]/46,XY[14]" na 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 low na na na 12.1 2.6 130 1 na 0 na na na na PD6504a 0 1 4 64 RARS 24/03/2004 24/03/2004 08/07/2005 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 285 na 7.7 2.1 131 3 na 20 na na na na PD6505a 1 0 4 82 RA 29/03/2004 29/03/2004 25/03/2012 0 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.4 4.8 260 3 na 0 na na na na PD6279a 0 1 4 58 CMML 20/04/2004 21/04/2004 22/10/2005 1 01/09/2005 1 "46,XY" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 11.1 na 23 3 na 0 na na na na PD6506a 0 1 4 60 RA 30/04/2004 30/04/2004 na 0 0 "46,XY[20]" na 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 9.1 2 78 4 na 0 na na na na PD6507a 0 1 4 75 RA 28/06/2004 28/06/2004 01/08/2008 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 40 na 11.6 5.7 172 1 na 0 na na na na PD6280a 0 1 4 66 RAEB 1 01/07/2004 01/07/2004 20/02/2006 1 01/02/2005 1 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 9.8 0.6 24 10 na 0 na na na na PD6508a 1 0 4 68 RA 01/07/2004 01/07/2004 06/01/2012 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na na na na 0 na NA na na na na PD6281a 0 1 4 69 CMML 05/07/2004 05/07/2004 14/06/2010 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 16.8 2.2 86 4 na 0 na na na na PD6509a 1 1 4 83 RA 07/07/2004 07/07/2004 04/12/2006 1 0 "46,XX,del(5)(q22q33)[19]/46,XX[1]" na 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 low na 775 na 5.6 4.6 280 4 na 0 na na na na PD6282a 0 1 4 65 CMML 21/07/2004 21/07/2004 25/01/2008 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 12 7.9 47 4 na 0 na na na na PD6510a 0 1 4 76 RCMD 19/08/2004 01/03/2004 03/09/2010 1 0 "45,X,-Y[20]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 na na na na 12.5 0.76 134 2 na 0 na na na na PD6283a 1 1 4 87 CMML 30/08/2004 30/08/2004 04/09/2004 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 int-2 na na na 10.5 37.7 74 na na 0 na na na na PD6511a 1 1 4 73 RA 08/09/2004 08/09/2004 06/05/2011 0 0 "46,XX,del(3)(p12p21)[20]" na 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 8.6 3 209 0 na 0 na na na na PD6284a 0 1 4 87 RAEB 1 29/09/2004 28/09/2004 11/12/2004 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na na na 9.1 0.4 66 na na 0 na na na na PD6285a 0 1 4 50 RA 05/10/2004 03/08/2004 25/08/2005 1 02/08/2005 1 "45,XY,der(5)t(5;7)(q11;?),-7,der(13)del(13)(q12q14)t(13;17)(q32;q21),-17,+r(13)[cp25] /46,XY[1] ABNORMALITIES CONFIRMED BY FISH" na 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 int-2 na na na 8.6 4.3 51 na 
na 0 na na na na PD6287a 0 1 4 63 RAEB 1 24/11/2004 24/11/2004 17/01/2006 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na 2412 na 7.6 2.5 80 na na 30 na na na na PD6288a 1 1 4 68 CMML 01/12/2004 01/12/2004 09/11/2005 1 09/05/2005 1 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 na na na 11 4.6 313 na na 0 na na na na PD6289a 0 1 4 74 RAEB 1 10/12/2004 10/12/2004 24/07/2005 1 19/07/2005 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 int-2 na 616 na 8.4 0.9 113 na na 0 na na na na PD6512a 0 0 4 59 RCMD 16/12/2004 01/04/2003 16/12/2004 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na na na na 0 na NA na na na na PD6513a 0 1 4 77 RCMD 13/01/2005 01/09/2004 27/09/2005 1 0 "46,XY[30]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 8 8.9 37 3 na 0 na na na na PD6514a 1 1 4 74 RA 16/02/2005 16/02/2005 08/01/2009 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11 1.7 243 4 na 0 na na na na PD6515a 0 1 4 83 RA 18/04/2005 18/04/2005 23/10/2005 1 0 "45,XY,-7[16]/46,XY[5]" na 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-2 na na na 6.9 1.7 161 na na 0 na na na na PD6516a 1 1 4 na RCMD na na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 na na na na na na na na na na na na na na PD6517a 0 1 4 77 CMML 30/06/2005 30/06/2005 22/03/2007 1 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.3 9.4 665 0 na 0 na na na na PD6518a 0 1 4 74 RCMD-RS 11/07/2005 11/07/2005 19/02/2010 1 0 "45, X,-Y[20]" na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 low na 334 na 8.4 2.4 274 2 na 20 na na na na PD6519a 1 1 4 80 RCMD 19/07/2005 19/07/2005 21/04/2011 0 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 7.8 2.1 152 2 na 0 na na na na PD6520a 0 1 4 66 RA 08/08/2005 08/08/2005 11/05/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na na na na 0 na NA na na na na PD6521a 1 1 4 80 RA 14/09/2005 14/09/2005 10/09/2011 0 0 
"47,XX,+15[15]/46,XX[5]" na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 na na na 10.3 3.4 295 1 na 0 na na na na PD6522a 1 1 4 71 RARS 07/11/2005 07/11/2005 24/02/2011 0 0 46XX na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.1 3.4 287 2 na 50 na na na na PD6523a 1 1 4 85 CMML 17/11/2005 17/11/2005 22/06/2008 1 0 46XX na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.9 5.7 160 2 na 0 na na na na PD6524a 0 1 4 61 RAEB 1 21/11/2005 21/11/2005 na 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 14.6 0.6 31 6 na 0 na na na na PD6525a 1 0 4 78 RA 06/02/2006 06/02/2006 01/05/2006 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 12.2 1.7 60 3 na 0 na na na na PD6526a 1 0 4 85 5q- 08/03/2006 28/02/2005 21/09/2006 1 0 5q- na 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 int-1 na na na na na na 0 na NA na na na na PD6527a 1 1 4 83 CMML 12/04/2006 12/04/2006 03/08/2006 1 0 46XX na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na na 12.1 1.6 95 0 na 0 na na na na PD6528a 1 0 4 79 RA 08/05/2006 08/05/2006 21/03/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10 3.3 277 1 na 0 na na na na PD6529a 1 1 4 72 RCMD 07/06/2006 07/06/2006 07/03/2007 1 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.3 6 374 1 na 0 na na na na PD6530a 1 1 4 82 RCMD 30/08/2006 30/08/2006 04/01/2012 0 0 "46,XX[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 80 na 11.7 1.2 170 1 na 0 na na na na PD6531a 1 0 4 76 RA 13/09/2006 13/08/2006 20/03/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 10.4 7.9 327 1 na 0 na na na na PD6532a 0 1 4 75 CMML 26/10/2006 26/10/2006 09/02/2011 0 0 "45X-Y[22]/46,XY[28] BCR/ABL NORMAL" na 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 low na na na 9.2 6.2 473 4 na 0 na na na na PD6533a 1 0 4 66 RCMD 02/06/2008 02/06/2008 16/12/2010 0 0 01:07 na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 int-1 na 383 na 11.9 1.7 110 1 na 0 na na na na 
PD6534a 0 1 4 78 RCMD 03/07/2008 03/07/2008 10/11/2009 1 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 85 na 11.1 7.8 9 1 na 0 na na na na PD6535a 0 0 4 80 RA 24/09/2008 24/09/2008 18/10/2009 1 24/08/2009 1 "44,XY,-5,del(7)(q22q36),-12,add(13)(q13),add(15)(p11),-20,+mar1[7]/44,idem,-13,+mar2[4]" na 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 0 int-1 na na na 8.8 4 122 4 na 0 na na na na PD6536a 1 1 4 73 RARS 08/12/2008 08/12/2008 22/04/2011 0 0 "46,XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.6 6.3 396 1 na 0 na na na na PD6537a 0 1 4 74 CMML 09/12/2008 09/12/2008 25/04/2011 0 0 "46,XY[20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 91 na 9.8 1.5 386 3 na 0 na na na na PD6538a 1 1 4 71 RARS 06/01/2009 06/01/2009 10/05/2011 0 0 "46,XX [20]" na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 930 na 10.2 4.7 287 2 na 30 na na na na PD6539a 1 1 4 70 RCMD 02/02/2009 02/02/2009 25/04/2011 0 0 46XY na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 49 na 10.4 2.7 na 4 na 0 na na na na PD6540a 1 1 4 87 RARS 11/03/2009 11/03/2009 09/03/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.1 6.4 378 0 na 0 na na na na PD6541a 0 2 4 80 RCMD 04/08/2009 04/08/2009 24/04/2010 1 0 "46,X,t(Y;15)(q11;q11.2),-5,del(7)(q21q22),t(10;11)(p12;q22),del(13)(q14q22),+r1[4]/47,idem,+8,del(12)(p11.2p12),-r1,+2[9]/46,XY[14]" na 0 1 1 1 0 0 0 0 0 0 1 1 na na na na na na na na na na na 156 na 7.9 2.2 138 0 na 0 na na na na PD6542a 1 2 4 59 RAEB 1 05/03/2010 05/03/2010 29/03/2011 1 24/01/2011 1 "47,XX+8[14]/46,XX[4]" na 0 0 0 1 0 0 0 0 0 0 0 na 0 0 0 0 0 0 0 0 0 high na na na 6.9 5.8 96 12 na 0 na na na na PD6543a 0 1 4 55 RCMD 22/06/2010 22/06/2010 20/01/2012 0 0 "45,XY,-7[8],46,XY[4]" na 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 int-1 na 388 na 10.9 0.7 152 1 na 0 na na na na PD6544a 0 1 4 81 RA 12/08/2010 12/08/2010 09/05/2011 0 0 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 451 na 8.6 1.7 214 2 na 0 na na na na 
PD6545a 0 1 4 61 CMML 16/11/2010 01/05/2010 13/01/2011 1 15/11/2010 1 na na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na na na 12.2 38.2 46 0 na 0 na na na na PD6546a 0 0 4 79 RA 17/11/2010 17/11/2010 17/03/2011 0 0 "46,XY,del(20)[6]/46,XY[14]" na 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.4 5.4 72 0 na NA na na na na PD5714a 1 1 5 na RCMD 08/03/2010 08/03/2010 na 0 1 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 235 na 10.8 4.3 62 na na na na na 1 0 PD5715a 1 1 5 na RCMD 27/04/2010 29/04/2010 na 0 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 449 na 8.9 2 105 na na na na na 1 0 PD5716a 1 1 5 70 RCMD 04/05/2010 04/05/2010 na 0 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na 316 na 9.4 0.17 154 na na na na na 1 0.5 PD5764a 0 0 5 69 MDS-U 18/02/2008 18/02/2008 na 1 0 na Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 na 4037 na 4.9 1.9 2 0 na na 1 0 na 0.5 PD5731a 0 0 5 66 RCMD 20/02/2008 10/02/2009 na 1 0 na 10 0 0 0 0 0 na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na 77 na 7.8 3.2 84 5-10% na na na na 1 0.5 PD5773a 0 1 5 69 RT 19/03/2008 27/03/2008 na 0 0 na Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na 63 na 12.6 2.9 85 2 na na 1 0 na 0 PD5738a 1 1 5 83 RCMD 22/05/2008 26/08/2008 na 0 NA na Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 2 na na na 11 1.12 57 5 na na 1 0.5 1 0.5 PD5739a 0 0 5 na RCMD 25/06/2008 26/08/2008 na 1 0 na Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 2 na na na 9.4 1.1 155 5 na na 1 0.5 1 0.5 PD5734a 1 1 5 na RCMD 02/07/2008 na na 1 0 na Normal 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na na na 10.7 1.2 354 3 na na 0 0 1 0 PD5736a 1 1 5 87 RCMD 12/08/2008 12/08/2008 na 1 0 na Normal 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na na na 9.3 2.02 137 0 na na 0 0 1 0 PD5746a 0 1 5 84 RAEB 1 25/09/2008 24/09/2008 na 1 1 na 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na 432 na 
9.6 0.76 60 8 na na na 0.5 2 0.5 PD5753a 1 2 5 74 RAEB 2 08/10/2008 na na 1 0 na 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 3 na na na 11.7 44 93 15 na na na 1.5 3 0 PD5719a 1 0 5 na RCMD 08/10/2008 16/10/2008 na 1 0 na Normal 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 42 na 11.1 7.1 78 2 na na 0 0 1 0 PD5752a na 2 5 na RAEB 2 08/10/2008 16/10/2008 na 1 1 na Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 high 4 na na na 10.6 0.46 146 25 na na 1 2 3 0 PD5729a 0 0 5 na RCMD 02/12/2008 11/12/2008 na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na 282 na 7 2.8 135 <5% na na na na 1 0 PD5775a 0 0 5 na RA 04/02/2009 04/02/2009 na 0 0 Failed/normal Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na 134 na 9.6 2.7 198 0 na na 1 0 na 0 PD5763a 0 0 5 na MDS-U 18/02/2009 18/02/2009 na 1 0 na Complex karyotype. Failed/normal 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na na na 10.8 2.5 148 3 na na 1 0 na 0 PD5759a 1 1 5 na MDS-U 07/05/2009 06/05/2009 na 0 0 "_46,XX[20]" Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 123 na 10 5.1 291 na na na 0 na na 0 PD5760a 0 1 5 78 MDS-U 10/06/2009 na na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low na na na 15.3 21.5 73 <5% na na na na na 0 PD5761a 1 0 5 na MDS-U 07/10/2009 na na 0 0 "_46, XY" Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na na na 14.2 3.1 60 na na na 1 na na 0 PD5713a 1 0 5 78 RCMD 25/01/2010 na na 0 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 99 na 13.4 4.1 63 na na na na na 1 0 PD5776a 1 1 5 na RA 21/02/2008 28/05/2003 na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low na 22 na 9.2 2.9 294 <5% na na na na na 0 PD5758a 0 1 5 na RAEB 2 07/03/2008 18/10/2007 na 1 1 46XY [20] Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 3 na na na 11.5 0.5 269 11 na na 0 na 3 0 PD5765a 1 1 5 na MDS-U 14/07/2008 30/06/2008 na 1 1 _46XX[2} Normal 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 low na na na na na na 16 na na 0 na na na PD5747a 1 1 5 85 RAEB 1 21/07/2008 21/07/2008 na 1 1 46XX[20] Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na 566 na 10.5 3.1 149 7 na na 0 0.5 2 0 PD5735a 1 1 5 79 RCMD 04/08/2008 04/08/2008 na 1 0 46XX[20] Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 83 na 8.7 1.9 153 4 na na 0 0 1 0 PD5754a 1 1 5 na RAEB 2 07/09/2009 07/01/2009 na 1 1 47XX+8[12]/46XX[8] na 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 3 na na na 8.6 1.7 178 19 na na na 1.5 3 0 PD5757a 0 1 5 na RAEB 2 18/03/2009 18/03/2009 na 1 1 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-2 3 na na na 10.2 1.6 84 14 na na 0 1.5 3 0.5 PD5781a 0 1 5 na RA 01/04/2009 01/04/2009 na 1 0 47XY+846XY Trisomy 8 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 248 na 10 3.4 269 0.5 na na 0.5 0 na 0 PD5741a 1 0 5 na RAEB 1 03/11/2010 12/07/2010 na 1 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 2 na 32 na 10.5 0.5 na <5% na na na na 2 0 PD5756a 0 1 5 73 RAEB 2 06/10/2008 21/10/2008 na 0 0 Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 3 na na na 9.2 4.2 90 10 na na na 0.5 3 0.5 PD5742a 1 1 5 na RAEB 1 28/07/2010 na na na NA na na na na na na na na na na na na na na 0 1 0 0 0 0 0 0 1 int-1 2 na na na 7.9 1.5 91 5-10% na na na na 2 0.5 PD5749a 0 1 5 na RAEB 1 16/10/2008 16/10/2008 09/03/2009 1 1 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na 1388 na 8.7 7.7 247 7 na na 0 0.5 2 0 PD5777a 0 1 5 na RA 20/11/2008 20/11/2008 na 1 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9 3.4 159 3 na na 0 0 na 0 PD5774a 0 1 5 na RA 15/12/2008 15/12/2008 na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 14.2 3.8 66 <5% na na 0 na na 0 PD5755a 0 1 5 na RAEB 2 09/02/2009 09/02/2009 19/9/2009 1 0 multiple chromosomal abnormalities Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 4 na 196 na 9 2.6 28 na na na 1 na 3 0.5 PD5782a 0 1 5 83 RA 
16/10/2008 16/10/2008 na na 0 46XY[20] Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na 233 na 11.3 3.83 142 3 na na 1 na na 0 PD5727a 1 1 5 82 RCMD 06/11/2008 12/11/2008 na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na 50 na 14.7 4.94 44 <5% na na na na 1 0 PD5788a 1 2 5 52 MDSMPN 12/11/2008 12/11/2008 na 0 0 46XX [20] Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 na na na na 12.5 5.06 679 na na na 1 na na 0 PD5728a 1 1 5 85 RCMD 13/11/2008 12/11/2008 na 0 0 46XX Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 2 na na na 10.6 0.99 125 na na na 1 na 1 0 PD5711a 0 1 5 73 RCMD 14/07/2009 14/07/2009 05/03/2012 0 0 """46,XY [20]""" Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 2 na 76 na 11.3 1.5 71 2 na na 1 0 1 0.5 PD6339a 0 1 5 na RARS na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD5712a 0 0 5 79 RCMD 30/09/2009 30/09/2009 na na 0 "Lacking Y chromosome, 80% abnormal karyotype" Complex/other karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 2 na 147 na 10 0.63 101 0 na na 1 0 1 0 PD5770a 0 0 5 70 RT 01/10/2009 01/10/2009 na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 12.8 2.5 59 1 na na 0 0 na 0 PD5771a 1 0 5 85 RA 14/10/2009 14/10/2009 na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 96 na 8.7 2.65 289 <5% na na 0 na na 0 PD5769a 1 1 5 na RA 30/09/2009 01/10/2009 na 1 0 46 XX (20) Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-1 na 4723 na 9.1 7.1 157 3 na na 1 0 na 0 PD5780a 1 1 5 na RA 06/12/2007 08/01/2008 na 0 0 na Failed but probed for 5q and negative 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na 271 na 9.2 4.3 305 1 na na na 0 na 0 PD6337a 0 2 5 na RARS na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD5748a 0 1 5 64 RAEB 1 01/08/2008 
30/07/2008 na 1 NA Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na 356 na 9.7 0.31 35 5.5 na na na 0.5 2 0.5 PD5725a 0 1 5 62 RCMD 19/08/2008 26/08/2008 na na 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na na na 9.5 5.49 8 0 na na 0 0 1 0.5 PD5724a 0 1 5 95 RCMD 21/08/2008 27/08/2008 na na NA missing Y as sole abnormality na 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na na na 8.7 1.1 87 0 na na na 0 1 0.5 PD5745a 0 1 5 80 RAEB 1 28/08/2008 01/09/2008 na na NA na Complex w abnormalities in ch 5 an 7 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 int-2 3 na na na 9.7 2.27 65 9.2 na na 1 0.5 2 0.5 PD5726a 1 1 5 82 RCMD 04/09/2008 11/09/2008 na na NA Normal na 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na na na 11.5 7.61 194 na na na na na 1 0 PD5787a 0 2 5 na CMML 27/10/2008 27/10/2008 na 0 0 na Trisomy 8 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 na na 165 na 9.6 6.36 255 0.4 na na 0.5 na na 0 PD5718a 0 1 5 65 RCMD 02/10/2007 02/10/2007 na 0 NA Normal Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na na 1 11.2 1.66 52 na 11.2 1.66 0 0 1 2 PD5720a 1 1 5 40 RCMD 04/10/2007 04/10/2007 na na NA "46,XX[20]" "46,XX[20]" 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na na 12.5 2.43 48 na na na na na 1 0 PD5778a 0 1 5 78 RA 25/02/2008 25/02/2008 05/03/2012 0 0 cytogenetics - normal Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 0 na 0 11.5 1.98 142 3 11.5 1.98 0 3 na 0 PD5732a 0 1 5 66 RCMD 03/03/2008 18/04/2007 16/04/2009 1 14/04/2009 1 cytogenetics - normal Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na na 1 12.9 1.76 72 2 12.9 1.76 na 2 1 2 PD5722a 0 1 5 71 RCMD 09/01/2009 09/01/2009 05/03/2012 0 NA cytogenetics - normal Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 na 1 12.7 1.68 66 na 12.7 1.68 0 0 1 2 PD5744a 0 1 5 80 RAEB 1 09/02/2009 03/04/2007 23/07/2010 1 0 cytogenetics - normal Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 0 na 1 12.5 0.49 118 na 12.5 0.49 na 5 na na PD5710a 0 
0 5 na RCMD 18/05/2009 na 2/2/2010 1 NA "Cytogenetics failed. FISH for abnormalities of chromosome 5 and 7. No evidence of monosomy 5 or 7, or deletion of 5q or 7q seen." Failed na 0 0 na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na na na 10.4 0.26 105 2 na na na 0 1 0 PD5768a 0 1 5 75 RA 29/09/2009 29/09/2009 15/02/2010 1 19/01/2010 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low 1 na 1 na na na 4 10.4 0.17 na 4 na na PD5772a 0 0 5 70 RA 10/03/2010 10/03/2010 14/04/2012 1 0 "46,XY[20]" Complex 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 int-2 na na 1 9.2 4.1 309 na 9.2 4.1 1 na na 1 PD5733a 0 1 5 73 RCMD 23/11/2007 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na na na na na na <5% na na na na 1 na PD5779a 1 1 5 78 RA 29/11/2007 na na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low na na na 8.7 6.1 519 0 na na na na na 0 PD5783a 0 2 5 na RA 23/04/2008 na na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 9.7 6.8 238 0 na na 0 0 na 0 PD5750a 0 1 5 80 RAEB 1 28/05/2008 na na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na na na 10.1 1.3 162 8 na na 0 0.5 2 0 PD5766a 0 0 5 na MDS-U 08/08/2008 na na 0 0 na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 13.6 3.8 37 0 na na 0 0 na 0 PD5723a 1 0 5 na RCMD 21/08/2008 na na na NA na Failed na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na na na 13.3 10.1 62 0 na na na 0 1 0 PD5730a 1 0 5 na RCMD 08/12/2008 na na na NA na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low 1 na na na 12 0.6 112 <5% na na na na 1 0 PD6338a 0 1 5 na RARS na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na na PD5767a 0 0 5 na MDS-U 26/03/2009 26/03/2009 na 0 0 na "Failed, not done" na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 low na na na 12.7 1.3 258 0 na na na 0 na 0 PD5789a 1 1 5 na MDSMPN 26/08/2009 27/10/2009 na 1 0 
"""47,XX,+19 [10]""" Complex karyotype 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 na na 371 na 7.5 15.6 328 na na na 1 na na 0 PD5751a 0 1 5 na RAEB 2 05/10/2009 16/11/2009 na 0 0 46XY[24] Complex karyotype 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 high 4 na 420 na 11 2 39 17 na na 1 1.5 3 0 PD5717a 0 1 5 na RCMD 06/08/2007 06/08/2007 22/04/2008 1 NA del (20) (q11q13) [7/20] na 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 low 1 na 123 na 11 3.18 57 na na na na na 1 0 PD5784a 0 2 5 47 RT 12/05/2008 12/05/2008 05/03/2012 0 0 47XYY [30] na 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 int-1 na 174 na 12.7 1.63 74 0 na na na 0 na 0.5 PD5786a 0 1 5 54 CMML 01/09/2008 01/09/2008 05/03/2012 0 0 46XY [20] Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 na 107 na 14 1.38 233 6 na na 0 0.5 na 0 PD5721a 0 1 5 80 RCMD 10/11/2008 10/11/2008 24/10/2010 1 1 46XY Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 1 na 979 na 12.8 1.25 73 3 na na 0 0 1 0.5 PD5743a 0 1 5 78 RAEB 1 22/01/2009 22/01/2009 na 0 NA na Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 int-1 2 na 84 na 13.2 1.49 61 2 na na 0 0 2 0.5 PD5737a 0 1 5 81 RCMD 06/04/2009 20/04/2009 29/04/2009 1 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 int-1 1 na na na 11.1 1.05 41 5-10% na na na na 1 0.5 PD5740a 0 1 5 na RAEB 1 21/04/2009 11/07/2009 26/09/2009 1 1 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 int-1 2 na na na 8.3 2.89 16 5-10% na na na na 2 0.5 PD5785a 0 1 5 na CMML 09/09/2009 na na 0 0 46XY Normal 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 low na na na 11.3 8 168 4.6 na na 0 0 na 0 PD5762a 0 1 5 72 MDS-U 28/09/2010 na na 0 0 na na na na na na na na na na na na na na 0 0 0 0 0 0 0 0 0 int-1 na na na 7.7 1.3 93 5-10% na na na na na 0.5 \ No newline at end of file diff --git a/_articles/RJ-2024-002/figures/Rlogo-5.png b/_articles/RJ-2024-002/figures/Rlogo-5.png new file mode 100644 index 0000000000..077505788a Binary files /dev/null and 
b/_articles/RJ-2024-002/figures/Rlogo-5.png differ diff --git a/_articles/RJ-2024-002/figures/coef_plots.pdf b/_articles/RJ-2024-002/figures/coef_plots.pdf new file mode 100644 index 0000000000..ae22023d3e Binary files /dev/null and b/_articles/RJ-2024-002/figures/coef_plots.pdf differ diff --git a/_articles/RJ-2024-002/figures/coef_plots.png b/_articles/RJ-2024-002/figures/coef_plots.png new file mode 100644 index 0000000000..eb06c62756 Binary files /dev/null and b/_articles/RJ-2024-002/figures/coef_plots.png differ diff --git a/_articles/RJ-2024-002/figures/data_summary_figs2.pdf b/_articles/RJ-2024-002/figures/data_summary_figs2.pdf new file mode 100644 index 0000000000..c7c611a8f9 Binary files /dev/null and b/_articles/RJ-2024-002/figures/data_summary_figs2.pdf differ diff --git a/_articles/RJ-2024-002/figures/data_summary_figs2.png b/_articles/RJ-2024-002/figures/data_summary_figs2.png new file mode 100644 index 0000000000..3d1e1a925d Binary files /dev/null and b/_articles/RJ-2024-002/figures/data_summary_figs2.png differ diff --git a/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.pdf b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.pdf new file mode 100644 index 0000000000..39cf7ebd12 Binary files /dev/null and b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.pdf differ diff --git a/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.png b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.png new file mode 100644 index 0000000000..75cdf29709 Binary files /dev/null and b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_1000patients.png differ diff --git a/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.pdf b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.pdf new file mode 100644 index 0000000000..ce9ca3c51d Binary files /dev/null and 
b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.pdf differ diff --git a/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.png b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.png new file mode 100644 index 0000000000..95ff1abf44 Binary files /dev/null and b/_articles/RJ-2024-002/figures/estimator_performance_boxplots_100patients.png differ diff --git a/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.pdf b/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.pdf new file mode 100644 index 0000000000..95267bfa77 Binary files /dev/null and b/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.pdf differ diff --git a/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.png b/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.png new file mode 100644 index 0000000000..4a5d062aa3 Binary files /dev/null and b/_articles/RJ-2024-002/figures/mssample_and_probtrans_fft.png differ diff --git a/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.pdf b/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.pdf new file mode 100644 index 0000000000..88160ed727 Binary files /dev/null and b/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.pdf differ diff --git a/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.png b/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.png new file mode 100644 index 0000000000..b374072256 Binary files /dev/null and b/_articles/RJ-2024-002/figures/na_props_1000patients_coxph.png differ diff --git a/_articles/RJ-2024-002/figures/na_props_100patients_coxph.pdf b/_articles/RJ-2024-002/figures/na_props_100patients_coxph.pdf new file mode 100644 index 0000000000..f3914b2fb8 Binary files /dev/null and b/_articles/RJ-2024-002/figures/na_props_100patients_coxph.pdf differ diff --git a/_articles/RJ-2024-002/figures/na_props_100patients_coxph.png b/_articles/RJ-2024-002/figures/na_props_100patients_coxph.png new file mode 
100644 index 0000000000..6b20c970e4 Binary files /dev/null and b/_articles/RJ-2024-002/figures/na_props_100patients_coxph.png differ diff --git a/_articles/RJ-2024-002/figures/package_summary_figure.pdf b/_articles/RJ-2024-002/figures/package_summary_figure.pdf new file mode 100644 index 0000000000..b9d7a0e1a3 Binary files /dev/null and b/_articles/RJ-2024-002/figures/package_summary_figure.pdf differ diff --git a/_articles/RJ-2024-002/figures/package_summary_figure.png b/_articles/RJ-2024-002/figures/package_summary_figure.png new file mode 100644 index 0000000000..e39bbe27eb Binary files /dev/null and b/_articles/RJ-2024-002/figures/package_summary_figure.png differ diff --git a/_articles/RJ-2024-002/figures/patient78_cumhaz_final.png b/_articles/RJ-2024-002/figures/patient78_cumhaz_final.png new file mode 100644 index 0000000000..f6dd1b3910 Binary files /dev/null and b/_articles/RJ-2024-002/figures/patient78_cumhaz_final.png differ diff --git a/_articles/RJ-2024-002/figures/patient78_transProbs_final.png b/_articles/RJ-2024-002/figures/patient78_transProbs_final.png new file mode 100644 index 0000000000..3c0fa7ae95 Binary files /dev/null and b/_articles/RJ-2024-002/figures/patient78_transProbs_final.png differ diff --git a/_articles/RJ-2024-002/figures/transition_structures.pdf b/_articles/RJ-2024-002/figures/transition_structures.pdf new file mode 100644 index 0000000000..b9de9b833c Binary files /dev/null and b/_articles/RJ-2024-002/figures/transition_structures.pdf differ diff --git a/_articles/RJ-2024-002/figures/transition_structures.png b/_articles/RJ-2024-002/figures/transition_structures.png new file mode 100644 index 0000000000..b63f3398d3 Binary files /dev/null and b/_articles/RJ-2024-002/figures/transition_structures.png differ diff --git a/_articles/RJ-2024-002/figures/workflow0.pdf b/_articles/RJ-2024-002/figures/workflow0.pdf new file mode 100644 index 0000000000..7e5dfc2cd1 Binary files /dev/null and b/_articles/RJ-2024-002/figures/workflow0.pdf 
differ diff --git a/_articles/RJ-2024-002/figures/workflow0.png b/_articles/RJ-2024-002/figures/workflow0.png new file mode 100644 index 0000000000..56c954c422 Binary files /dev/null and b/_articles/RJ-2024-002/figures/workflow0.png differ diff --git a/_articles/RJ-2024-002/scripts/ESM_1.Rmd b/_articles/RJ-2024-002/scripts/ESM_1.Rmd new file mode 100644 index 0000000000..cebd039ad4 --- /dev/null +++ b/_articles/RJ-2024-002/scripts/ESM_1.Rmd @@ -0,0 +1,1483 @@ +--- +title: "Supplementary Notes" +author: +- Rui J Costa^[European Bioinformatics Institute (EMBL-EBI), ruibarrigana@hotmail.com] +- Moritz Gerstung^[Genome Biology Unit, EMBL] ^[German Cancer Research Center (DKFZ)] +toc-title: Contents +header-includes: \usepackage{amsmath,amsfonts,amssymb,amsthm,verbatim} +output: + bookdown::html_document2: + number_sections: yes + toc: yes + toc_depth: 1 + word_document: + toc: yes + toc_depth: '4' + pdf_document: + toc: yes + toc_depth: 1 +subtitle: + This document is part of the supplementary material to Costa, + R. J., Gerstung, M. (2024), ebmstate -- an R package For Disease Progression Analysis Under Empirical Bayes Cox Models, *The R Journal*. +--- + + + +```{r setup, include=FALSE} + knitr::opts_chunk$set(fig.width = 8) +# ,tidy.opts=list(width.cutoff=60), tidy=TRUE + +``` + + +# Consistency of the estimator in $\texttt{ebmstate::CoxRFX}$ +This section shows the results of a simulation study to assess the consistency of the estimator in $\texttt{CoxRFX}$ (an estimator of the regression coefficients in a Cox model). + +In the following grid of plots, each column of the grid is based on a different batch of 500 data sets simulated from the same illness-death model. This model is a clock-reset Cox model with 10 regression coefficients for each transition (one for each of 10 covariates), and a Gompertz baseline hazard. The (prior) distribution of the parameters in each transition was assumed to be Gaussian (with undetermined mean and variance). 
The only difference between batches/columns is the number of observations (i.e. patients) per data set: +250 observations in the first column, 1000 in the second, 2000 in the third and 4000 in the fourth. + +The top row plots compare the true coefficient values with the mean estimate over the 500 data sets in a batch; they give strong indication that the bias vanishes as the sample size increases. At the same time, the distribution of the estimator is concentrated inside increasingly smaller neighbourhoods around the true parameter value. This is strongly suggested by the middle row plots, which show a vanishing sample variance for each individual parameter, and also the bottom row ones, which show the empirical density of the sum of squared errors becoming concentrated in smaller and smaller neighbourhoods around zero. + +\ + +```{r eval=TRUE, echo=FALSE, fig.cap="consistency of the CoxRFX estimator."} + +file_paths<-c("../data/coxph_vs_coxrfx_sim_illness_death_250obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_1000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_2000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata") + + +par(mfcol=c(3,4),mar=c(3.1, 4.1, 0.2, 1.1),cex.lab=0.7,mgp=c(3,0.8,0),oma=c(2,0,2.7,0)) + +for(i in 1:length(file_paths)){ +#mse comparison +load(file_paths[i]) +file1<-file_paths[i] +#reorder matrix of estimates +coefficient_estimates<-coefficient_estimates[,c(seq(1,ncol(coefficient_estimates),3),seq(2,ncol(coefficient_estimates),3),seq(3,ncol(coefficient_estimates),3))] + + +#errors +errors_coxrfx<-t(coefficient_estimates)-param + +#sse +sse_coxrfx<-apply(errors_coxrfx^2,2,sum,na.rm=T) + +file2<-strsplit(file1,"/data/")[[1]][2] +file3<-strsplit(file2,".Rdata") + +plot(0,type="n",xlim=c(-0.5,1),ylim=c(-0.5,1),cex.axis=0.8,las=1,ylab="",xlab="") +points(param,apply(coefficient_estimates,2,mean,na.rm=T),cex=0.8,col="red") +mtext("true parameter",side = 1,line = 2,cex=0.5) +mtext("average point estimate",side = 2,line 
= 2,cex=0.5) +if(i==1){ + mtext(c("250 obs.","1000 obs.","2000 obs.","4000 obs."),outer = TRUE,at =c(0.15,0.4,0.65,0.9),line = 1.5 ) +} +abline(a=0,b=1) + +var_coef_estimates<-apply(coefficient_estimates,2,var,na.rm=T) + +barplot(var_coef_estimates,col=c("red"),beside=T,las=1,ylim=c(0,0.18),ylab = "sample variance",cex.lab=0.7,mgp=c(3,1,0),cex.axis = 0.8) +mtext("regression coefficient",side = 1,line = 0.7,cex=0.5) + +plot(density(sse_coxrfx,from = 0,kernel = "gaussian",na.rm = T),main="",ylab ="",xlab="",xlim=c(0,3.5),ylim=c(0,7.5),col="red",lty=1,las=1,cex.axis=0.8) +mtext("sum of squared errors",side = 1,line = 2,cex=0.5) +mtext("estimated density",side = 2,line = 2,cex=0.5) + +} + +``` + +\ + +## Code to perform the simulation {-} + +\ +Set the parameters of the simulation +```{r eval=FALSE,echo=TRUE} +set.seed(20078) +library(mvtnorm) +library(ebmstate) + +n<-4000 # number of patients +covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition) +nGroups<-1 #number of groups per transition +nParam<-3*length(covariate_names) #total number of parameters (regression coefficients) +nr_simulated_data_sets<-500 +param<-runif(n=nParam,min = -0.5,max = 1) #simulation of parameters +file1<-"../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata" + +``` + +Generate data by simulation +```{r eval=FALSE,echo=TRUE} +coefficient_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names)) +mu_estimates<-matrix(nrow = nr_simulated_data_sets,ncol=3*nGroups) +sigma2_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*nGroups) + +coefficient_estimates_coxph<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names)) + + +for (j in 1:nr_simulated_data_sets){ + #covariates + if(length(covariate_names)>1){ + covariate_matrix<-t(sapply(rep(length(covariate_names),n),function(x) rbinom(n=x,size = 1,prob = 0.5))) + }else{ + covariate_matrix<-matrix(rbinom(n,size = 1,prob = 0.5),ncol=1) + } + + 
colnames(covariate_matrix)<-covariate_names + + #relative risks (relative hazards) + rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)]) + rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)]) + rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)]) + + #Generate a transition history for each patient. Clock-reset Cox model. Baseline hazard is Gompertz for all transitions. + + m<-matrix(c(flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2) + v1<-apply(m,1,which.min) + m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1) + m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,flexsurv::rgompertz(1,shape = 0.15,rate = rel.risk_trans3[x]*exp(-5.5)),NA))) + m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T)) + m<-cbind(m,rexp(n,0.03)) + m<-cbind(m,(m[,5]m$state1_duration[i],m$transition[i]==2 & m$cens_time[i]>m$state1_duration[i])) + mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + if(status[1]==1){ + id<-i + from<-2 + to<-4 + trans<-3 + Tstart<-Tstop[1] + Tstop<-min(m$state1_duration[i]+m$state2_duration[i],m$cens_time[i]) + time<-Tstop-Tstart + status<-as.numeric(m$state1_duration[i]+m$state2_duration[i]t_{k}-u|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n \right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&=\int_{0}^{t_{k}} \exp\left[-\Lambda_{n}\left(t_{k}-u\right)\right]\, f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&\approx \sum_{l=0}^{k-1}r_{n}\left(k-l-1\right)q_{0n}\left(l\right)\quad. 
+\end{align*} + + + +# Testing $\texttt{probtrans_ebmstate}$ +The following plot shows that `probtrans_ebmstate` can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by `probtrans_ebmstate`, when the true (Gompertz) cumulative hazards are given to it as input. +\ + + +```{r include=FALSE} +library(ebmstate) +load("../data/testing_probtrans_ebmstate.Rdata") +``` + +```{r eval=TRUE,echo=FALSE, fig.cap="accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate."} + +plot(probtrans_object,legend = c("","","",""),lwd = 2) +lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3) +lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3) +lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3) +lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3) +text(10,0.3,"health") +text(30,0.4,"illness") +text(74,0.07,"death") +text(67,0.6,"death_after_illness") +legend("bottomleft", legend = c("using probtrans_ebmstate","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red")) + +``` + +\ +Code used to generate the previous plot. 
+ +```{r eval=FALSE,echo=TRUE} +# Load packages and set working directory +set.seed(9873) +library(ebmstate) +library(flexsurv) + +#generate a vector of covariates for the patient +nCovs<-50 +covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1) + +#compute the relative risk (better said: relative hazard) of the patient for each transition +param<-runif(n=150,min = -0.5,max = 0.5) +rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)])) +rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)])) +rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)])) + +#generate 100,000 uncensored observations for the same patient +n<-100000 +m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2) +v1<-apply(m,1,which.min) +m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1) +m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA))) +m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T)) +colnames(m)<-c("state1_duration","transition","state2_duration","total_time") +m<-as.data.frame(m) + + +#Build a function that computes relative frequencies of each state at some time t +rel_freq<-function(state,t){ + if(state==1){ + sum(m[,1]>t)/nrow(m) + }else if(state==2){ + sum(m[,1]t)/nrow(m) + }else if(state==3){ + sum(m[,1]t)/nrow(m) + }else if(state==2){ + sum(m[,1]t)/nrow(m) + }else if(state==3){ + sum(m[,1]m$state1_duration[i]) + mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart, + Tstop=Tstop,time=time,status=status)) + if(status==1){ + id<-i + from<-2 + to<-3 + trans<-2 + Tstart<-Tstop + Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i]) + time<-Tstop-Tstart + status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i]) + mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart, + 
Tstop=Tstop,time=time,status=status)) + if(status==1){ + id<-i + from<-3 + to<-4 + trans<-3 + Tstart<-Tstop + Tstop<-min(m$total_time[i],m$cens_time[i]) + time<-Tstop-Tstart + status<-as.numeric(m$total_time[i]m[,1]&t<=m[,1]+m[,2])/nrow(m) + }else if(state=="state3"){ + sum(t>m[,1]+m[,2]&t<=m[,1]+m[,2]+m[,3])/nrow(m) + }else if(state=="state4"){ + sum(t>m[,1]+m[,2]+m[,3])/nrow(m) + } +} + + +simfun<-function(j){ + param<-param_fun(nParam) + marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3) + + true_rh_fun<-function(trans){ + exp(covariate_vector%*%param[(1+length(covariate_names)*(trans-1)):(length(covariate_names)*trans)]) + } + + rh_fun_coxrfx<-function(trans){ + exp(covariate_vector%*%coxrfx_object$coefficients[seq(trans,length(coxrfx_object$coefficients),nTrans)]) + } + + rh_fun_coxph<-function(trans){ + exp(covariate_vector%*%coxph_object$coefficients[seq(trans,length(coxph_object$coefficients),nTrans)]) + } + + covariate_matrix<-rmvbin(n,margprob = marg_probs) + + colnames(covariate_matrix)<-covariate_names + + + #relative risks (relative hazards) + rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)]) + rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)]) + rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)]) + + + #Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions. 
+ + m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3) + m<-cbind(m,apply(m,1,sum,na.rm=T)) + m<-cbind(m,rexp(n,0.008)) + m<-cbind(m,(m[,5]m$state1_duration[i]) + mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + if(status==1){ + id<-i + from<-2 + to<-3 + trans<-2 + Tstart<-Tstop + Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i]) + time<-Tstop-Tstart + status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i]) + mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + if(status==1){ + id<-i + from<-3 + to<-4 + trans<-3 + Tstart<-Tstop + Tstop<-min(m$total_time[i],m$cens_time[i]) + time<-Tstop-Tstart + status<-as.numeric(m$total_time[i]1) + out[,invalid_estimates]<-NA + out + } + +} + +plot_abs3<-function(simfun_object){ + list1<-sapply(names(simfun_object[[1]]),unwrap_fun,simfun_object=simfun_object,USE.NAMES = TRUE) + names(list1)<-names(simfun_object[[1]]) + list1$nr_simulated_data_sets<-length(simfun_object) + param_order_fun<-function(nTrans,nParam){ + as.vector(sapply(1:(nParam/nTrans),function(x) seq(x,nParam,nParam/nTrans))) + } + param_order<-param_order_fun(list1$nTrans[1],list1$nParam[1]) + list1$param<-list1$param[param_order,] + + abs_errors_coef0<-abs(list1$param) + abs_errors_coef_coxph<-abs(list1$coxph_coefficients-list1$param) + abs_errors_coef_coxrfx<-abs(list1$coxrfx_coefficients-list1$param) + + abs_errors_rel_risk_coxph<-abs(list1$rel_risk_estimates_coxph-list1$true_rel_risk_sampled_patient) + abs_errors_rel_risk_coxrfx<-abs(list1$rel_risk_estimates_coxrfx-list1$true_rel_risk_sampled_patient) + abs_errors_rel_risk0<-abs(1-list1$true_rel_risk_sampled_patient) + + 
abs_errors_pred_coxph<-abs(list1$state_occup_estimates_coxph-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),]) + abs_errors_pred_coxrfx<-abs(list1$state_occup_estimates_coxrfx-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),]) + abs_errors_pred0<-abs(list1$state_occup_estimates0-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),]) + + out<-list(coef_errors=list(coxph=abs_errors_coef_coxph + ,coxrfx=abs_errors_coef_coxrfx + ,null=abs_errors_coef0 + ) + ,rel_risk_errors=list(coxph=abs_errors_rel_risk_coxph + ,coxrfx=abs_errors_rel_risk_coxrfx + ,null=abs_errors_rel_risk0) + ,pred_errors=list(coxph=abs_errors_pred_coxph + ,coxrfx=abs_errors_pred_coxrfx + ,null=abs_errors_pred0 + ) + ,true_values=list(coef=list1$param + ,rel_risk=list1$true_rel_risk_sampled_patient + ,pred=list1$true_probs_matrix + ) + ,pred_estimates=list(coxph=list1$state_occup_estimates_coxph + ,coxrfx=list1$state_occup_estimates_coxrfx + ,null=list1$state_occup_estimates0 + ) + ,coef_estimates=list(coxph=list1$coxph_coefficients + ,coxrfx=list1$coxrfx_coefficients + ) + ,rel_risk_estimates=list(coxph=list1$rel_risk_estimates_coxph + ,coxrfx=list1$rel_risk_estimates_coxrfx + ) + ) + out +} + +plot_object_lists<-vector("list",length = length(rdata_names)) +for(i in 1:length(simfun_object_lists)){ + plot_object_lists[[i]]<-lapply(simfun_object_lists[[i]],plot_abs3) +} + + +#### PLOTS + +##boxplots of average absolute error for each simulated data set + +boxplot_fun<-function(plot_obj_list,target){ + limits<-list(coef_errors=0.8,rel_risk_errors=10,pred_errors=0.5) + for(i in 1:length(plot_obj_list)){ + obj1<-apply(plot_obj_list[[i]][[target]][["coxph"]],2,mean) + obj2<-apply(plot_obj_list[[i]][[target]][["coxrfx"]],2,mean) + obj3<-apply(plot_obj_list[[i]][[target]][["null"]],2,mean) + obj1<-sapply(obj1,function(x) ifelse(x=0.8)) + ,rel_risk_errors=c(0,2,4,6,8,expression(NULL>=10)) + ,pred_errors=c(0,0.25,expression(NULL>=0.5)) 
+) +for (i in 1:length(plot_object_lists)){ + for(j in c("coef_errors","rel_risk_errors","pred_errors")){ + plot(NA,ylim=c(0,ylims[j]) + ,xlim=c(0.5,length(x_values[[i]])+0.5) + ,xlab="" + ,main="" + ,yaxt="n" + ,xaxt="n" + ,lwd=0.5 + ,bty="n" + + ) + + axis(1 + ,at=1:length(x_values[[i]]) + ,labels = x_values[[i]] + ,tick=FALSE + ,mgp=c(3,0.075,0) + ,cex.axis=0.8 + ,lwd = 0.75 + ) + axis(2 + ,tick=TRUE + ,tck=-0.05 + ,mgp=c(3,0.4,0) + ,cex.axis=0.7 + ,lwd=0.75 + ,at=y_at[[j]] + ,labels = y_labels[[j]] + ) + mtext("covariates per transition" + ,side=1 + ,line=1 + ,cex=0.5 + ) + mtext("average absolute error" + ,side=2 + ,line=1.3 + ,cex=0.5 + ) + + boxplot_fun(plot_object_lists[[i]],j) + box(lwd=0.75) + + } +} +dev.off() + + + +par(mfrow=c(1,1)) +plot(NA,ylim=c(0,1),bty="n",ylab="",xlab="",xaxt="n",yaxt="n") +legend(x=0.8,y=0.5,legend = c("Cox","EBCox","null"),fill = c("white"),border = c("black","red","blue")) + + + +## plots of proportions of failed estimates +na_function<-function(object,target,estimator){ + if(!target=="coef_estimates"){ + na_prp<-sum(is.na(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]])) + inf_prp<-sum(is.infinite(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]])) + c(na_prp,inf_prp,1-na_prp-inf_prp) + }else{ + na_prp<-sum(is.na(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]])) + inf_prp<-sum(is.infinite(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]])) + out<-c(na_prp,inf_prp,1-na_prp-inf_prp) + names(out)<-c("NA","Inf","valid") + out + } +} + +# batch with 1000 patients per data set +pdf(file ="./plots/na_props.pdf" + ,width = 6 + ,height = 4) +par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0)) +for(i in 1:3){ + for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){ + barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph") + 
barplot(barplot_matrix,border=NA + ,col = c(1,2,4) + ,width=1,xlim = c(0,7),cex.axis =0.8,las=2,xaxt="n") + axis(1,at=c(0.7,1.9,3.1,4.3,5.5,6.7) + ,labels=c(10,100,200,300,400,500) + ,tick = FALSE + ,mgp=c(3,0.3,0) + ,las=1 + ,cex.axis=0.8) + mtext("covariates per trans",cex=0.5,line=1.3,side=1) + mtext("proportion",cex=0.5,line=1.9,side=2,las=3) + } +} +dev.off() + +#batch with 100 patients per data set +pdf(file ="./plots/na_props_100.pdf" + ,width = 6 + ,height = 4) +par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0)) +for(i in 1:3){ + for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){ + barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph") + barplot(barplot_matrix,border=NA + ,col = c(1,2,4) + ,width=1,xlim = c(0,5),cex.axis =0.8,las=2,xaxt="n") + axis(1,at=c(0.7,1.9,3.1,4.3) + ,labels=c(10,40,70,100) + ,tick = FALSE + ,mgp=c(3,0.4,0) + ,las=1 + ,cex.axis=0.8) + mtext("covariates per trans",cex=0.5,line=1.4,side=1) + mtext("proportion",cex=0.5,line=1.9,side=2,las=3) + } +} +dev.off() + +#legend +pdf(file ="./plots/na_props_100_legend.pdf" + ,width = 6 + ,height = 4) +par(mfrow=c(1,1)) +plot(NA,ylim=c(0,1),bty="n",ylab="",xaxt="n",yaxt="n") +legend(x=0.8,y=0.8 + ,legend = c("valid","infinite","NA") + ,fill = c(4,2,1),) +dev.off() + + +``` + + + diff --git a/_articles/RJ-2024-002/scripts/ESM_1.html b/_articles/RJ-2024-002/scripts/ESM_1.html new file mode 100644 index 0000000000..31535a80ca --- /dev/null +++ b/_articles/RJ-2024-002/scripts/ESM_1.html @@ -0,0 +1,1644 @@ + + + + + + + + + + + + + + + +Supplementary Notes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + +
    +

    1 Consistency of the estimator in \(\texttt{ebmstate::CoxRFX}\)

    +

    This section shows the results of a simulation study to assess the consistency of the estimator in \(\texttt{CoxRFX}\) (an estimator of the regression coefficients in a Cox model).

    +

    In the following grid of plots, each column of the grid is based on a different batch of 500 data sets simulated from the same illness-death model. This model is a clock-reset Cox model with 10 regression coefficients for each transition (one for each of 10 covariates), and a Gompertz baseline hazard. The (prior) distribution of the parameters in each transition was assumed to be Gaussian (with undetermined mean and variance). The only difference between batches/columns is the number of observations (i.e. patients) per data set: +250 observations in the first column, 1000 in the second, 2000 in the third and 4000 in the fourth.

    +

    The top row plots compare the true coefficient values with the mean estimate over the 500 data sets in a batch; they give strong indication that the bias vanishes as the sample size increases. At the same time, the distribution of the estimator is concentrated inside increasingly smaller neighbourhoods around the true parameter value. This is strongly suggested by the middle row plots, which show a vanishing sample variance for each individual parameter, and also the bottom row ones, which show the empirical density of the sum of squared errors becoming concentrated in smaller and smaller neighbourhoods around zero.

    +


    +

    +
    +consistency of the CoxRFX estimator. +

    +Figure 1.1: consistency of the CoxRFX estimator. +

    +
    +


    +

    +
    +

    Code to perform the simulation

    +


    +Set the parameters of the simulation

    +
    set.seed(20078)
    +library(mvtnorm)
    +library(ebmstate)
    +
    +n<-4000 # number of patients
    +covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
    +nGroups<-1 #number of groups per transition
    +nParam<-3*length(covariate_names) #total number of parameters (regression coefficients)
    +nr_simulated_data_sets<-500
    +param<-runif(n=nParam,min = -0.5,max = 1) #simulation of parameters
    +file1<-"../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata"
    +

    Generate data by simulation

    +
    coefficient_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))
    +mu_estimates<-matrix(nrow = nr_simulated_data_sets,ncol=3*nGroups)
    +sigma2_estimates<-matrix(nrow = nr_simulated_data_sets,ncol = 3*nGroups)
    +
    +coefficient_estimates_coxph<-matrix(nrow = nr_simulated_data_sets,ncol = 3*length(covariate_names))
    +
    +
    +for (j in 1:nr_simulated_data_sets){
    +  #covariates
    +  if(length(covariate_names)>1){
    +    covariate_matrix<-t(sapply(rep(length(covariate_names),n),function(x) rbinom(n=x,size = 1,prob = 0.5)))
    +  }else{
    +    covariate_matrix<-matrix(rbinom(n,size = 1,prob = 0.5),ncol=1)
    +  }
    +  
    +  colnames(covariate_matrix)<-covariate_names
    +
    +  #relative risks (relative hazards)
    +  rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
    +  rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
    +  rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
    +  
    +  #Generate a transition history for each patient. Clock-reset Cox model. Baseline hazard is Gompertz for all transitions. 
    +
    +  m<-matrix(c(flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),flexsurv::rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
    +  v1<-apply(m,1,which.min)
    +  m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
    +  m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,flexsurv::rgompertz(1,shape = 0.15,rate = rel.risk_trans3[x]*exp(-5.5)),NA)))
    +  m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
    +  m<-cbind(m,rexp(n,0.03))
    +  m<-cbind(m,(m[,5]<m[,4]))
    +  colnames(m)<-c("state1_duration","transition","state2_duration","total_time", "cens_time","cens=1")
    +  m<-as.data.frame(m)
    +
    +  #convert the data to long format
    +  mstate.data<-data.frame()
    +
    +  for(i in 1:nrow(m)){
    +    id<-rep(i,2)
    +    from<-c(1,1)
    +    to<-c(2,3)
    +    trans<-c(1,2)
    +    Tstart<-c(0,0)
    +    Tstop<-rep(min(m$state1_duration[i],m$cens_time[i]),2)
    +    time<-Tstop-Tstart
    +    status<-as.numeric(c(m$transition[i]==1 & m$cens_time[i]>m$state1_duration[i],m$transition[i]==2 & m$cens_time[i]>m$state1_duration[i]))
    +    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) 
    +    if(status[1]==1){
    +      id<-i
    +      from<-2
    +      to<-4
    +      trans<-3
    +      Tstart<-Tstop[1]
    +      Tstop<-min(m$state1_duration[i]+m$state2_duration[i],m$cens_time[i])
    +      time<-Tstop-Tstart
    +      status<-as.numeric(m$state1_duration[i]+m$state2_duration[i]<m$cens_time[i])
    +      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,
    +                                                Tstart=Tstart,Tstop=Tstop,time=time,status=status))
    +    }
    +  }
    +  
    +  #add covariates
    +  mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
    +  
    +  #attributes and class
    +  tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_after_illness"))
    +  class(mstate.data)<-c("data.frame","msdata")
    +  attr(mstate.data,"trans")<-tmat
    +  
    +  #expand covariates
    +  mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
    +  
    +  #Fit empirical Bayes clock-reset Cox model. 
    +  
    +  #argument 'Z' of coxrfx
    +  Z<-mstate.data[,-(1:(8+length(covariate_names)))]
    +  Z$strata<-Z$trans<-mstate.data$trans
    +  class(Z) <- c("data.frame")
    +  
    +  #argument 'surv' of coxrfx
    +  surv<-survival::Surv(mstate.data$time,mstate.data$status)
    +  
    +  #argument 'groups' of coxrfx
    +  groups<-as.vector(sapply(c("trans1","trans2","trans3"),function(x) rep(x,nParam/3)))
    +  
    +  #fit random effects model
    +  coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")
    +
    +  coefficient_estimates[j,]<-coxrfx_object$coefficients
    +  mu_estimates[j,]<-coxrfx_object$mu
    +  sigma2_estimates[j,]<-coxrfx_object$sigma2
    +  
    + 
    +  if(j %%10==0){
    +    save(coefficient_estimates,mu_estimates,sigma2_estimates,param,file = file1)
    +  }
    +
    +  print(j)
    +}
    +

    Code to generate grid of plots

    +
    file_paths<-c("../data/coxph_vs_coxrfx_sim_illness_death_250obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_1000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_2000obs.Rdata","../data/coxph_vs_coxrfx_sim_illness_death_4000obs.Rdata")
    +
    +pdf("./plots/coxrfx_vs_coxph_plot_grid_10covs_per_trans_one_prior_per_trans.pdf",height=8,width = 9)
    +
    +par(mfcol=c(3,4),mar=c(5.1, 4.1, 3.1, 1.1))
    +for(file1 in file_paths){
    +#mse comparison 
    +load(file1)
    +#reorder matrix of estimates
    +coefficient_estimates<-coefficient_estimates[,c(seq(1,ncol(coefficient_estimates),3),seq(2,ncol(coefficient_estimates),3),seq(3,ncol(coefficient_estimates),3))]
    +  
    +#errors
    +errors_coxrfx<-t(coefficient_estimates)-param
    +errors_coxph<-t(coefficient_estimates_coxph)-param
    +
    +#sse
    +sse_coxrfx<-apply(errors_coxrfx^2,2,sum,na.rm=T)
    +
    +file2<-strsplit(file1,"/data/")[[1]][2]
    +file3<-strsplit(file2,".Rdata")
    +
    +plot(0,type="n",xlim=c(-0.5,1),ylim=c(-0.5,1),ylab="average point estimate",xlab="true parameter",cex.lab=0.8,cex.axis=1,las=1,cex.lab=1.3)
    +points(param,apply(coefficient_estimates,2,mean,na.rm=T),cex=0.8,col="red")
    +abline(a=0,b=1)
    +
    +var_coef_estimates<-apply(coefficient_estimates,2,var,na.rm=T)
    +var_coef_estimates_coxph<-apply(coefficient_estimates_coxph,2,var,na.rm=T)
    +barplot(var_coef_estimates,col=c("red"),beside=T,las=1,ylim=c(0,0.18),ylab = "sample variance",cex.lab=1.1,mgp=c(3,1,0))
    +mtext("regression coefficient",side = 1,line = 1,cex=0.8)
    +
    +plot(density(sse_coxrfx,from = 0,kernel = "gaussian",na.rm = T),main="",ylab ="estimated density",xlab="sum of squared errors",xlim=c(0,5.5),ylim=c(0,7.5),col="red",lty=1,las=1,cex.lab=1.3)
    +
    +}
    +
    +dev.off()
    +
    +
    +
    +

    2 Computation of state occupation probabilities for clock-reset models

    +

    In \(\texttt{ebmstate}\), the estimation of state occupation probabilities under clock-reset models, implemented in the functions \(\texttt{probtrans_ebmstate}\) and \(\texttt{probtrans_fft}\), relies on the estimators defined in (3) to (5) of section 3.2 of the paper. Substituting \(\approx\) for \(:=\) and removing the ‘hats’ in definitions (3) to (5), we obtain the approximate equalities that justify the estimators. We justify these approximate equalities as follows.
    +
    +Approximate equality justifying the estimator in definition (3), section 3.2: +\[\begin{align*} +q_{j-1,j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j, \tau_{j-1,j}\in \left[t_{k},t_{k+1}\right)\,|\,X_{j-1}=j-1\right]\\ +&\approx f\left[X_{j}=j,\tau_{j-1,j}=t_{k}\,|\, X_{j-1}=j-1\right] \Delta t\\ +&=\lim_{h \downarrow 0}\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)\,|\,X_{j-1}=j-1\right]}{h}\Delta t\\ +&=\lim_{h \downarrow 0}\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k},t_{k}+h\right)|\tau_{j-1,j} \geq t_{k},X_{j-1}=j-1\right]}{h} \Delta t\\ +&=\mathrm{P} \left[\tau_{j-1,j} \geq t_{k}\,|\,X_{j-1}=j-1\right]\lambda_{j-1,j}\left(t_{k}\right)\,\Delta t\\ +&\approx \exp\left[-\Lambda_{j-1}\left(t_{k}\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}\right)\;. +\end{align*}\]
    +Approximate equality justifying the estimator in definition (4), section 3.2:

    +

    \[\begin{align*} +q_{0j}\left(k\right)&:=\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in\left[t_{k},t_{k+1}\right)\,|\,X_{0}=0\right]\\ +&\approx f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{0}=0\right]\Delta t\\ +&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u ,X_{0}=0\right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}f\left[X_{j}=j,\tau_{0,j}=t_{k}\,|\,X_{j-1}=j-1,\tau_{0,j-1}=u \right]f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{0,j}\geq t_{k} \,|\,X_{j-1}=j-1,\tau_{0,j-1}=u\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{0,j}\in \left[t_{k},t_{k}+h\right)\,|\, \tau_{0,j}\geq t_{k},X_{j-1}=j-1,\tau_{0,j-1}=u \right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}\lim_{h \downarrow 0} \mathrm{P}\left[\tau_{j-1,j}\geq t_{k}-u \,|\,X_{j-1}=j-1\right] \frac{\mathrm{P}\left[X_{j}=j,\tau_{j-1,j}\in \left[t_{k}-u,t_{k}-u+h\right)\,|\, \tau_{j-1,j}\geq t_{k}-u,X_{j-1}=j-1\right]}{h}\,f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&=\int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\Delta t\\ +&\approx \int_{0}^{t_{k}}\exp\left[-\Lambda_{j-1}\left(t_{k}-u\right)\right]\,\Delta\Lambda_{j-1,j}\left(t_{k}-u\right) f\left[X_{j-1}=j-1,\tau_{0,j-1}=u\,|\,X_{0}=0\right]\mathrm{d}u\\ +&\approx \sum_{l=0}^{k-1} q_{j-1,j}\left(k-l-1\right)q_{0,j-1}\left(l\right) \;. +\end{align*}\]
    +Approximate equality justifying the estimator in definition (5), section 3.2:

    +

    \[\begin{align*} +p_{0n}\left(k\right)&=\mathrm{P}\left[X_{n}=n,\tau_{ 0, n}< t_{k} , \tau_{ n, n+1}\geq t_{k}-\tau_{ 0, n}\,|\, X_{0}=0\right]\\ +&=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[X_{n}=n,\tau_{ 0, n}=u,\tau_{ n, n+1}=v\,|\, X_{0}=0\right]\mathrm{d}v\,\mathrm{d}u\\ +&=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u,X_{0}=0\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ +&=\int_{0}^{t_{k}}\int_{t_{k}-u}^{\infty}f\left[\tau_{ n, n+1}=v|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}v \,\mathrm{d}u\\ +&=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n,\tau_{ 0, n}=u\right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&=\int_{0}^{t_{k}} \mathrm{P}\left[\tau_{ n, n+1}>t_{k}-u|X_{n}=n \right]f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&=\int_{0}^{t_{k}} \exp\left[-\Lambda_{n}\left(t_{k}-u\right)\right]\, f\left[X_{n}=n,\tau_{ 0, n}=u\,|\, X_{0}=0\right]\mathrm{d}u\\ +&\approx \sum_{l=0}^{k-1}r_{n}\left(k-l-1\right)q_{0n}\left(l\right)\quad. +\end{align*}\]

    +
    +
    +

    3 Testing \(\texttt{probtrans_ebmstate}\)

    +

    The following plot shows that probtrans_ebmstate can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_ebmstate, when the true (Gompertz) cumulative hazards are given to it as input.
    +

    +
    +accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate. +

    +Figure 3.1: accuracy of the estimator of state occupation probabilities in the function probtrans_ebmstate. +

    +
    +


    +Code used to generate the previous plot.

    +
    # Load packages and set working directory
    +set.seed(9873)
    +library(ebmstate)
    +library(flexsurv)
    +
    +#generate a vector of covariates for the patient
    +nCovs<-50
    +covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)
    +
    +#compute the relative risk (better said: relative hazard) of the patient for each transition
    +param<-runif(n=150,min = -0.5,max = 0.5)
    +rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
    +rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
    +rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))
    +
    +#generate 100,000 uncensored observations for the same patient
    +n<-100000
    +m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
    +v1<-apply(m,1,which.min)
    +m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
    +m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
    +m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
    +colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
    +m<-as.data.frame(m)
    +
    +
    +#Build a function that computes relative frequencies of each state at some time t
    +rel_freq<-function(state,t){
    +  if(state==1){
    +    sum(m[,1]>t)/nrow(m)
    +  }else if(state==2){
    +    sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
    +  }else if(state==3){
    +    sum(m[,1]<t & m[,2]==2)/nrow(m)
    +  }else if(state==4){
    +    sum(m[,2]==1 & m[,4]<t)/nrow(m)
    +  }
    +}
    +
    +#Vectorise the cumulative hazards of the patient
    +time_vector<-seq(0,80,0.01)
    +cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
    +cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
    +cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
    +cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
    +cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
    +cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)
    +
    +#build an msfit object
    +tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
    +msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
    +class(msfit_object)<-c("msfit","coxrfx")
    +
    +#Calculate state occupation probabilities
    +probtrans_object<-probtrans_ebmstate("health",msfit_object,'clockreset')
    +rel_freq_1<-sapply(time_vector,rel_freq,state=1)
    +rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
    +rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
    +rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
    +save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_ebmstate.Rdata")
    +
    +#compare estimates obtained by simulation with the probabilities computed by probtrans_ebmstate
    +plot(probtrans_object,legend = c("","","",""),lwd = 2)
    +lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
    +text(10,0.3,"health")
    +text(37,0.4,"illness")
    +text(78,0.07,"death")
    +text(67,0.6,"death_after_illness")
    +legend("bottomleft", legend = c("using probtrans_ebmstate","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))
    +
    +
    +

    4 Testing \(\texttt{probtrans_fft}\)

    +

    The following plot shows that probtrans_fft can accurately compute state occupation probabilities under clock-reset Cox models, when it is given the vectors of cumulative hazards for each transition. The dashed red lines were computed using a data set of 100,000 simulated disease histories for the same patient (or, equivalently, 100,000 patients with the same vector of covariates). For any time t, these lines give the relative frequencies of each state. They are superimposed on solid black lines which represent the state occupation probabilities as computed by probtrans_fft, when the true (Gompertz) cumulative hazards are given to it as input.
    +

    +
    +accuracy of the estimator of state occupation probabilities in the function probtrans_fft. +

    +Figure 4.1: accuracy of the estimator of state occupation probabilities in the function probtrans_fft. +

    +
    +


    +Code used to generate the previous plot.

    +
    # Load packages and set working directory
    +set.seed(981735)
    +library(ebmstate)
    +library(flexsurv)
    +
    +#generate a vector of covariates for the patient
    +nCovs<-50
    +covariate_vector<-rbinom(nCovs,size = 1,prob = 0.1)
    +
    +#compute the relative risk (better said: relative hazard) of the patient for each transition
    +param<-runif(n=150,min = -0.5,max = 0.5)
    +rel.risk_trans1<-exp(sum(covariate_vector*param[(1+nCovs*0):(nCovs*1)]))
    +rel.risk_trans2<-exp(sum(covariate_vector*param[(1+nCovs*1):(nCovs*2)]))
    +rel.risk_trans3<-exp(sum(covariate_vector*param[(1+nCovs*2):(nCovs*3)]))
    +
    +#generate 100,000 uncensored observations for the same patient
    +n<-100000
    +m<-matrix(c(rgompertz(n, shape=0.1, rate = rel.risk_trans1*exp(-4.5)),rgompertz(n, shape=0.1, rate = rel.risk_trans2*exp(-4.65))),ncol = 2)
    +v1<-apply(m,1,which.min)
    +m<-cbind(sapply(1:nrow(m),function(x) m[x,v1[x]]),v1)
    +m<-cbind(m,sapply(1:nrow(m), function(x) ifelse(m[x,2]==1,rgompertz(1,shape = 0.15,rate = rel.risk_trans3*exp(-5.5)),NA)))
    +m<-cbind(m,apply(m[,c(1,3)],1,sum,na.rm=T))
    +colnames(m)<-c("state1_duration","transition","state2_duration","total_time")
    +m<-as.data.frame(m)
    +
    +
    +#Build a function that computes relative frequencies of each state at some time t
    +rel_freq<-function(state,t){
    +  if(state==1){
    +    sum(m[,1]>t)/nrow(m)
    +  }else if(state==2){
    +    sum(m[,1]<t & m[,2]==1 & m[,4]>t)/nrow(m)
    +  }else if(state==3){
    +    sum(m[,1]<t & m[,2]==2)/nrow(m)
    +  }else if(state==4){
    +    sum(m[,2]==1 & m[,4]<t)/nrow(m)
    +  }
    +}
    +
    +#Vectorise the cumulative hazards of the patient
    +time_vector<-seq(0,80,0.01)
    +cumhaz1<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans1*exp(-4.5))
    +cumhaz2<-Hgompertz(time_vector, shape=0.1, rate = rel.risk_trans2*exp(-4.65))
    +cumhaz3<-Hgompertz(time_vector, shape=0.15, rate = rel.risk_trans3*exp(-5.5))
    +cumhaz1<-data.frame(time=time_vector,Haz=cumhaz1,trans=1)
    +cumhaz2<-data.frame(time=time_vector,Haz=cumhaz2,trans=2)
    +cumhaz3<-data.frame(time=time_vector,Haz=cumhaz3,trans=3)
    +
    +#build an msfit object
    +tmat<-mstate::transMat(x=list(c(2,3),c(4),c(),c()),names=c("health","illness","death","death_illness"))
    +msfit_object<-list(Haz=rbind(cumhaz1,cumhaz2,cumhaz3),trans=tmat)
    +class(msfit_object)<-c("msfit","coxrfx")
    +
    +#Calculate state occupation probabilities
    +probtrans_object<-probtrans_fft("health",msfit_object,max_time=80,nr_steps = 40000)
    +rel_freq_1<-sapply(time_vector,rel_freq,state=1)
    +rel_freq_12<-rel_freq_1+sapply(time_vector,rel_freq,state=2)
    +rel_freq_123<-rel_freq_12+sapply(time_vector,rel_freq,state=3)
    +rel_freq_1234<-rel_freq_123+sapply(time_vector,rel_freq,state=4)
    +save(probtrans_object,rel_freq_1,rel_freq_12,rel_freq_123,rel_freq_1234,time_vector,file = "../data/testing_probtrans_fft.Rdata")
    +
    +#compare estimates obtained by simulation with the probabilities computed by probtrans_fft
    +plot(probtrans_object,legend = c("","","",""),lwd = 2)
    +lines(time_vector,rel_freq_1,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_12,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_123,lwd=2,col="red",lty=3)
    +lines(time_vector,rel_freq_1234,lwd=2,col="red",lty=3)
    +text(10,0.3,"health")
    +text(27,0.4,"illness")
    +text(78,0.07,"death")
    +text(67,0.6,"death_after_illness")
    +legend("bottomleft", legend = c("using probtrans_fft","by simulation"),cex = 0.7,lty = c(1,3),col = c(1,"red"))
    +
    +
    +

    5 \(\texttt{mssample}\) and \(\texttt{probtrans_fft}\): running time and estimation accuracy

    +

    The following script generates plots comparing the running time and the estimation accuracy of \(\texttt{mstate::mssample}\) and \(\texttt{ebmstate::probtrans_fft}\).

    +
    #Generate some data from which cumulative hazards are to be estimated
    +
    +set.seed(89910225)
    +library(parallel)
    +library(flexsurv)
    +library(ebmstate)
    +library(bindata)
    +
    +n<-1000 # number of patients
    +covariate_names<-paste0("Cov",1:10) #number of covariates (for each transition)
    +nGroups<-1 #overall number of priors
    +nTrans<-3 # number of transitions
    +nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
    +nr_simulated_data_sets<-100
    +param_fun<-function(nParam){
    +  out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
    +  out
    +}
    +TimePoints<-seq(10,70,length.out=7) #time points at which state occupation probability estimates are to be retrieved
    +nTimePoints<-length(TimePoints)
    +
    +tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))
    +
    +shape1<-0.1
    +rate1<-exp(-4.5)
    +shape2<-0.1
    +rate2<-exp(-2.7)
    +shape3<-0.15
    +rate3<-exp(-3.5)
    +
    +
    +param<-param_fun(nParam)
    +marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)
    +
    +covariate_matrix<-rmvbin(n,margprob = marg_probs)
    +
    +colnames(covariate_matrix)<-covariate_names
    +
    +
    +#relative risks (relative hazards)
    +rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
    +rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
    +rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
    +
    +
    +#Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions. 
    +
    +m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
    +m<-cbind(m,apply(m,1,sum,na.rm=T))
    +m<-cbind(m,rexp(n,0.008))
    +m<-cbind(m,(m[,5]<m[,4]))
    +colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
    +m<-as.data.frame(m)
    +print(sum(m$`cens=1`)/nrow(m))
    +
    +#convert the data to long format
    +mstate.data<-data.frame()
    +
    +for(i in 1:nrow(m)){
    +  id<-i
    +  from<-1
    +  to<-2
    +  trans<-1
    +  Tstart<-0
    +  Tstop<-min(m$state1_duration[i],m$cens_time[i])
    +  time<-Tstop-Tstart
    +  status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
    +  mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
    +                                            Tstop=Tstop,time=time,status=status)) 
    +  if(status==1){
    +    id<-i
    +    from<-2
    +    to<-3
    +    trans<-2
    +    Tstart<-Tstop
    +    Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
    +    time<-Tstop-Tstart
    +    status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
    +    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
    +                                              Tstop=Tstop,time=time,status=status))
    +    if(status==1){
    +      id<-i
    +      from<-3
    +      to<-4
    +      trans<-3
    +      Tstart<-Tstop
    +      Tstop<-min(m$total_time[i],m$cens_time[i])
    +      time<-Tstop-Tstart
    +      status<-as.numeric(m$total_time[i]<m$cens_time[i])
    +      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,Tstart=Tstart,
    +                                                Tstop=Tstop,time=time,status=status))
    +    }
    +  }
    +  
    +}
    +
    +#add covariates
    +mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
    +
    +#attributes and class
    +class(mstate.data)<-c("data.frame","msdata")
    +attr(mstate.data,"trans")<-tmat
    +
    +#expand covariates
    +mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
    +
    +#Fit non-parametric model and estimate cumulative hazards
    +surv0<-survival::Surv(mstate.data$time,mstate.data$status)
    +coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
    +msfit_object0<-msfit(coxph_object0,trans = tmat)
    +
    +#State occupation probabilities by probtrans_fft
    +
    +system.time(probtrans_object0<-probtrans_fft("state1",msfit_object0,time = seq(0,150,length.out=100000)))
    +
    +tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c())
    +)
    +
    +#State occupation probabilities by mssample
    +
    +system.time(probtrans_object_100<-mssample(msfit_object0$Haz
    +         ,tmat
    +         ,history = list(state=1,time=0,tstate=NULL)
    +         ,clock="reset"
    +         ,tvec=seq(0,100,length.out=5000)
    +         ,output = "state"
    +         ,M=100)
    +)
    +
    +system.time(probtrans_object_1000<-mssample(msfit_object0$Haz
    +                                           ,tmat
    +                                           ,history = list(state=1,time=0,tstate=NULL)
    +                                           ,clock="reset"
    +                                           ,tvec=seq(0,100,length.out=5000)
    +                                           ,output = "state"
    +                                           ,M=1000)
    +)
    +
    +system.time(probtrans_object_10000<-mssample(msfit_object0$Haz
    +                                           ,tmat
    +                                           ,history = list(state=1,time=0,tstate=NULL)
    +                                           ,clock="reset"
    +                                           ,tvec=seq(0,100,length.out=5000)
    +                                           ,output = "state"
    +                                           ,M=10000)
    +)
    +
    +system.time(probtrans_object_100000<-mssample(msfit_object0$Haz
    +                                           ,tmat
    +                                           ,history = list(state=1,time=0,tstate=NULL)
    +                                           ,clock="reset"
    +                                           ,tvec=seq(0,100,length.out=5000)
    +                                           ,output = "state"
    +                                           ,M=100000)
    +)
    +
    +
    +#100 000: 1026 secs
    +#10 000: 93 secs
    +# 1000: 9.3 secs
    +# 100: 0.92 secs
    +# fft: 0.94 secs
    +
    +#Plot code
    +
    +probtrans_objects<-list(probtrans_object_100
    +                        ,probtrans_object_1000
    +                        ,probtrans_object_10000
    +                        ,probtrans_object0[[1]])
    +pdf(file ="./mssample_and_probtrans_fft.pdf"
    +    ,width = 6
    +    ,height = 4)
    +par(mfcol=c(4,4),mar=c(0,0,0.2,0),mgp=c(3,0.5,0),oma=c(3,3,1,1),tck=-0.05)
    +for(j in 1:4){
    +  for(i in 2:5){
    +    ifelse(j==1,yaxt_value<-"s",yaxt_value<-"n")
    +    plot(probtrans_object_100000$time,probtrans_object_100000[[i]]
    +         ,type="l"
    +         ,ylab="",xlab="",xaxt="n"
    +         ,yaxt=yaxt_value,cex.axis=0.7,las=1
    +    )
    +    if(i==5){
    +      axis(1,at=seq(0,90,15),cex.axis=0.75,mgp=c(3,0.2,0))
    +      mtext("time",side = 1,line = 1.2,cex = 0.6)
    +    }
    +    if(j==1){
    +      mtext("probability",side = 2,line = 2,cex = 0.6)
    +    }
    +
    +    points(probtrans_objects[[j]]$time,probtrans_objects[[j]][[i]],type="l"
    +           ,col="red")
    +  }
    +  
    +}
    +dev.off()
    +
    +
    +

    6 Simulation study

    +
    +

    Supplementary figures

    +

    The following figures are supplementary figures to the simulation study reported in section 4 of the main text. They show proportions of valid, infinite and missing (‘NA’) estimates produced by the empirical Bayes Cox estimators implemented in \(\texttt{ebmstate}\) and the fully non-parametric (null) estimators.

    +
    +
    + +
    +estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing ('NA') values for data sets with 100 patients. +

    +Figure 6.1: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 100 patients. +

    +
    +


    +
    +

    +
    +estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing ('NA') values for data sets with 1000 patients. +

    +Figure 6.2: estimation of regression coefficients, relative hazards, and state occupation probabilities with empirical Bayes Cox estimators: proportions of valid, infinite and missing (‘NA’) values for data sets with 1000 patients. +

    +
    +


    +
    +

    +


    +
    +

    +
    +estimation of state occupation probabilities with non-parametric estimators: proportions of valid, infinite and missing ('NA') values. +

    +Figure 6.3: estimation of state occupation probabilities with non-parametric estimators: proportions of valid, infinite and missing (‘NA’) values. +

    +
    +
    +
    +

    Sample script

    +

    The sample script below performs the following tasks: +a) generates data sets of 1000 patients under a Cox model with a ‘linear’ transition structure and 500 covariates/coefficients per transition (non-sparse but scaled regression coefficients); +b) fits the fixed effects Cox model, an empirical Bayes Cox model and a fully non-parametric model; +c) for each data set, estimates state occupation probabilities under these three models; +d) for each data set, estimates true occupation probabilities by simulating from the true model.

    +


    +

    +
    set.seed(0840350)
    +library(parallel)
    +library(flexsurv)
    +library(ebmstate)
    +library(bindata)
    +
    +n<-1000 # number of patients
    +covariate_names<-paste0("Cov",1:500) #number of covariates (for each transition)
    +nGroups<-1 #overall number of priors
    +nTrans<-3 # number of transitions
    +nParam<-nTrans*length(covariate_names) #total number of parameters (regression coefficients)
    +nr_simulated_data_sets<-100
    +param_fun<-function(nParam){
    +  out<-sqrt(10/length(covariate_names))*rnorm(n = nParam,mean = 0,sd = 0.65)
    +  out
    +}
    +TimePoints<-seq(10,70,10) #time points at which state occupation probability estimates are to be retrieved
    +nTimePoints<-length(TimePoints)
    +
    +tmat<-mstate::transMat(x=list(c(2),c(3),c(4),c()),names=c("state1","state2","state3","state4"))
    +
    +shape1<-0.1
    +rate1<-exp(-4.5)
    +shape2<-0.1
    +rate2<-exp(-2.7)
    +shape3<-0.15
    +rate3<-exp(-3.5)
    +
    +file1<-"../data/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"
    +
    +
    +#A function that computes relative frequencies of each state at some time t
    +rel_freq<-function(state,t,m){
    +  if(state=="state1"){
    +    sum(t<=m[,1])/nrow(m)
    +  }else if(state=="state2"){
    +    sum(t>m[,1]&t<=m[,1]+m[,2])/nrow(m)
    +  }else if(state=="state3"){
    +    sum(t>m[,1]+m[,2]&t<=m[,1]+m[,2]+m[,3])/nrow(m)
    +  }else if(state=="state4"){
    +    sum(t>m[,1]+m[,2]+m[,3])/nrow(m)
    +  }
    +}
    +
    +
    +simfun<-function(j){
    +  param<-param_fun(nParam)
    +  marg_probs<-runif(n = length(covariate_names),min = 0.05,max = 0.3)
    +  
    +  true_rh_fun<-function(trans){
    +    exp(covariate_vector%*%param[(1+length(covariate_names)*(trans-1)):(length(covariate_names)*trans)])
    +  }
    +  
    +  rh_fun_coxrfx<-function(trans){
    +    exp(covariate_vector%*%coxrfx_object$coefficients[seq(trans,length(coxrfx_object$coefficients),nTrans)])
    +  }
    +  
    +  rh_fun_coxph<-function(trans){
    +    exp(covariate_vector%*%coxph_object$coefficients[seq(trans,length(coxph_object$coefficients),nTrans)])
    +  }
    +  
    +  covariate_matrix<-rmvbin(n,margprob = marg_probs)
    +  
    +  colnames(covariate_matrix)<-covariate_names
    +  
    +  
    +  #relative risks (relative hazards)
    +  rel.risk_trans1<-exp(covariate_matrix%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
    +  rel.risk_trans2<-exp(covariate_matrix%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
    +  rel.risk_trans3<-exp(covariate_matrix%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
    +  
    +  
    +  #Generate a transition history for each patient. Clock-reset model. Baseline hazard is Gompertz for all transitions. 
    +  
    +  m<-matrix(c(flexsurv::rgompertz(n, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
    +  m<-cbind(m,apply(m,1,sum,na.rm=T))
    +  m<-cbind(m,rexp(n,0.008))
    +  m<-cbind(m,(m[,5]<m[,4]))
    +  colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time", "cens_time","cens=1")
    +  m<-as.data.frame(m)
    +  print(sum(m$`cens=1`)/nrow(m))
    +  
    +  #convert the data to long format
    +  mstate.data<-data.frame()
    +  
    +  for(i in 1:nrow(m)){
    +    id<-i
    +    from<-1
    +    to<-2
    +    trans<-1
    +    Tstart<-0
    +    Tstop<-min(m$state1_duration[i],m$cens_time[i])
    +    time<-Tstop-Tstart
    +    status<-as.numeric(m$cens_time[i]>m$state1_duration[i])
    +    mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) 
    +    if(status==1){
    +      id<-i
    +      from<-2
    +      to<-3
    +      trans<-2
    +      Tstart<-Tstop
    +      Tstop<-min(Tstart[1]+m$state2_duration[i],m$cens_time[i])
    +      time<-Tstop-Tstart
    +      status<-as.numeric(m$cens_time[i]>m$state1_duration[i]+m$state2_duration[i])
    +      mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,                                             trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status))
    +      if(status==1){
    +        id<-i
    +        from<-3
    +        to<-4
    +        trans<-3
    +        Tstart<-Tstop
    +        Tstop<-min(m$total_time[i],m$cens_time[i])
    +        time<-Tstop-Tstart
    +        status<-as.numeric(m$total_time[i]<m$cens_time[i])
    +        mstate.data<-rbind(mstate.data,data.frame(id=id,from=from,to=to,trans=trans,                                     Tstart=Tstart,Tstop=Tstop,time=time,status=status))
    +      }
    +    }
    +    
    +  }
    +  
    +  #add covariates
    +  mstate.data<-cbind(mstate.data,covariate_matrix[mstate.data$id,])
    +  
    +  #attributes and class
    +  class(mstate.data)<-c("data.frame","msdata")
    +  attr(mstate.data,"trans")<-tmat
    +  
    +  
    +  #expand covariates
    +  mstate.data<-mstate::expand.covs(mstate.data,covs =names(mstate.data)[-(1:8)])
    +  
    +  #Fit non-parametric model
    +  surv0<-survival::Surv(mstate.data$Tstart,mstate.data$Tstop,mstate.data$status)
    +  coxph_object0<-coxph(as.formula("surv0~strata(trans)"),data = mstate.data)
    +  msfit_object0<-msfit_generic(coxph_object0,trans = tmat)
    +  probtrans_object0<-probtrans_mstate(msfit_object0,predt = 0)
    +  
    +  #retrieve state occupation prob estimates at times 10,20,...,50
    +  time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object0[[1]]$time-x)))
    +  probtrans_object0_reduced<-probtrans_object0[[1]][time_indices,1:(ncol(tmat)+1)]
    +  state_occup_estimates0<-as.matrix(probtrans_object0_reduced)
    +  
    +  
    +  #Fit homogeneous clock-reset empirical Bayes model. 
    +  
    +  #argument 'Z' of coxrfx
    +  Z<-mstate.data[,-(1:(8+length(covariate_names)))]
    +  Z$strata<-Z$trans<-mstate.data$trans
    +  
    +  #argument 'surv' of coxrfx
    +  surv<-survival::Surv(mstate.data$time,mstate.data$status)
    +
    +  #argument 'groups' of coxrfx
    +  groups<-rep("unique_group",length(param))
    +  
    +  #fit random effects model
    +  coxrfx_object<-CoxRFX(Z,surv,groups,max.iter = 600,tol = 0.0001,sigma.hat = "df")
    +  cat("coxrfx_concordance:",concordance(coxrfx_object)$concordance)
    +  
    +  #fit fixed effects model
    +  
    +  model_formula<-as.formula(paste0("surv~",paste(head(names(Z),length(names(Z))-2),collapse = "+"),"+strata(strata)"))
    +  try(coxph_object<-ebmstate:::coxph(formula = model_formula,data=Z,control = coxph.control(iter.max = 100)),silent=TRUE)
    +  try(cat("coxph_concordance:",concordance(coxph_object)$concordance),silent=TRUE)
    +  
    +  #simulate single patient covariate data for computing relative and cumulative hazards 
    +  covariate_vector<-rmvbin(1,marg_probs)
    +  
    +  #single patient true relative hazards
    +  true_rel_risk_sampled_patient<-sapply(1:nTrans,true_rh_fun)
    +  
    +  #estimated relative hazard for patient (coxrfx)
    +  rel_risk_estimates_coxrfx<-sapply(1:nTrans,rh_fun_coxrfx)
    +  
    +  #estimated relative risks for patient (coxph)
    +  try(rel_risk_estimates_coxph<-sapply(1:nTrans,rh_fun_coxph),silent=TRUE)
    +  
    +  #put patient covariate vector in long format
    +  newdata<-data.frame(matrix(rep(covariate_vector,length(na.exclude(as.vector(tmat)))),nrow=length(na.exclude(as.vector(tmat))),byrow = T))
    +  names(newdata)<-covariate_names
    +  newdata$strata<-newdata$trans<-1:nTrans
    +  attr(newdata,"trans")<-tmat
    +  class(newdata)<-c("data.frame","msdata")
    +  newdata<-mstate::expand.covs(newdata,covs=names(newdata)[!names(newdata)%in%c("trans","strata")],append=TRUE)
    +  newdata<-newdata[-(1:length(covariate_names))]
    +  
    +  
    +  #compute cumulative hazards and state occupation probs (coxrfx)
    +  msfit_object_coxrfx<-msfit_generic(coxrfx_object,newdata,trans = tmat)
    +  probtrans_object_coxrfx<-probtrans_fft("state1",msfit_object_coxrfx,time = seq(0,80,length.out=50000))
    +  
    +  #retrieve state occupation prob estimates at times 10,20,...,50
    +  time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxrfx[[1]]$time-x)))
    +  probtrans_object_coxrfx_reduced<-probtrans_object_coxrfx[[1]][time_indices,]
    +  state_occup_estimates_coxrfx<-as.matrix(probtrans_object_coxrfx_reduced)
    +  
    +  #compute cumulative hazards and state occupation probs (coxph)
    +  try(msfit_object_coxph<-msfit_generic(coxph_object,newdata,trans = tmat),silent = TRUE)
    +  try(probtrans_object_coxph<-probtrans_fft("state1",msfit_object_coxph,time = seq(0,80,length.out=50000)),silent = TRUE)
    +  
    +  #retrieve state occupation prob estimates at times 10,20,...,50
    +  try(time_indices<-sapply(TimePoints,function(x) which.min(abs(probtrans_object_coxph[[1]]$time-x))),silent = TRUE)
    +  try(probtrans_object_coxph_reduced<-probtrans_object_coxph[[1]][time_indices,],silent = TRUE)
    +  try(state_occup_estimates_coxph<-as.matrix(probtrans_object_coxph_reduced),silent = TRUE)
    +  
    +  
    +  #Get very precise estimates of state occupation probabilities for particular patient
    +  rel.risk_trans1<-exp(covariate_vector%*%param[(1+length(covariate_names)*0):(length(covariate_names)*1)])
    +  rel.risk_trans2<-exp(covariate_vector%*%param[(1+length(covariate_names)*1):(length(covariate_names)*2)])
    +  rel.risk_trans3<-exp(covariate_vector%*%param[(1+length(covariate_names)*2):(length(covariate_names)*3)])
    +  
    +  #generate 100,000 uncensored observations for the same patient
    +  n2<-100000
    +  
    +  m<-matrix(c(flexsurv::rgompertz(n2, shape=shape1, rate = rel.risk_trans1*rate1),flexsurv::rgompertz(n2, shape=shape2, rate = rel.risk_trans2*rate2),flexsurv::rgompertz(n2, shape=shape3, rate = rel.risk_trans3*rate3)),ncol = 3)
    +  m<-cbind(m,rowSums(m,na.rm=TRUE))
    +  colnames(m)<-c("state1_duration","state2_duration","state3_duration","total_time")
    +  m<-as.data.frame(m)
    +  
    +  time_vector<-TimePoints
    +  rel_freq_1<-sapply(time_vector,rel_freq,state="state1",m=m)
    +  rel_freq_2<-sapply(time_vector,rel_freq,state="state2",m=m)
    +  rel_freq_3<-sapply(time_vector,rel_freq,state="state3",m=m)
    +  rel_freq_4<-sapply(time_vector,rel_freq,state="state4",m=m)
    +  
    +  true_probs_matrix<-cbind(time_vector,rel_freq_1,rel_freq_2,rel_freq_3,rel_freq_4)
    +  if(!'coxph_object'%in%ls()) coxph_object<-list(coefficients=NA)
    +  if(!'rel_risk_estimates_coxph'%in%ls()) rel_risk_estimates_coxph<-NA 
    +  if(!'state_occup_estimates_coxph'%in%ls()) state_occup_estimates_coxph<-NA 
    +  
    +  
    +  list(n=n,nGroups=nGroups,nTrans=nTrans,nParam=nParam,param=param,
    +       TimePoints=TimePoints,tmat=tmat,
    +       coxrfx_coefficients=coxrfx_object$coefficients,
    +       coxrfx_mu=coxrfx_object$mu,
    +       coxrfx_sigma2=coxrfx_object$sigma2,
    +       coxph_coefficients=coxph_object$coefficients,
    +       covariate_vector=covariate_vector,
    +       true_rel_risk_sampled_patient=true_rel_risk_sampled_patient,
    +       rel_risk_estimates_coxph=rel_risk_estimates_coxph,
    +       rel_risk_estimates_coxrfx=rel_risk_estimates_coxrfx,
    +       state_occup_estimates_coxph=state_occup_estimates_coxph,
    +       state_occup_estimates_coxrfx=state_occup_estimates_coxrfx,
    +       state_occup_estimates0=state_occup_estimates0,
    +       true_probs_matrix=true_probs_matrix,
    +       marg_probs=marg_probs)
    +  
    +  
    +}
    +
    +simfun_object<-mclapply(1:300,simfun,mc.cores = 50)
    +save(simfun_object,file=file1)
    +


    +The example script below generates boxplots of average absolute errors and plots of the proportions of failed estimates. It takes as input a set of objects with model estimates, such as the one generated by the previous block of code.

    +
    ######### IMPORTING SIMULATED DATA
    +#Choose one of the following batches of data to import
    +
    +### Batch 1: each simulated data set has 1000 patients
    +rdata_names<-vector("list",3)
    +rdata_names[[1]]<-c(
    +  "../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
    +  ,  "../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_LinearStructure.Rdata"
    +)
    +rdata_names[[2]]<-c(
    +  "../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_CompetingRisksTransStructure.Rdata"
    +)
    +rdata_names[[3]]<-c(
    +  "../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_200vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_300vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_400vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_1000patients/dense_sim_1000obs_1group_for_all_trans_500vars_MtransStructure.Rdata"
    +)
    +
    +### Batch 2: each simulated data set has 100 patients
    +rdata_names<-vector("list",3)
    +rdata_names[[1]]<-c(
    +  "../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_LinearStructure.Rdata"
    +  ,"../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_LinearStructure.Rdata"
    +  ,  "../data/Linear_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_LinearStructure.Rdata"
    +)
    +rdata_names[[2]]<-c(
    +  "../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_CompetingRisksTransStructure.Rdata"
    +  ,"../data/Comp_risks_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_CompetingRisksTransStructure.Rdata"
    +)
    +rdata_names[[3]]<-c(
    +  "../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_10vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_40vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_70vars_MtransStructure.Rdata"
    +  ,"../data/M_structure_100patients/dense_sim_100obs_1group_for_all_trans_100vars_MtransStructure.Rdata"
    +)
    +
    +## load the batch of simulated data chosen
    +
    +x_values<-vector("list",3)
    +simfun_object_lists<-vector("list",length(rdata_names))
    +
    +for(i in 1:length(rdata_names)){
    +  x_values[[i]]<-rep(NA,length(rdata_names[[i]]))
    +  for(j in 1:length(x_values[[i]])){
    +    load(rdata_names[[i]][j])
    +    simfun_object_lists[[i]][[j]]<-simfun_object
    +    x_values[[i]][j]<-length(simfun_object[[1]]$covariate_vector)
    +  }
    +}
    +rm(simfun_object)
    +
    +unwrap_fun<-function(simfun_object,name){
    +  if(name%in%c("n","nGroups","nTrans","nParam","param","TimePoints"
    +               ,"tmat","coxrfx_mu","coxrfx_sigma2","covariate_vector"
    +               ,"true_rel_risk_sampled_patient","true_probs_matrix"
    +               ,"marg_probs")){
    +    sapply(simfun_object,function(x){
    +      return(x[[name]])
    +    } 
    +    )
    +  }else if(name%in%c("coxph_coefficients","coxrfx_coefficients")){
    +    sapply(simfun_object,function(x){
    +      if(length(x[[name]])==x$nParam){
    +        return(x[[name]])
    +      }else{
    +        return(rep(NA,x$nParam))
    +      }
    +    } 
    +    )   
    +  }else if(grepl("rel_risk_estimates",name)){
    +    sapply(simfun_object,function(x){
    +      if(length(x[[name]])==x$nTrans){
    +        return(x[[name]])
    +      }else{
    +        return(rep(NA,x$nTrans))
    +      }
    +    } 
    +    )
    +  }else if(grepl("state_occup_estimates",name)){
    +    out<-sapply(simfun_object,function(x){
    +      if(length(as.vector(x[[name]]))==prod(dim(x$true_probs_matrix))){
    +        return(as.vector(x[[name]])[-(1:dim(x$true_probs_matrix)[1])])
    +      }else{
    +        return(rep(NA,prod(dim(x$true_probs_matrix))-dim(x$true_probs_matrix)[1]))
    +      }
    +    } 
    +    )
    +    invalid_estimates<-which(abs(colSums(out)-nrow(simfun_object[[1]]$true_probs_matrix))>1)
    +    out[,invalid_estimates]<-NA
    +    out
    +  }
    +  
    +}
    +
    +plot_abs3<-function(simfun_object){
    +  list1<-sapply(names(simfun_object[[1]]),unwrap_fun,simfun_object=simfun_object,USE.NAMES = TRUE)
    +  names(list1)<-names(simfun_object[[1]])
    +  list1$nr_simulated_data_sets<-length(simfun_object)
    +  param_order_fun<-function(nTrans,nParam){
    +    as.vector(sapply(1:(nParam/nTrans),function(x) seq(x,nParam,nParam/nTrans)))
    +  }
    +  param_order<-param_order_fun(list1$nTrans[1],list1$nParam[1])
    +  list1$param<-list1$param[param_order,]
    +
    +  abs_errors_coef0<-abs(list1$param)
    +  abs_errors_coef_coxph<-abs(list1$coxph_coefficients-list1$param)
    +  abs_errors_coef_coxrfx<-abs(list1$coxrfx_coefficients-list1$param)
    +  
    +  abs_errors_rel_risk_coxph<-abs(list1$rel_risk_estimates_coxph-list1$true_rel_risk_sampled_patient)
    +  abs_errors_rel_risk_coxrfx<-abs(list1$rel_risk_estimates_coxrfx-list1$true_rel_risk_sampled_patient)
    +  abs_errors_rel_risk0<-abs(1-list1$true_rel_risk_sampled_patient)
    +  
    +  abs_errors_pred_coxph<-abs(list1$state_occup_estimates_coxph-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
    +  abs_errors_pred_coxrfx<-abs(list1$state_occup_estimates_coxrfx-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
    +  abs_errors_pred0<-abs(list1$state_occup_estimates0-list1$true_probs_matrix[-(1:dim(simfun_object[[1]]$true_probs_matrix)[1]),])
    +  
    +  out<-list(coef_errors=list(coxph=abs_errors_coef_coxph
    +                             ,coxrfx=abs_errors_coef_coxrfx
    +                             ,null=abs_errors_coef0
    +                             )
    +            ,rel_risk_errors=list(coxph=abs_errors_rel_risk_coxph
    +                           ,coxrfx=abs_errors_rel_risk_coxrfx
    +                           ,null=abs_errors_rel_risk0)
    +            ,pred_errors=list(coxph=abs_errors_pred_coxph
    +                       ,coxrfx=abs_errors_pred_coxrfx
    +                       ,null=abs_errors_pred0
    +            )
    +            ,true_values=list(coef=list1$param
    +                              ,rel_risk=list1$true_rel_risk_sampled_patient
    +                              ,pred=list1$true_probs_matrix
    +                              )
    +            ,pred_estimates=list(coxph=list1$state_occup_estimates_coxph
    +                            ,coxrfx=list1$state_occup_estimates_coxrfx
    +                            ,null=list1$state_occup_estimates0
    +                            )
    +            ,coef_estimates=list(coxph=list1$coxph_coefficients
    +                                 ,coxrfx=list1$coxrfx_coefficients
    +                                 )
    +            ,rel_risk_estimates=list(coxph=list1$rel_risk_estimates_coxph
    +                                     ,coxrfx=list1$rel_risk_estimates_coxrfx
    +                                     )
    +  )
    +  out
    +}
    +
    +plot_object_lists<-vector("list",length = length(rdata_names))
    +for(i in 1:length(simfun_object_lists)){
    +  plot_object_lists[[i]]<-lapply(simfun_object_lists[[i]],plot_abs3)
    +}
    +
    +
    +#### PLOTS
    +
    +##boxplots of average absolute error for each simulated data set
    +
    +boxplot_fun<-function(plot_obj_list,target){
    +  limits<-list(coef_errors=0.8,rel_risk_errors=10,pred_errors=0.5)
    +  for(i in 1:length(plot_obj_list)){
    +    obj1<-apply(plot_obj_list[[i]][[target]][["coxph"]],2,mean)
    +    obj2<-apply(plot_obj_list[[i]][[target]][["coxrfx"]],2,mean)
    +    obj3<-apply(plot_obj_list[[i]][[target]][["null"]],2,mean)
    +    obj1<-sapply(obj1,function(x) ifelse(x<limits[[target]]
    +                                         ,x,limits[[target]]))
    +    obj2<-sapply(obj2,function(x) ifelse(x<limits[[target]]
    +                                         ,x,limits[[target]]))
    +    obj3<-sapply(obj3,function(x) ifelse(x<limits[[target]]
    +                                         ,x,limits[[target]]))
    +    try(boxplot(obj1,add=TRUE, at = i-0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE))
    +    try(boxplot(obj2,add=TRUE,at = i,boxwex=0.35,pars=list(outcol="red",outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="red"))
    +    try(boxplot(obj3,add=TRUE,at = i+0.22,boxwex=0.35,pars=list(outcex=0.2,boxfill="white",lwd=0.7,medlwd=0.75,whisklty=1),yaxt="n",axes=FALSE,border="blue"))
    +  }
    +  
    +}
    +
    +
    +
    +pdf(file ="./plots/rplots.pdf"
    +    ,width = 6
    +    ,height = 4)
    +par(mfrow=c(3,3),mar=c(2,2,1,0.5))
    +ylims<-c(0.8,10,0.5)
    +names(ylims)<-c("coef_errors","rel_risk_errors","pred_errors")
    +y_at<-list(coef_errors=seq(0,0.8,0.2)
    +           ,rel_risk_errors=seq(0,10,2)
    +           ,pred_errors=seq(0,0.5,0.25)
    +           )
    +y_labels<-list(coef_errors=c(0,0.2,0.4,0.6,expression(NULL>=0.8))
    +           ,rel_risk_errors=c(0,2,4,6,8,expression(NULL>=10))
    +           ,pred_errors=c(0,0.25,expression(NULL>=0.5))
    +)
    +for (i in 1:length(plot_object_lists)){
    +  for(j in c("coef_errors","rel_risk_errors","pred_errors")){
    +    plot(NA,ylim=c(0,ylims[j])
    +            ,xlim=c(0.5,length(x_values[[i]])+0.5)
    +            ,xlab=""
    +            ,main=""
    +            ,yaxt="n"
    +            ,xaxt="n"
    +            ,lwd=0.5
    +            ,bty="n"
    +            
    +    )
    +
    +    axis(1
    +         ,at=1:length(x_values[[i]])
    +         ,labels = x_values[[i]]
    +         ,tick=FALSE
    +         ,mgp=c(3,0.075,0)
    +         ,cex.axis=0.8
    +         ,lwd = 0.75
    +         )
    +    axis(2
    +         ,tick=TRUE
    +         ,tck=-0.05
    +         ,mgp=c(3,0.4,0)
    +         ,cex.axis=0.7
    +         ,lwd=0.75
    +         ,at=y_at[[j]]
    +         ,labels = y_labels[[j]]
    +    )
    +    mtext("covariates per transition"
    +          ,side=1
    +          ,line=1
    +          ,cex=0.5
    +          )
    +    mtext("average absolute error"
    +          ,side=2
    +          ,line=1.3
    +          ,cex=0.5
    +    )
    +    
    +    boxplot_fun(plot_object_lists[[i]],j)
    +    box(lwd=0.75)
    +    
    +  }
    +}
    +dev.off()
    +
    +
    +
    +par(mfrow=c(1,1))
    +plot(NA,ylim=c(0,1),bty="n",ylab="",xlab="",xaxt="n",yaxt="n")
    +legend(x=0.8,y=0.5,legend = c("Cox","EBCox","null"),fill = c("white"),border = c("black","red","blue"))
    +
    +
    +
    +## plots of proportions of failed estimates
    +na_function<-function(object,target,estimator){
    +  if(!target=="coef_estimates"){
    +    na_prp<-sum(is.na(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
    +    inf_prp<-sum(is.infinite(as.vector(object[[target]][[estimator]])))/length(as.vector(object[[target]][[estimator]]))
    +    c(na_prp,inf_prp,1-na_prp-inf_prp)
    +  }else{
    +    na_prp<-sum(is.na(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
    +    inf_prp<-sum(is.infinite(unlist(object[[target]][[estimator]])))/length(unlist(object[[target]][[estimator]]))
    +    out<-c(na_prp,inf_prp,1-na_prp-inf_prp)
    +    names(out)<-c("NA","Inf","valid")
    +    out
    +  }
    +}
    +
    +# batch with 1000 patients per data set
    +pdf(file ="./plots/na_props.pdf"
    +    ,width = 6
    +    ,height = 4)
    +par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
    +for(i in 1:3){
    +  for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
    +    barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
    +    barplot(barplot_matrix,border=NA
    +            ,col = c(1,2,4)
    +            ,width=1,xlim = c(0,7),cex.axis =0.8,las=2,xaxt="n")
    +    axis(1,at=c(0.7,1.9,3.1,4.3,5.5,6.7)
    +         ,labels=c(10,100,200,300,400,500)
    +         ,tick = FALSE
    +         ,mgp=c(3,0.3,0)
    +         ,las=1
    +         ,cex.axis=0.8)
    +    mtext("covariates per trans",cex=0.5,line=1.3,side=1)
    +    mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
    +  }
    +}
    +dev.off()
    +
    +#batch with 100 patients per data set
    +pdf(file ="./plots/na_props_100.pdf"
    +    ,width = 6
    +    ,height = 4)
    +par(mfrow=c(3,3),mar=c(3,3.5,1,0),mgp=c(3,0.75,0))
    +for(i in 1:3){
    +  for(j in c("coef_estimates","rel_risk_estimates","pred_estimates")){
    +    barplot_matrix<-sapply(plot_object_lists[[i]],na_function,target=j,estimator="coxph")
    +    barplot(barplot_matrix,border=NA
    +            ,col = c(1,2,4)
    +            ,width=1,xlim = c(0,5),cex.axis =0.8,las=2,xaxt="n")
    +    axis(1,at=c(0.7,1.9,3.1,4.3)
    +         ,labels=c(10,40,70,100)
    +         ,tick = FALSE
    +         ,mgp=c(3,0.4,0)
    +         ,las=1
    +         ,cex.axis=0.8)
    +    mtext("covariates per trans",cex=0.5,line=1.4,side=1)
    +    mtext("proportion",cex=0.5,line=1.9,side=2,las=3)
    +  }
    +}
    +dev.off()
    +
    +#legend
    +pdf(file ="./plots/na_props_100_legend.pdf"
    +    ,width = 6
    +    ,height = 4)
    +par(mfrow=c(1,1))
    +plot(NA,ylim=c(0,1),bty="n",ylab="",xaxt="n",yaxt="n")
    +legend(x=0.8,y=0.8
    +       ,legend = c("valid","infinite","NA")
    +       ,fill = c(4,2,1),)
    +dev.off()
    +
    +
    +
    +
    +
      +
    1. European Bioinformatics Institute (EMBL-EBI), ruibarrigana@hotmail.com↩︎

    2. Genome Biology Unit, EMBL↩︎

    3. German Cancer Research Center (DKFZ)↩︎
    +
    + + + + +
    + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-002/scripts/ESM_2.Rmd b/_articles/RJ-2024-002/scripts/ESM_2.Rmd new file mode 100644 index 0000000000..987821f9d9 --- /dev/null +++ b/_articles/RJ-2024-002/scripts/ESM_2.Rmd @@ -0,0 +1,671 @@ +--- +title: "MDS data analysis" +author: +- Rui J Costa^[European Bioinformatics Institute (EMBL-EBI), ruibarrigana@hotmail.com] +- Moritz Gerstung^[Genome Biology Unit, EMBL] ^[German Cancer Research Center (DKFZ)] +toc-title: Contents +header-includes: \usepackage{amsmath,amsfonts,amssymb,amsthm,verbatim} +output: + bookdown::html_document2: + number_sections: no + toc: yes + toc_depth: 4 + pdf_document: + toc: yes + toc_depth: '4' + word_document: + toc: yes + toc_depth: '4' +subtitle: This document is part of the supplementary material to Costa, + R. J., Gerstung, M. (2024), ebmstate -- an R package For Disease + Progression Analysis Under Empirical Bayes Cox Models, *The R Journal*. +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE,eval = FALSE + ,tidy.opts=list(width.cutoff=80), tidy=TRUE + ) + +``` +\ +\ + +#### Preamble +```{r} +set.seed(973009) +library(ebmstate) +library(RColorBrewer) +library(plotrix) +library(caret) + +``` + +#### Pre-processing the data + +```{r echo=FALSE, eval=FALSE} +load("../data/processed_data.Rdata") +``` + +The pre-processing steps produce a data frame called 'imputedData', with the covariate data after imputation has been carried out, and a data frame called 'mdsClinicalData', with the disease progression data and unprocessed covariate data. 
+ +```{r} +mdsClinicalData <- read.table("../data/mds.paper.clin.txt", header=T, sep="\t", fill=T, na.strings = c("NA","na")) ## Import clinical data +mdsClinicalData <- mdsClinicalData [!duplicated(mdsClinicalData$PDID),] ## remove duplicated observations +mdsClinicalData[mdsClinicalData==""] = NA #replace empty string with NAs +mdsClinicalData = mdsClinicalData[mdsClinicalData$X0.no.mut.1.seq.2.removedbyqc.3.failed <2,] +mdsClinicalData$IPSS.norm = factor(tolower(as.character(mdsClinicalData$IPSS.norm)), levels=c("low", "int-1", "int-2", "high")) # removes factor level "Low", keeping factor level "low" +``` + + +```{r} +mdsGeneticData <- read.table("../data/MDS.TPD.20Nov2012.csv", sep=",", header=T, fill=T, quote = "\"") ## Genotypes +mdsGeneticData$Gene<-factor(mdsGeneticData$Gene) +levels(mdsGeneticData$Gene)[levels(mdsGeneticData$Gene)=="SFRS2"] = "SRSF2" #SFRS2 is alternative label for gene SRSF2 +levels(mdsGeneticData$Gene)[levels(mdsGeneticData$Gene)=="ENSG00000091592"] = "NLRP1" #change level name +mdsGeneticData$Decision<-factor(mdsGeneticData$Decision) +``` + +Create a matrix whose i,j entry corresponds to the patient i and gene j (there are no duplicated patients or genes). +The entry in this matrix is 3 if patient i has at least one oncogenic mutation in gene j. It is 2, if the patient has at least one possibly oncogenic mutation in this gene. Or 1 if there's at least one mutation of unknown oncogenic status in the gene. 
+```{r} +IDs <- mdsClinicalData$PDID +allGenotypes <- matrix(0,nrow = length(IDs), ncol = length(levels(mdsGeneticData$Gene))) +rownames(allGenotypes) <- IDs +colnames(allGenotypes) <- levels(mdsGeneticData$Gene) +allGenotypes <- allGenotypes[rownames(allGenotypes)!="", colnames(allGenotypes)!=""] +for(i in seq_along(mdsGeneticData$Gene)){ + if(mdsGeneticData$SAMPLE.NAME[i] %in% IDs) + allGenotypes[as.character(mdsGeneticData$SAMPLE.NAME[i]), as.character(mdsGeneticData$Gene[i])] <- max(c(3,2,1)[as.numeric(mdsGeneticData$Decision[i])], allGenotypes[as.character(mdsGeneticData$SAMPLE.NAME[i]), as.character(mdsGeneticData$Gene[i])]) +} +``` + +Restrict to matching PDIDs, mutated genes +```{r} +genotypes <- allGenotypes[,colSums(allGenotypes)>0] +``` + +Create 5 indicator (binary) variables (one for each center). +```{r} +centers <- sapply(unique(1:5), function(i) mdsClinicalData$center==i) + 0 +colnames(centers) <- paste("center",1:5, sep="") +``` + +Create object cytoMerged merging some cytogenetic variables from mdsClinicalData. +CytoMerged includes all observations on variables with prefix "CYTO_" in mdsClinicalData. +If observation i on variable "CYTO_X" is missing, observation i on variable "SEQ_X" is used (in case "SEQ_X"[i] is not also missing). 
+ +```{r} +cyto = mdsClinicalData[,grepl("CYTO_",colnames(mdsClinicalData))] +colnames(cyto) = c( "chr3" , "del5q" ,"del7_7q" ,"tri8" , "del11" , "del12", "alt17q" , "tri19" ,"del20q" ,"delY", "other" , "complex") +ascat = mdsClinicalData[,grepl("SEQ_",colnames(mdsClinicalData))] +colnames(ascat) = c("tri8" , "del5" ,"del7_7q" , "del11q" ,"del12p", "alt17q" , "tri19" , "del20q","other") + +cytoMerged = cyto +for(c in colnames(cyto)) + if(c %in% colnames(ascat)) + cytoMerged[,c][is.na(cytoMerged[,c])] = ascat[,c][is.na(cytoMerged[,c])] +``` + +Simplified WHO types +```{r} +#indicator variables for simplified who classes +whoSimple = data.frame( + ra = mdsClinicalData$WHO.category %in% c("RA","RT"), + rars = mdsClinicalData$WHO.category == "RARS", + rars_t = mdsClinicalData$WHO.category == "RARS-T", + rcmd = mdsClinicalData$WHO.category == "RCMD", + rcmd_rs = mdsClinicalData$WHO.category == "RCMD-RS", + raeb = mdsClinicalData$WHO.category %in% c("RAEB", "RAEB 1", "RAEB 2"), + d5q = mdsClinicalData$WHO.category == "5q-", + cmml = mdsClinicalData$WHO.category == "CMML", + mds_mpn = mdsClinicalData$WHO.category == "MDSMPN", + mds_u = mdsClinicalData$WHO.category =="MDS-U", + mds_aml = mdsClinicalData$WHO.category == "AML-MDS" +) + 0 + +# factor vector for simplified WHO classes +whoSimpleFactor = factor(rowSums(whoSimple * rep(1:ncol(whoSimple), each=nrow(whoSimple))), labels = c("RA","RARS","RARS-T","RCMD","RCMD-RS","RAEB","5q-","CMML","MDS-MPN","MDS-U","MDS-AML")) +``` + +Combine into single data.frame (only covariates) +```{r} +d <- genotypes >= 3 #only oncogeneic mut +d <- d[,colSums(d) >0] # only genes mutated at least once +rawData <- data.frame(d, + cytoMerged, + age_log = log(as.numeric(as.character(mdsClinicalData$AGE))), + sex = mdsClinicalData$Gender, + pb_cytopenia = as.numeric(mdsClinicalData$PB.CYTOPENIA), + hb = as.numeric(mdsClinicalData$HB), + anc_log = log(as.numeric(as.character(mdsClinicalData$ANC))+1e-3), + plt_log = 
log(as.numeric(mdsClinicalData$PLT)), + bm_blasts_logit = car::logit(as.numeric(as.character(mdsClinicalData$X..BM.BLASTS))), + ring_sideroblasts_logit = car::logit(as.numeric(as.character(mdsClinicalData$X..RING.SIDEROBLASTS))), + ipss = as.numeric(mdsClinicalData$IPSS.norm), + who_simple_factor = ebmstate:::MakeInteger(whoSimpleFactor), #essentially the same as 'whoSimple' above + center = ebmstate:::MakeInteger(as.factor(mdsClinicalData$center)), #essentially the same as 'centers' above + date = (as.numeric(as.Date(mdsClinicalData$DATE.OF.DIAGNOSIS, format="%d/%m/%Y"))-4122)/(365.25*5)#date is the time since the oldest diagnosis in units of 5 years +) +``` + +Correct covariate classes +```{r } +logical_covs<-c('ASXL1','ATRX','BCOR','BRAF','CBL','CDKN2A','CEBPA','CREBBP','CTNNA1','CUX1','DNMT3A','EP300','ETV6','EZH2','FLT3','GATA2','GNAS','IDH1','IDH2','IRF1','JAK2','KDM6A','KIT','KRAS','MLL2','MPL','NF1','NPM1','NRAS','PHF6','PTEN','PTPN11','RAD21','RUNX1','SF3B1','SH2B3','SRSF2','STAG2','TET2','TP53','U2AF1','WT1','ZRSR2','chr3','del5q','del7_7q','tri8','del11','del12','alt17q','tri19','del20q','delY','other','complex','sex','pb_cytopenia','who_simple_factor.RA','who_simple_factor.RARS','who_simple_factor.RARS.T','who_simple_factor.RCMD','who_simple_factor.RCMD.RS','who_simple_factor.RAEB','who_simple_factor.5q.','who_simple_factor.CMML','who_simple_factor.MDS.MPN','who_simple_factor.MDS.U','who_simple_factor.MDS.AML','center.1','center.2','center.3','center.4','center.5') +numeric_covs<-c('age_log','hb','anc_log','plt_log','bm_blasts_logit','ring_sideroblasts_logit','date') +factor_covs<-c('ipss') +covariate_classes<-list(logical_covs=logical_covs,numeric_covs=numeric_covs,factor_covs=factor_covs) + +for (i in names(rawData)){ + class_to_assign<-c("logical","numeric","factor")[sapply(covariate_classes,function(x) i%in%x)] + if(class_to_assign!="factor"){ + class(rawData[[i]])<-class_to_assign + }else{ + rawData[[i]]<-as.factor(rawData[[i]]) + } +} + + +``` + 
+Imputation of missing values by covariate-wise hot deck imputation. +```{r } +poorMansImpute <- function(x) {x[is.na(x)] <- sample(x[!is.na(x)],sum(is.na(x)),replace = T); return(x)} +imputedData <- as.data.frame(sapply(rawData,poorMansImpute)) + +``` + +Include only patients which have a date of diagnosis, a last follow-up date, and indicator variables for death and AML progression (153 patients are excluded). +```{r } +imputedData<-imputedData[!(is.na(mdsClinicalData$DATE.OF.DIAGNOSIS)|is.na(mdsClinicalData$DATE.LAST.FU)|is.na(mdsClinicalData$OUTCOME)|is.na(mdsClinicalData$AML.PROGRESSION)),] +mdsClinicalData<-mdsClinicalData[!(is.na(mdsClinicalData$DATE.OF.DIAGNOSIS)|is.na(mdsClinicalData$DATE.LAST.FU)|is.na(mdsClinicalData$OUTCOME)|is.na(mdsClinicalData$AML.PROGRESSION)),] +``` + +Remove variables that are no longer of use in mdsClinicalData +```{r } +rownames(mdsClinicalData)<-mdsClinicalData$PDID +mdsClinicalData<-mdsClinicalData[c("DATE.OF.DIAGNOSIS","AML.PROGRESSION","DATE.AML.PROGRESSION","DATE.LAST.FU","OUTCOME")] +``` + +Change dates to numeric +```{r } +mdsClinicalData[c("DATE.OF.DIAGNOSIS","DATE.LAST.FU","DATE.AML.PROGRESSION")]<-sapply(mdsClinicalData[c("DATE.OF.DIAGNOSIS","DATE.LAST.FU","DATE.AML.PROGRESSION")], function(x) as.numeric(as.Date(x,format="%d/%m/%Y"))) +``` + +Remove some patients with abnormal data. +```{r } +#Remove patient whose last follow-up time is the same as the date of diagnosis (excludes one patient). +imputedData<- imputedData[mdsClinicalData$DATE.OF.DIAGNOSIS!=mdsClinicalData$DATE.LAST.FU,] +mdsClinicalData<- mdsClinicalData[mdsClinicalData$DATE.OF.DIAGNOSIS!=mdsClinicalData$DATE.LAST.FU,] + +#Remove patients who progressed to AML but have no date of AML progression (excludes 4). 
+imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&is.na(mdsClinicalData$DATE.AML.PROGRESSION)),] +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&is.na(mdsClinicalData$DATE.AML.PROGRESSION)),] + +#Remove patients whose date of AML progression is equal to the date of death (excludes 2). +imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$OUTCOME==1&mdsClinicalData$DATE.AML.PROGRESSION==mdsClinicalData$DATE.LAST.FU),] +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$OUTCOME==1&mdsClinicalData$DATE.AML.PROGRESSION==mdsClinicalData$DATE.LAST.FU),] + +#Remove patients who died before they progressed (excludes 12). +imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$DATE.AML.PROGRESSION>mdsClinicalData$DATE.LAST.FU),] +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$DATE.AML.PROGRESSION>mdsClinicalData$DATE.LAST.FU),] + +``` + +Convert all variables to "numeric". +```{r } +imputedData<-as.data.frame(lapply(imputedData,function(x) as.numeric(x))) +imputedDataNonCentered<-imputedData +``` + +Center non-categorical variables to facilitate interpretation of the baseline hazard. 
+ +```{r } +imputedData[,c("age_log","hb","anc_log","plt_log","bm_blasts_logit","ring_sideroblasts_logit")]<-scale(imputedData[,c("age_log","hb","anc_log","plt_log","bm_blasts_logit","ring_sideroblasts_logit")],center = T,scale = F) + +# imputedData<-scale(imputedData,center = T,scale = F) + +``` + +To avoid confusion later on, when variables are expanded +```{r} + +names(imputedData)<-sub("center.","center",names(imputedData),fixed = T) +names(imputedDataNonCentered)<-sub("center.","center",names(imputedDataNonCentered),fixed = T) +``` + +Group variable names +```{r } +gene_vars<-names(imputedData)[1:43] +cytogenetic_vars<-names(imputedData)[44:55] +clinical_vars<-names(imputedData)[56:75] +nuisance_vars<-names(imputedData)[76:81] + +mutation_vars<-names(imputedData)[1:55] +all_clinical_vars<-names(imputedData)[56:81] + +``` + +Remove variables for which there is no variation in the data set. + +```{r} +imputedData<-imputedData[,which(apply(imputedData, 2, function(x) length(unique(x)))>0)] + +``` + + +Converting the data set to 'long format' +```{r} +mstate_data<-data.frame() + +for(i in 1:nrow(mdsClinicalData)){ + id<-rep(i,2) + from<-c(1,1) + to<-c(2,3) + trans<-c(1,2) + Tstart<-c(0,0) + if(mdsClinicalData$AML.PROGRESSION[i]==1){ + Tstop<-rep(mdsClinicalData$DATE.AML.PROGRESSION[i]-mdsClinicalData$DATE.OF.DIAGNOSIS[i],2) + time<-Tstop-Tstart + status<-c(1,0) + mstate_data<-rbind(mstate_data,data.frame(id=id,from=from,to=to, trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + if(mdsClinicalData$DATE.LAST.FU[i]>mdsClinicalData$DATE.AML.PROGRESSION[i]){ + id<-i + from<-2 + to<-4 + trans<-3 + Tstart<-Tstop[1] + Tstop<-mdsClinicalData$DATE.LAST.FU[i]-mdsClinicalData$DATE.OF.DIAGNOSIS[i] + time<-Tstop-Tstart + status<-mdsClinicalData$OUTCOME[i] + mstate_data<-rbind(mstate_data,data.frame(id=id,from=from,to=to,trans=trans, + Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + } + next + }else{ + 
Tstop<-rep(mdsClinicalData$DATE.LAST.FU[i]-mdsClinicalData$DATE.OF.DIAGNOSIS[i],2) + time<-Tstop-Tstart + status<-c(0,mdsClinicalData$OUTCOME[i]) + mstate_data<-rbind(mstate_data,data.frame(id=id,from=from,to=to, + trans=trans,Tstart=Tstart,Tstop=Tstop,time=time,status=status)) + } +} + +#check that no rows have NA's +mstate_data[apply(mstate_data,1,function(x) sum(is.na(x))>0),] + +mstate_data<-cbind(mstate_data,imputedData[mstate_data$id,]) +mstate_data$strata<-mstate_data$trans + + +``` + +For each transition separately, exclude variables with little variance. +```{r } + +percentage_of_ones_fun<-function(x){ + sum(x)/length(x) +} +vars_to_exclude_2<-vector("list",3) +for(i in 1:3){ + dummy_dataset<-mstate_data[mstate_data$trans==i,!names(mstate_data)%in%c("id","from","to","trans","Tstart","Tstop","time","status","strata","type")] + which_have_variance<-apply(dummy_dataset, 2, function(x) var(x)>0) + vars_to_exclude_2[[i]]<-names(dummy_dataset)[!which_have_variance] + dummy_dataset<-dummy_dataset[which_have_variance] + non_categorical_vars<-c("age_log","hb","anc_log","plt_log","bm_blasts_logit","ring_sideroblasts_logit","ipss","date") + percentage_of_ones<-apply(dummy_dataset[!names(dummy_dataset)%in%non_categorical_vars], 2, percentage_of_ones_fun) +which_less_than_five_percent<-which(percentage_of_ones<0.05) +vars_to_exclude_2[[i]]<-c(vars_to_exclude_2[[i]],names(percentage_of_ones)[which_less_than_five_percent]) +} + + +#variables to exclude for transition 1 are the same as for transition 2 +vars_to_exclude_2[[1]]==vars_to_exclude_2[[2]] + +#variables to exclude for transition 3 are a subset of those for transition 1 and 2 +vars_to_exclude_2[[3]]%in%vars_to_exclude_2[[2]] + +#use vars_to_exclude_2[[1]] as the variables to exclude in all transitions + +mstate_data<-mstate_data[!names(mstate_data)%in%vars_to_exclude_2[[1]]] +``` + + +#### Model estimation + +Argument 'Z' of CoxRFX for a model assuming that the +impact of each covariate is the same for all 
transitions +(one coefficient per covariate). This block of code is +not necessary, we keep it here for the sake of following +the explanation in the main text of the paper. +```{r} +# Z<-mstate_data[!names(mstate_data)%in%c("id","from","to", +# "Tstart","Tstop","time","status")] +``` + +Model: all covariates for transitions 1 and 2, none for transition 3 +```{r} +#Sort out class and attributes of 'mstate_data' +tmat<-transMat(x=list(c(2,3),c(4),c(),c()),names=c("MDS","AML","death","death_after_AML")) +class(mstate_data)<-c("data.frame","msdata") +attr(mstate_data,"trans")<-tmat + +# expand covariates by transition: +outcome_covs <- c("id", "from", "to", "trans", + "Tstart", "Tstop", + "time", "status", + "strata" +) +covariates_expanded_123 <- mstate::expand.covs( + mstate_data, + covs = names(mstate_data)[ + !names(mstate_data) %in% outcome_covs + ], + append = FALSE +) + +# remove all covariates for transition 3 from 'covariates_expanded_123' +# to fit a fully non-parametric model on this transition: +covariates_expanded_12 <- covariates_expanded_123[ + ! grepl(".3", names(covariates_expanded_123), fixed = TRUE) +] + +#argument 'Z' of coxrfx +Z_12<-data.frame(covariates_expanded_12,strata=mstate_data$trans,trans=mstate_data$trans) + +#argument 'groups' +groups_12<-paste0(rep("group",ncol(Z_12)-2),c("_1","_2")) + +#argument 'surv' +surv<-survival::Surv(mstate_data$time,mstate_data$status) + +#fit random effects model +model_12<-CoxRFX(Z=Z_12,surv=surv,groups=groups_12) + +# cumulative hazards and transition probabilities for patient 1 +# Build 'patient_data' data frame with the covariate values for which +# cumulative hazards are to be computed (covariate values of patient 78) +patient_data <- mstate_data[ + mstate_data$id == 78, + , + drop = FALSE][rep(1, 3), ] + +patient_data$strata <- patient_data$trans <- 1:3 + +patient_data <- mstate::expand.covs( + patient_data, + covs = names(patient_data)[ + ! 
names(patient_data) %in% outcome_covs + ], + append = TRUE +) + +patient_data <- patient_data[!grepl(".3", names(patient_data), fixed = TRUE)] + + +msfit_object<-msfit_generic(model_12,patient_data,tmat) +probtrans_object<-probtrans_ebmstate("MDS",msfit_object,"clockreset",max_time = 4000) +save(model_12,msfit_object,probtrans_object,file = + "../data/fit_objects.Rdata") + +#interval estimates +names(groups_12) <- names(covariates_expanded_12) + +# 'mstate_data_expanded' argument +# (similar to 'covariates_expanded_12' +# but including outcome variables) +mstate_data_expanded <- cbind( + mstate_data[names(mstate_data) %in% outcome_covs], + covariates_expanded_12 +) + +# create the non-parametric bootstrap confidence intervals +boot_ebmstate_object <- boot_ebmstate( + mstate_data = mstate_data_expanded, + which_group = groups_12, + min_nr_samples = 100, + patient_data = patient_data, + tmat = tmat, + initial_state = "MDS", + time_model = "clockreset", +# input_file = "../data/boot_ebmstate_backup.Rdata", + coxrfx_args = list(max.iter = 200), + probtrans_args = list(max_time = 4000) +) + +save(boot_ebmstate_object,file="../data/boot_object.Rdata") + +``` + +```{r echo=FALSE,eval=FALSE} +load("../data/fit_objects.Rdata") +load("../data/boot_object.Rdata") + +``` + +#### Plots of estimates + +Functions to generate plots of relative hazards +```{r } +labels_fun<-function(n){ + result_pos<-vector("numeric",0) + result_neg<-vector("numeric",0) + for(i in 1:n){ + result_pos<-c(result_pos,c(rep(NA,8),10^i)) + } + for(i in 1:n){ + result_neg<-c(result_neg,c(rep(NA,8),10^-i)) + } + as.character(c(rev(result_neg),1,result_pos)) +} + + +coefs_plot_fun<-function(k,coxrfx_object,coefficients_CIs,mar=NULL){ + #keep only covariate names from transition k + string_split<-strsplit(names(coxrfx_object$coefficients),"[.]") + is_name_from_trans_k<-sapply(string_split,function(x) x[length(x)]==as.character(k)) + CI_labels<-names(coxrfx_object$coefficients)[is_name_from_trans_k] + #get 
rid of suffix ".k" + CI_labels_split<-strsplit(CI_labels,"[.]") + CI_labels<-sapply(CI_labels_split,function(x) paste0(x[-length(x)],collapse = ".")) + + #simplify covariate names + for(i in c("_simple_factor","ring_","_logit","_log")){ + CI_labels<-gsub(i,"",CI_labels) + } + + for(i in c("age","anc","plt")){ + CI_labels<-gsub(i,paste0("log_",i),CI_labels) + } + + for(i in c("bm_blasts","sideroblasts")){ + CI_labels<-gsub(i,paste0("logit_",i),CI_labels) + } + + #log-scale on the x-axis + max_dist_x_axis<-max(abs(coefficients_CIs[c(1,2),seq(k,ncol(coefficients_CIs),3)])) + x_axis_positive_ticks<-log(c(seq(1,9,1),seq(10,90,10),seq(100,900,100),seq(1000,9000,1000),seq(10000,100000,10000))) + x_axis_negative_ticks<-log(c(seq(0.9,0.2,-10^(-1)),seq(0.1,0.02,-10^(-2)),seq(0.01,0.002,-10^(-3)),seq(0.001,0.0002,-10^(-4)),seq(0.0001,0.00001,-10^(-5)))) + x_axis_ticks<-c(rev(x_axis_negative_ticks),x_axis_positive_ticks) + x_axis_labels<-labels_fun(5) + + par(bty="o", mgp=c(2,1.5,0)) + old_mar<-par()$mar + if(!is.null(mar)) par(mar=mar) + plot(1, type="n",ylab="", xlab="",yaxt="n",xaxt="n", + xaxs="i",yaxs="i", + xlim=c(log(0.04),log(20)), + ylim=c(1-0.6, length(CI_labels)+0.6),cex=2) + plotCI(add=T,y=(length(CI_labels)):1, x=coxrfx_object$coefficients[is_name_from_trans_k],ui=coefficients_CIs[2,is_name_from_trans_k],li=coefficients_CIs[1,is_name_from_trans_k],ylab="",xaxt="n",cex=1,err = "x",pch=16) + axis(side = 1, cex=1,at = x_axis_ticks,cex.axis=1.5,labels = labels_fun(5),lwd.ticks = 3,tck=-0.02) + axis(side = 1, cex=2,at =log(c(10^-4,10^-3,10^-2,10^-1,1,10,10^2,10^3,10^4)),cex.axis=3,labels =F,lwd.ticks = 3,tck=-0.03) + text(labels = CI_labels,x=log(0.04)-0.3,y=length(CI_labels):1,xpd=NA,font=2,cex = 1,adj=1) + abline(v=x_axis_ticks,lty=2,col="#999999") + abline(v=0,lty=2,col=2) + + par(mar=old_mar) +} +``` + +Generate plots +```{r} + +rfx_object<-model_12 +coefficients_CIs<-boot_ebmstate_object$coefficients_CIs +file_name<-"../coef_plots.png" +trans_with_covs<-c(1,2) + 
+png(file_name,width=1080,height=1.2*1080) +colGroups <- c(brewer.pal(12, "Paired")[c(10)],brewer.pal(12, "Paired")[c(6,4,3,5,12,9,1,2,7)],"#999999", brewer.pal(12, "Paired")[c(8)]) +colGroups <- colGroups[rep(1:6,3)] +par(mfrow=c(1,2)) +for(k in trans_with_covs){ + coefs_plot_fun(k,rfx_object,coefficients_CIs = coefficients_CIs,mar =c(4.1,9,4.1,0.4)) +} +dev.off() + +``` + +Plots of cumulative hazards with CIs for patient 1 +```{r} + +cumhaz_object<-msfit_object +boot_object<-boot_ebmstate_object +file_name<-"../patient78_cumhaz.png" + +png(file_name,width =680,height = 280) +par(mfrow=c(1,3),mar=c(2,2,2,2)) +for(transition in sort(unique(mstate_data_expanded$trans))){ + cumhaz<-cumhaz_object$Haz[cumhaz_object$Haz$trans==transition,] + plot( + cumhaz$time[ + sapply( + seq(from=0,to=4000,length.out = 400), + function(x) which.min(abs(cumhaz$time-x)) + ) + ], + cumhaz[ + sapply( + seq(from=0,to=4000,length.out = 400),function(x) which.min(abs(cumhaz$time-x)) + ), + "Haz"], + pch=".", + ylab = "cumulative hazard", + xlab = "days since diagnosis", + font.main=1, + type="l" + ) + lines(x=colnames(boot_object$cumhaz_CIs[[transition]]),y=boot_object$cumhaz_CIs[[transition]][1,],lwd=1.6,lty=2,col=2) + lines(x=colnames(boot_object$cumhaz_CIs[[transition]]),y=boot_object$cumhaz_CIs[[transition]][2,],lwd=1.6,lty=2,col=2) +} + +dev.off() + +``` + +Plots of state occupation probabilities with CIs for patient 1 +```{r} + +pt_object<-probtrans_object +boot_object<-boot_ebmstate_object +file_name<-"../patient78_transProbs.png" + +png(file_name) +par(mfrow=c(2,2),mar=c(2,2,2,2)) + for(target_state in colnames(tmat)){ + if(target_state=="AML"){ + ylim_max<-0.5 + }else{ + ylim_max<-1 + } + if(target_state=="death"){ + target_state_title<-"death_before_AML" + }else{ + target_state_title<-target_state + } + plot(pt_object[[1]]$time,pt_object[[1]][,target_state],ylim = c(0,ylim_max),pch=".",ylab = "probability",xlab = "days since diagnosis",main = target_state_title,font.main=1) + 
lines(x=seq(from=0,to=4000,length.out =formals(probtrans_ebmstate)$nr_steps),y=boot_object$probtrans_CIs[[target_state]][1,],lwd=1.6,lty=2,col=2) + lines(x=seq(from=0,to=4000,length.out = formals(probtrans_ebmstate)$nr_steps),y=boot_object$probtrans_CIs[[target_state]][2,],lwd=1.6,lty=2,col=2) + } + mtext("95% bootstrap confidence intervals",outer = T,cex = 1.3,font=2,line = 1) + dev.off() + +``` + + +#### MDS data summary table + +The following code block build a data frame with summary statistics (to be exported to excel and converted to pdf). + +Build non-expanded covariate data set; only covariates used in the final analysis. +```{r} + +covariate_names<-names(mstate_data_expanded)[sapply(names(mstate_data_expanded),function(x) tail(unlist(strsplit(x,split="[.]")),1))=="1"] + +covariate_names<-gsub(".1","",covariate_names,fixed = T) +covariate_names[covariate_names=="center"]<-"center.1" + +covariate_data<-imputedDataNonCentered[names(imputedDataNonCentered)%in%covariate_names] + +``` + +Undo log and logit transformations +```{r} +undo_log<-function(x){ + if(grepl("logit",names(covariate_data)[x])==T){ + return(exp(covariate_data[x])/(exp(covariate_data[x])+1)) + }else if(grepl("log",names(covariate_data)[x])==T){ + return(exp(covariate_data[x])) + }else{ + return(covariate_data[x]) + } +} + +covariate_data<-data.frame(sapply(1:length(covariate_data),undo_log)) +covariate_data$date<-365.25*5*covariate_data$date+4122 + +``` + +Compute summary statistics +```{r} +summary_stat_fun<-function(statistic){ + apply(covariate_data,2,statistic) +} +covs_table<-round(data.frame(sapply(c(min,max,mean,sd),summary_stat_fun)),2) + +``` + +Changing names and other formating. 
+```{r} +rownames(covs_table)<-gsub("_simple_factor","",rownames(covs_table)) +rownames(covs_table)<-gsub("_logit","",rownames(covs_table),fixed = T) +rownames(covs_table)<-gsub("_log","",rownames(covs_table),fixed = T) + +colnames(covs_table)<-c("min","max","mean","std dev") +covs_table<-as.data.frame(t(covs_table)) +covs_table[,"date"]<-as.Date(unlist(covs_table[,"date"]),origin = "1970-01-01") +write.csv(covs_table,"./tables/numeric_covs_table.csv") + +``` + diff --git a/_articles/RJ-2024-002/scripts/ESM_2.html b/_articles/RJ-2024-002/scripts/ESM_2.html new file mode 100644 index 0000000000..abd835921d --- /dev/null +++ b/_articles/RJ-2024-002/scripts/ESM_2.html @@ -0,0 +1,920 @@ + + + + + + + + + + + + + + + +MDS data analysis + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + +


    +
    +

    +
    +

    Preamble

    +
    set.seed(973009)
    +library(ebmstate)
    +library(RColorBrewer)
    +library(plotrix)
    +library(caret)
    +
    +
    +

    Pre-processing the data

    +

    The pre-processing steps produce a data frame called ‘imputedData’, with the covariate data after imputation has been carried out, and a data frame called ‘mdsClinicalData’, with the disease progression data and unprocessed covariate data.

    +
    mdsClinicalData <- read.table("../data/mds.paper.clin.txt", header=T, sep="\t", fill=T, na.strings = c("NA","na")) ## Import clinical data
    +mdsClinicalData <- mdsClinicalData [!duplicated(mdsClinicalData$PDID),] ## remove duplicated observations
    +mdsClinicalData[mdsClinicalData==""] = NA #replace empty string with NAs
    +mdsClinicalData = mdsClinicalData[mdsClinicalData$X0.no.mut.1.seq.2.removedbyqc.3.failed <2,] 
    +mdsClinicalData$IPSS.norm = factor(tolower(as.character(mdsClinicalData$IPSS.norm)), levels=c("low", "int-1", "int-2", "high")) # removes factor level "Low", keeping factor level "low"
    +
    mdsGeneticData <- read.table("../data/MDS.TPD.20Nov2012.csv", sep=",", header=T, fill=T, quote = "\"") ## Genotypes
    +mdsGeneticData$Gene<-factor(mdsGeneticData$Gene)
    +levels(mdsGeneticData$Gene)[levels(mdsGeneticData$Gene)=="SFRS2"] = "SRSF2" #SFRS2 is alternative label for gene SRSF2
    +levels(mdsGeneticData$Gene)[levels(mdsGeneticData$Gene)=="ENSG00000091592"] = "NLRP1" #change level name
    +mdsGeneticData$Decision<-factor(mdsGeneticData$Decision)
    +

    Create a matrix whose i,j entry corresponds to the patient i and gene j (there are no duplicated patients or genes). +The entry in this matrix is 3 if patient i has at least one oncogenic mutation in gene j. It is 2, if the patient has at least one possibly oncogenic mutation in this gene. Or 1 if there’s at least one mutation of unknown oncogenic status in the gene.

    +
    IDs <- mdsClinicalData$PDID
    +allGenotypes <- matrix(0,nrow = length(IDs), ncol = length(levels(mdsGeneticData$Gene)))
    +rownames(allGenotypes) <- IDs
    +colnames(allGenotypes) <- levels(mdsGeneticData$Gene)
    +allGenotypes <- allGenotypes[rownames(allGenotypes)!="", colnames(allGenotypes)!=""]
    +for(i in seq_along(mdsGeneticData$Gene)){
    +    if(mdsGeneticData$SAMPLE.NAME[i] %in% IDs)
    +        allGenotypes[as.character(mdsGeneticData$SAMPLE.NAME[i]), as.character(mdsGeneticData$Gene[i])] <- max(c(3,2,1)[as.numeric(mdsGeneticData$Decision[i])], allGenotypes[as.character(mdsGeneticData$SAMPLE.NAME[i]), as.character(mdsGeneticData$Gene[i])])
    +}
    +

    Restrict to matching PDIDs, mutated genes

    +
    genotypes <- allGenotypes[,colSums(allGenotypes)>0]
    +

    Create 5 indicator (binary) variables (one for each center).

    +
    centers <- sapply(unique(1:5), function(i) mdsClinicalData$center==i) + 0
    +colnames(centers) <- paste("center",1:5, sep="")
    +

    Create object cytoMerged merging some cytogenetic variables from mdsClinicalData. +CytoMerged includes all observations on variables with prefix “CYTO_” in mdsClinicalData. +If observation i on variable “CYTO_X” is missing, observation i on variable “SEQ_X” is used (in case “SEQ_X”[i] is not also missing).

    +
    cyto = mdsClinicalData[,grepl("CYTO_",colnames(mdsClinicalData))]
    +colnames(cyto) = c( "chr3" ,   "del5q" ,"del7_7q" ,"tri8"   , "del11" , "del12", "alt17q"  , "tri19"   ,"del20q" ,"delY", "other" , "complex")
    +ascat =  mdsClinicalData[,grepl("SEQ_",colnames(mdsClinicalData))]
    +colnames(ascat) = c("tri8"  , "del5"  ,"del7_7q" , "del11q" ,"del12p", "alt17q"  , "tri19" , "del20q","other")
    +
    +cytoMerged = cyto
    +for(c in colnames(cyto))
    +    if(c %in% colnames(ascat))
    +        cytoMerged[,c][is.na(cytoMerged[,c])] = ascat[,c][is.na(cytoMerged[,c])]
    +

    Simplified WHO types

    +
    #indicator variables for simplified who classes
    +whoSimple = data.frame(
    +        ra = mdsClinicalData$WHO.category %in% c("RA","RT"),
    +        rars = mdsClinicalData$WHO.category == "RARS",
    +        rars_t = mdsClinicalData$WHO.category == "RARS-T",
    +        rcmd = mdsClinicalData$WHO.category == "RCMD",
    +        rcmd_rs = mdsClinicalData$WHO.category == "RCMD-RS",
    +        raeb = mdsClinicalData$WHO.category %in% c("RAEB", "RAEB 1", "RAEB 2"), 
    +        d5q = mdsClinicalData$WHO.category == "5q-",
    +        cmml =  mdsClinicalData$WHO.category == "CMML",
    +        mds_mpn = mdsClinicalData$WHO.category == "MDSMPN",
    +        mds_u = mdsClinicalData$WHO.category =="MDS-U",
    +        mds_aml = mdsClinicalData$WHO.category ==  "AML-MDS"
    +) + 0
    +
    +# factor vector for simplified WHO classes
    +whoSimpleFactor = factor(rowSums(whoSimple * rep(1:ncol(whoSimple), each=nrow(whoSimple))), labels = c("RA","RARS","RARS-T","RCMD","RCMD-RS","RAEB","5q-","CMML","MDS-MPN","MDS-U","MDS-AML"))
    +

    Combine into single data.frame (only covariates)

    +
    d <- genotypes >= 3 #only oncogenic mut
    +d <- d[,colSums(d) >0] # only genes mutated at least once
    +rawData <- data.frame(d,
    +        cytoMerged,
    +        age_log = log(as.numeric(as.character(mdsClinicalData$AGE))),
    +        sex = mdsClinicalData$Gender,
    +        pb_cytopenia = as.numeric(mdsClinicalData$PB.CYTOPENIA),
    +        hb = as.numeric(mdsClinicalData$HB),
    +        anc_log = log(as.numeric(as.character(mdsClinicalData$ANC))+1e-3),
    +        plt_log = log(as.numeric(mdsClinicalData$PLT)),
    +        bm_blasts_logit = car::logit(as.numeric(as.character(mdsClinicalData$X..BM.BLASTS))),
    +        ring_sideroblasts_logit = car::logit(as.numeric(as.character(mdsClinicalData$X..RING.SIDEROBLASTS))),
    +        ipss = as.numeric(mdsClinicalData$IPSS.norm),
    +        who_simple_factor = ebmstate:::MakeInteger(whoSimpleFactor), #essentially the same as 'whoSimple' above
    +        center = ebmstate:::MakeInteger(as.factor(mdsClinicalData$center)), #essentially the same as 'centers' above
    +        date = (as.numeric(as.Date(mdsClinicalData$DATE.OF.DIAGNOSIS, format="%d/%m/%Y"))-4122)/(365.25*5)#date is the time since the oldest diagnosis in units of 5 years
    +)
    +

    Correct covariate classes

    +
    logical_covs<-c('ASXL1','ATRX','BCOR','BRAF','CBL','CDKN2A','CEBPA','CREBBP','CTNNA1','CUX1','DNMT3A','EP300','ETV6','EZH2','FLT3','GATA2','GNAS','IDH1','IDH2','IRF1','JAK2','KDM6A','KIT','KRAS','MLL2','MPL','NF1','NPM1','NRAS','PHF6','PTEN','PTPN11','RAD21','RUNX1','SF3B1','SH2B3','SRSF2','STAG2','TET2','TP53','U2AF1','WT1','ZRSR2','chr3','del5q','del7_7q','tri8','del11','del12','alt17q','tri19','del20q','delY','other','complex','sex','pb_cytopenia','who_simple_factor.RA','who_simple_factor.RARS','who_simple_factor.RARS.T','who_simple_factor.RCMD','who_simple_factor.RCMD.RS','who_simple_factor.RAEB','who_simple_factor.5q.','who_simple_factor.CMML','who_simple_factor.MDS.MPN','who_simple_factor.MDS.U','who_simple_factor.MDS.AML','center.1','center.2','center.3','center.4','center.5')
    +numeric_covs<-c('age_log','hb','anc_log','plt_log','bm_blasts_logit','ring_sideroblasts_logit','date')
    +factor_covs<-c('ipss')
    +covariate_classes<-list(logical_covs=logical_covs,numeric_covs=numeric_covs,factor_covs=factor_covs)
    +
    +for (i in names(rawData)){
    +      class_to_assign<-c("logical","numeric","factor")[sapply(covariate_classes,function(x) i%in%x)]
    +      if(class_to_assign!="factor"){
    +        class(rawData[[i]])<-class_to_assign
    +      }else{
    +        rawData[[i]]<-as.factor(rawData[[i]])
    +      }
    +}
    +

    Imputation of missing values by covariate-wise hot deck imputation.

    +
    poorMansImpute <- function(x) {x[is.na(x)] <- sample(x[!is.na(x)],sum(is.na(x)),replace = T); return(x)}
    +imputedData <- as.data.frame(sapply(rawData,poorMansImpute))
    +

    Include only patients who have a date of diagnosis, a last follow-up date, and indicator variables for death and AML progression (153 patients are excluded).

    +
    imputedData<-imputedData[!(is.na(mdsClinicalData$DATE.OF.DIAGNOSIS)|is.na(mdsClinicalData$DATE.LAST.FU)|is.na(mdsClinicalData$OUTCOME)|is.na(mdsClinicalData$AML.PROGRESSION)),]
    +mdsClinicalData<-mdsClinicalData[!(is.na(mdsClinicalData$DATE.OF.DIAGNOSIS)|is.na(mdsClinicalData$DATE.LAST.FU)|is.na(mdsClinicalData$OUTCOME)|is.na(mdsClinicalData$AML.PROGRESSION)),]
    +

    Remove variables that are no longer of use in mdsClinicalData

    +
    rownames(mdsClinicalData)<-mdsClinicalData$PDID
    +mdsClinicalData<-mdsClinicalData[c("DATE.OF.DIAGNOSIS","AML.PROGRESSION","DATE.AML.PROGRESSION","DATE.LAST.FU","OUTCOME")]
    +

    Change dates to numeric

    +
    mdsClinicalData[c("DATE.OF.DIAGNOSIS","DATE.LAST.FU","DATE.AML.PROGRESSION")]<-sapply(mdsClinicalData[c("DATE.OF.DIAGNOSIS","DATE.LAST.FU","DATE.AML.PROGRESSION")], function(x) as.numeric(as.Date(x,format="%d/%m/%Y")))
    +

    Remove some patients with abnormal data.

    +
    #Remove patient whose last follow-up time is the same as the date of diagnosis (excludes one patient).
    +imputedData<- imputedData[mdsClinicalData$DATE.OF.DIAGNOSIS!=mdsClinicalData$DATE.LAST.FU,]
    +mdsClinicalData<- mdsClinicalData[mdsClinicalData$DATE.OF.DIAGNOSIS!=mdsClinicalData$DATE.LAST.FU,]
    +
    +#Remove patients who progressed to AML but have no date of AML progression (excludes 4).
    +imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&is.na(mdsClinicalData$DATE.AML.PROGRESSION)),]
    +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&is.na(mdsClinicalData$DATE.AML.PROGRESSION)),]
    +
    +#Remove patients whose date of AML progression is equal to the date of death (excludes 2).
    +imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$OUTCOME==1&mdsClinicalData$DATE.AML.PROGRESSION==mdsClinicalData$DATE.LAST.FU),]
    +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$OUTCOME==1&mdsClinicalData$DATE.AML.PROGRESSION==mdsClinicalData$DATE.LAST.FU),]
    +
    +#Remove patients who died before they progressed (excludes 12).
    +imputedData<-imputedData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$DATE.AML.PROGRESSION>mdsClinicalData$DATE.LAST.FU),]
    +mdsClinicalData<-mdsClinicalData[!(mdsClinicalData$AML.PROGRESSION==1&mdsClinicalData$DATE.AML.PROGRESSION>mdsClinicalData$DATE.LAST.FU),]
    +

    Convert all variables to “numeric”.

    +
    imputedData<-as.data.frame(lapply(imputedData,function(x) as.numeric(x)))
    +imputedDataNonCentered<-imputedData
    +

    Center non-categorical variables to facilitate interpretation of the baseline hazard.

    +
    imputedData[,c("age_log","hb","anc_log","plt_log","bm_blasts_logit","ring_sideroblasts_logit")]<-scale(imputedData[,c("age_log","hb","anc_log","plt_log","bm_blasts_logit","ring_sideroblasts_logit")],center = T,scale = F)
    +
    +# imputedData<-scale(imputedData,center = T,scale = F)
    +

    To avoid confusion later on, when variables are expanded

    +
    names(imputedData)<-sub("center.","center",names(imputedData),fixed = T)
    +names(imputedDataNonCentered)<-sub("center.","center",names(imputedDataNonCentered),fixed = T)
    +

    Group variable names

    +
    gene_vars<-names(imputedData)[1:43]
    +cytogenetic_vars<-names(imputedData)[44:55]
    +clinical_vars<-names(imputedData)[56:75]
    +nuisance_vars<-names(imputedData)[76:81]
    +
    +mutation_vars<-names(imputedData)[1:55]
    +all_clinical_vars<-names(imputedData)[56:81]
    +

    Remove variables for which there is no variation in the data set.

    +
    imputedData<-imputedData[,which(apply(imputedData, 2, function(x) length(unique(x)))>0)]
    +

    Converting the data set to ‘long format’

    +
    # Convert the clinical data to 'long format': one row per possible
    # transition per patient.  Transitions: 1 = MDS -> AML, 2 = MDS -> death,
    # 3 = AML -> death_after_AML.
    mstate_data <- data.frame()

    for (i in seq_len(nrow(mdsClinicalData))) {
      id <- rep(i, 2)
      from <- c(1, 1)
      to <- c(2, 3)
      trans <- c(1, 2)
      Tstart <- c(0, 0)
      if (mdsClinicalData$AML.PROGRESSION[i] == 1) {
        # Patient progressed to AML: transitions 1 and 2 are at risk until the
        # progression date; transition 1 is the observed event.
        Tstop <- rep(mdsClinicalData$DATE.AML.PROGRESSION[i] -
                       mdsClinicalData$DATE.OF.DIAGNOSIS[i], 2)
        time <- Tstop - Tstart
        status <- c(1, 0)
        mstate_data <- rbind(mstate_data,
                             data.frame(id = id, from = from, to = to,
                                        trans = trans, Tstart = Tstart,
                                        Tstop = Tstop, time = time,
                                        status = status))
        # After progression the patient is at risk of transition 3
        # (AML -> death_after_AML), provided follow-up continued past
        # the progression date.
        if (mdsClinicalData$DATE.LAST.FU[i] >
              mdsClinicalData$DATE.AML.PROGRESSION[i]) {
          id <- i
          from <- 2
          to <- 4
          trans <- 3
          Tstart <- Tstop[1]
          Tstop <- mdsClinicalData$DATE.LAST.FU[i] -
            mdsClinicalData$DATE.OF.DIAGNOSIS[i]
          time <- Tstop - Tstart
          status <- mdsClinicalData$OUTCOME[i]
          mstate_data <- rbind(mstate_data,
                               data.frame(id = id, from = from, to = to,
                                          trans = trans, Tstart = Tstart,
                                          Tstop = Tstop, time = time,
                                          status = status))
        }
        next
      } else {
        # No AML progression: censor transition 1 and let transition 2 carry
        # the death/censoring outcome observed at last follow-up.
        Tstop <- rep(mdsClinicalData$DATE.LAST.FU[i] -
                       mdsClinicalData$DATE.OF.DIAGNOSIS[i], 2)
        time <- Tstop - Tstart
        status <- c(0, mdsClinicalData$OUTCOME[i])
        mstate_data <- rbind(mstate_data,
                             data.frame(id = id, from = from, to = to,
                                        trans = trans, Tstart = Tstart,
                                        Tstop = Tstop, time = time,
                                        status = status))
      }
    }

    # Sanity check: print any rows containing NA's (expected to be none).
    mstate_data[apply(mstate_data, 1, function(x) sum(is.na(x)) > 0), ]

    # Attach the (centered) covariates and add one stratum per transition.
    mstate_data <- cbind(mstate_data, imputedData[mstate_data$id, ])
    mstate_data$strata <- mstate_data$trans
    +

    For each transition separately, exclude variables with little variance.

    +
    # Fraction of 1's in a 0/1 vector (sum(x)/length(x), i.e. mean()).
    percentage_of_ones_fun <- function(x) {
      mean(x)
    }
    # For each transition, record the covariates to exclude: those with zero
    # variance within the transition, and binary covariates with fewer than
    # 5% ones.
    vars_to_exclude_2 <- vector("list", 3)
    outcome_cols <- c("id", "from", "to", "trans", "Tstart", "Tstop",
                      "time", "status", "strata", "type")
    non_categorical_vars <- c("age_log", "hb", "anc_log", "plt_log",
                              "bm_blasts_logit", "ring_sideroblasts_logit",
                              "ipss", "date")
    for (i in 1:3) {
      dummy_dataset <- mstate_data[mstate_data$trans == i,
                                   !names(mstate_data) %in% outcome_cols]
      which_have_variance <- apply(dummy_dataset, 2, function(x) var(x) > 0)
      vars_to_exclude_2[[i]] <- names(dummy_dataset)[!which_have_variance]
      dummy_dataset <- dummy_dataset[which_have_variance]
      # Rarity filter applied to the categorical (0/1) covariates only.
      percentage_of_ones <- apply(
        dummy_dataset[!names(dummy_dataset) %in% non_categorical_vars],
        2, percentage_of_ones_fun)
      which_less_than_five_percent <- which(percentage_of_ones < 0.05)
      vars_to_exclude_2[[i]] <- c(
        vars_to_exclude_2[[i]],
        names(percentage_of_ones)[which_less_than_five_percent])
    }
    +
    +
    # The exclusion sets for transitions 1 and 2 coincide, and the set for
    # transition 3 is a subset of theirs, so a single common set can be used.
    vars_to_exclude_2[[1]] == vars_to_exclude_2[[2]]
    vars_to_exclude_2[[3]] %in% vars_to_exclude_2[[2]]

    # Use vars_to_exclude_2[[1]] as the variables to exclude in all transitions.
    mstate_data <- mstate_data[!names(mstate_data) %in% vars_to_exclude_2[[1]]]
    +
    +
    +

    Model estimation

    +

    Argument ‘Z’ of CoxRFX for a model assuming that the impact of each covariate is the same for all transitions (one coefficient per covariate). This block of code is not necessary; we keep it here for the sake of following the explanation in the main text of the paper.

    +
    # Argument 'Z' of CoxRFX for a model with one coefficient per covariate
    # shared across all transitions.  Kept commented out; shown only to follow
    # the explanation in the main text of the paper.
    # Z <- mstate_data[!names(mstate_data) %in% c("id", "from", "to",
    #   "Tstart", "Tstop", "time", "status")]
    +

    Model: all covariates for transitions 1 and 2, none for transition 3

    +
    # Sort out class and attributes of 'mstate_data' so that
    # mstate::expand.covs() recognises it as multi-state data.
    tmat <- transMat(x = list(c(2, 3), c(4), c(), c()),
                     names = c("MDS", "AML", "death", "death_after_AML"))
    class(mstate_data) <- c("data.frame", "msdata")
    attr(mstate_data, "trans") <- tmat

    # Expand covariates by transition (one column per covariate/transition
    # pair); outcome variables are left out of the expansion.
    outcome_covs <- c("id", "from", "to", "trans",
                      "Tstart", "Tstop",
                      "time", "status",
                      "strata")
    covariates_expanded_123 <- mstate::expand.covs(
      mstate_data,
      covs = names(mstate_data)[!names(mstate_data) %in% outcome_covs],
      append = FALSE
    )

    # Remove all covariates for transition 3 from 'covariates_expanded_123'
    # to fit a fully non-parametric model on that transition.
    covariates_expanded_12 <- covariates_expanded_123[
      !grepl(".3", names(covariates_expanded_123), fixed = TRUE)
    ]

    # Argument 'Z' of coxrfx: expanded covariates plus stratum/transition.
    Z_12 <- data.frame(covariates_expanded_12,
                       strata = mstate_data$trans,
                       trans = mstate_data$trans)

    # Argument 'groups': one Gaussian prior group per transition.
    groups_12 <- paste0(rep("group", ncol(Z_12) - 2), c("_1", "_2"))

    # Argument 'surv': clock-reset time scale.
    surv <- survival::Surv(mstate_data$time, mstate_data$status)

    # Fit the empirical Bayes (random effects) Cox model.
    model_12 <- CoxRFX(Z = Z_12, surv = surv, groups = groups_12)
    +
    # Cumulative hazards and transition probabilities for patient 78.
    # Build 'patient_data' with that patient's covariate values, replicated
    # once per transition.
    patient_data <- mstate_data[
      mstate_data$id == 78,
      ,
      drop = FALSE][rep(1, 3), ]

    patient_data$strata <- patient_data$trans <- 1:3

    patient_data <- mstate::expand.covs(
      patient_data,
      covs = names(patient_data)[!names(patient_data) %in% outcome_covs],
      append = TRUE
    )

    # Drop transition-3 covariates, matching the fitted model.
    patient_data <- patient_data[!grepl(".3", names(patient_data),
                                        fixed = TRUE)]

    # Cumulative hazards and state occupation probabilities (clock-reset).
    msfit_object <- msfit_generic(model_12, patient_data, tmat)
    probtrans_object <- probtrans_ebmstate("MDS", msfit_object, "clockreset",
                                           max_time = 4000)
    save(model_12, msfit_object, probtrans_object,
         file = "../data/fit_objects.Rdata")
    +
    # Interval estimates via non-parametric bootstrap.
    names(groups_12) <- names(covariates_expanded_12)

    # 'mstate_data_expanded' argument: expanded covariates together with the
    # outcome variables.
    mstate_data_expanded <- cbind(
      mstate_data[names(mstate_data) %in% outcome_covs],
      covariates_expanded_12
    )

    # Create the non-parametric bootstrap confidence intervals.
    boot_ebmstate_object <- boot_ebmstate(
      mstate_data = mstate_data_expanded,
      which_group = groups_12,
      min_nr_samples = 100,
      patient_data = patient_data,
      tmat = tmat,
      initial_state = "MDS",
      time_model = "clockreset",
      # input_file = "../data/boot_ebmstate_backup.Rdata",
      coxrfx_args = list(max.iter = 200),
      probtrans_args = list(max_time = 4000)
    )

    save(boot_ebmstate_object, file = "../data/boot_object.Rdata")
    +
    +
    +

    Plots of estimates

    +

    Functions to generate plots of relative hazards

    +
    # Build axis labels for a log10 axis spanning 10^-n .. 10^n, with eight
    # blank (NA) minor-tick slots between consecutive decade labels.
    # Returns a character vector: "10^-n", ..., "0.1", "1", "10", ..., "10^n",
    # padded with NA's (the original grew vectors with c() inside loops).
    labels_fun <- function(n) {
      decade_slot <- function(v) c(rep(NA_real_, 8), v)
      result_pos <- unlist(lapply(10^seq_len(n), decade_slot))
      result_neg <- unlist(lapply(10^-seq_len(n), decade_slot))
      as.character(c(rev(result_neg), 1, result_pos))
    }
    +
    +
    # Plot point estimates and bootstrap CIs of the relative hazards (on the
    # log scale) for transition k.
    #   k                : transition number (suffix of the expanded covariates)
    #   coxrfx_object    : fitted CoxRFX model
    #   coefficients_CIs : matrix with lower/upper CI bounds in rows 1 and 2
    #   mar              : optional 'par' margins, restored before returning
    coefs_plot_fun <- function(k, coxrfx_object, coefficients_CIs, mar = NULL) {
      # Keep only covariate names belonging to transition k.
      string_split <- strsplit(names(coxrfx_object$coefficients), "[.]")
      is_name_from_trans_k <- sapply(
        string_split, function(x) x[length(x)] == as.character(k))
      CI_labels <- names(coxrfx_object$coefficients)[is_name_from_trans_k]
      # Strip the ".k" suffix.
      CI_labels_split <- strsplit(CI_labels, "[.]")
      CI_labels <- sapply(CI_labels_split,
                          function(x) paste0(x[-length(x)], collapse = "."))

      # Simplify covariate names for display.
      for (i in c("_simple_factor", "ring_", "_logit", "_log")) {
        CI_labels <- gsub(i, "", CI_labels)
      }
      for (i in c("age", "anc", "plt")) {
        CI_labels <- gsub(i, paste0("log_", i), CI_labels)
      }
      for (i in c("bm_blasts", "sideroblasts")) {
        CI_labels <- gsub(i, paste0("logit_", i), CI_labels)
      }

      # Tick positions for a log-scale x-axis (minor ticks within each decade).
      x_axis_positive_ticks <- log(c(seq(1, 9, 1), seq(10, 90, 10),
                                     seq(100, 900, 100),
                                     seq(1000, 9000, 1000),
                                     seq(10000, 100000, 10000)))
      x_axis_negative_ticks <- log(c(seq(0.9, 0.2, -10^(-1)),
                                     seq(0.1, 0.02, -10^(-2)),
                                     seq(0.01, 0.002, -10^(-3)),
                                     seq(0.001, 0.0002, -10^(-4)),
                                     seq(0.0001, 0.00001, -10^(-5))))
      x_axis_ticks <- c(rev(x_axis_negative_ticks), x_axis_positive_ticks)

      par(bty = "o", mgp = c(2, 1.5, 0))
      old_mar <- par()$mar
      if (!is.null(mar)) par(mar = mar)
      # Empty canvas; labels and axes are drawn manually below.
      plot(1, type = "n", ylab = "", xlab = "", yaxt = "n", xaxt = "n",
           xaxs = "i", yaxs = "i",
           xlim = c(log(0.04), log(20)),
           ylim = c(1 - 0.6, length(CI_labels) + 0.6), cex = 2)
      # Point estimates with horizontal error bars (bootstrap CIs).
      plotCI(add = TRUE, y = (length(CI_labels)):1,
             x = coxrfx_object$coefficients[is_name_from_trans_k],
             ui = coefficients_CIs[2, is_name_from_trans_k],
             li = coefficients_CIs[1, is_name_from_trans_k],
             ylab = "", xaxt = "n", cex = 1, err = "x", pch = 16)
      axis(side = 1, cex = 1, at = x_axis_ticks, cex.axis = 1.5,
           labels = labels_fun(5), lwd.ticks = 3, tck = -0.02)
      axis(side = 1, cex = 2,
           at = log(c(10^-4, 10^-3, 10^-2, 10^-1, 1, 10, 10^2, 10^3, 10^4)),
           cex.axis = 3, labels = FALSE, lwd.ticks = 3, tck = -0.03)
      text(labels = CI_labels, x = log(0.04) - 0.3,
           y = length(CI_labels):1, xpd = NA, font = 2, cex = 1, adj = 1)
      abline(v = x_axis_ticks, lty = 2, col = "#999999")
      abline(v = 0, lty = 2, col = 2)

      par(mar = old_mar)
    }
    +

    Generate plots

    +
    # Generate the relative-hazard plots for the transitions with covariates.
    rfx_object <- model_12
    coefficients_CIs <- boot_ebmstate_object$coefficients_CIs
    file_name <- "../coef_plots.png"
    trans_with_covs <- c(1, 2)

    png(file_name, width = 1080, height = 1.2 * 1080)
    # NOTE(review): 'colGroups' is built here but not passed to
    # coefs_plot_fun() -- presumably a leftover from an earlier version.
    colGroups <- c(brewer.pal(12, "Paired")[c(10)],
                   brewer.pal(12, "Paired")[c(6, 4, 3, 5, 12, 9, 1, 2, 7)],
                   "#999999",
                   brewer.pal(12, "Paired")[c(8)])
    colGroups <- colGroups[rep(1:6, 3)]
    par(mfrow = c(1, 2))
    for (k in trans_with_covs) {
      coefs_plot_fun(k, rfx_object, coefficients_CIs = coefficients_CIs,
                     mar = c(4.1, 9, 4.1, 0.4))
    }
    dev.off()
    +

    Plots of cumulative hazards with CIs for patient 78

    +
    # Plots of cumulative hazards with bootstrap CIs for patient 78.
    cumhaz_object <- msfit_object
    boot_object <- boot_ebmstate_object
    file_name <- "../patient78_cumhaz.png"

    png(file_name, width = 680, height = 280)
    par(mfrow = c(1, 3), mar = c(2, 2, 2, 2))
    for (transition in sort(unique(mstate_data_expanded$trans))) {
      cumhaz <- cumhaz_object$Haz[cumhaz_object$Haz$trans == transition, ]
      # Thin the curve to ~400 points evenly spaced over [0, 4000]
      # (the original computed this index vector twice).
      grid_idx <- sapply(seq(from = 0, to = 4000, length.out = 400),
                         function(x) which.min(abs(cumhaz$time - x)))
      plot(cumhaz$time[grid_idx],
           cumhaz[grid_idx, "Haz"],
           pch = ".",
           ylab = "cumulative hazard",
           xlab = "days since diagnosis",
           font.main = 1,
           type = "l")
      # NOTE(review): column names of the CI matrix are used as x-coordinates;
      # this assumes they are numeric-like strings -- confirm upstream.
      lines(x = colnames(boot_object$cumhaz_CIs[[transition]]),
            y = boot_object$cumhaz_CIs[[transition]][1, ],
            lwd = 1.6, lty = 2, col = 2)
      lines(x = colnames(boot_object$cumhaz_CIs[[transition]]),
            y = boot_object$cumhaz_CIs[[transition]][2, ],
            lwd = 1.6, lty = 2, col = 2)
    }

    dev.off()
    +

    Plots of state occupation probabilities with CIs for patient 78

    +
    # Plots of state occupation probabilities with bootstrap CIs for patient 78.
    pt_object <- probtrans_object
    boot_object <- boot_ebmstate_object
    file_name <- "../patient78_transProbs.png"

    png(file_name)
    par(mfrow = c(2, 2), mar = c(2, 2, 2, 2))
    for (target_state in colnames(tmat)) {
      # AML occupation probabilities are small; use a tighter y-axis.
      ylim_max <- if (target_state == "AML") 0.5 else 1
      # Relabel "death" to clarify it means death before AML progression.
      target_state_title <- if (target_state == "death") {
        "death_before_AML"
      } else {
        target_state
      }
      plot(pt_object[[1]]$time, pt_object[[1]][, target_state],
           ylim = c(0, ylim_max), pch = ".",
           ylab = "probability", xlab = "days since diagnosis",
           main = target_state_title, font.main = 1)
      # CI curves are evaluated on the default probtrans_ebmstate time grid.
      ci_times <- seq(from = 0, to = 4000,
                      length.out = formals(probtrans_ebmstate)$nr_steps)
      lines(x = ci_times,
            y = boot_object$probtrans_CIs[[target_state]][1, ],
            lwd = 1.6, lty = 2, col = 2)
      lines(x = ci_times,
            y = boot_object$probtrans_CIs[[target_state]][2, ],
            lwd = 1.6, lty = 2, col = 2)
    }
    mtext("95% bootstrap confidence intervals", outer = TRUE,
          cex = 1.3, font = 2, line = 1)
    dev.off()
    +
    +
    +

    MDS data summary table

    +

    The following code block builds a data frame with summary statistics (to be exported to Excel and converted to PDF).

    +

    Build non-expanded covariate data set; only covariates used in the final analysis.

    +
    # Build a non-expanded covariate data set containing only the covariates
    # used in the final analysis (i.e. those that have a ".1" expanded column).
    covariate_names <- names(mstate_data_expanded)[
      sapply(names(mstate_data_expanded),
             function(x) tail(unlist(strsplit(x, split = "[.]")), 1)) == "1"
    ]

    # Strip the ".1" suffix; "center" had its dot removed earlier, so
    # restore that one name.
    covariate_names <- gsub(".1", "", covariate_names, fixed = TRUE)
    covariate_names[covariate_names == "center"] <- "center.1"

    covariate_data <- imputedDataNonCentered[
      names(imputedDataNonCentered) %in% covariate_names
    ]
    +

    Undo log and logit transformations

    +
    # Undo the log / logit transformation applied to column x of
    # 'covariate_data' (identified by its name suffix), returning the column
    # on its original scale.  "logit" is checked first because those names
    # also contain "log".
    undo_log <- function(x) {
      column <- covariate_data[x]
      column_name <- names(covariate_data)[x]
      if (grepl("logit", column_name)) {
        exp(column) / (exp(column) + 1)
      } else if (grepl("log", column_name)) {
        exp(column)
      } else {
        column
      }
    }

    covariate_data <- data.frame(sapply(seq_along(covariate_data), undo_log))
    # Invert the date rescaling back to days since 1970-01-01
    # (assumes date was stored as (days - 4122) / (365.25 * 5) -- confirm).
    covariate_data$date <- 365.25 * 5 * covariate_data$date + 4122
    +

    Compute summary statistics

    +
    # Apply a summary statistic column-wise to the covariate data.
    summary_stat_fun <- function(statistic) {
      apply(covariate_data, 2, statistic)
    }
    # Rows: covariates; columns: min, max, mean, sd (renamed below).
    covs_table <- round(
      data.frame(sapply(c(min, max, mean, sd), summary_stat_fun)), 2)
    +

    Changing names and other formatting.

    +
    # Tidy up the covariate names, label the statistics, and export the table.
    rownames(covs_table) <- gsub("_simple_factor", "", rownames(covs_table))
    rownames(covs_table) <- gsub("_logit", "", rownames(covs_table),
                                 fixed = TRUE)
    rownames(covs_table) <- gsub("_log", "", rownames(covs_table),
                                 fixed = TRUE)

    colnames(covs_table) <- c("min", "max", "mean", "std dev")
    covs_table <- as.data.frame(t(covs_table))
    # Show the 'date' summaries as calendar dates rather than day counts.
    covs_table[, "date"] <- as.Date(unlist(covs_table[, "date"]),
                                    origin = "1970-01-01")
    write.csv(covs_table, "./tables/numeric_covs_table.csv")
    +
    +
    +
    +
    1. European Bioinformatics Institute (EMBL-EBI)↩︎

    2. Genome Biology Unit, EMBL↩︎

    3. German Cancer Research Center (DKFZ)↩︎
    +
    + + + + +
    + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-002/scripts/ESM_3.R b/_articles/RJ-2024-002/scripts/ESM_3.R new file mode 100644 index 0000000000..e5e05f158b --- /dev/null +++ b/_articles/RJ-2024-002/scripts/ESM_3.R @@ -0,0 +1,243 @@ +## SURVIVAL ANALYSIS WORKFLOW + +# (R script for section 5 of the paper Costa, R J and Gerstung, M, +# 'ebmstate: an R package for disease progression analysis +# under empirical Bayes Cox models.' + +library(ebmstate) +data(mstate_data) + +## SECTION 5.1 + +# Table 1 +print(mstate_data[mstate_data$id %in% c(77, 78), ]) + + +## SECTION 5.2 + +# Argument 'Z' of CoxRFX for a model assuming that the +# impact of each covariate is the same for all transitions +# (one coefficient per covariate). +Z <- mstate_data[ + !names(mstate_data) %in% c( + "id", + "from", + "to", + "Tstart", + "Tstop", + "time", + "status" + ) +] + +# Create transition matrix with mstate::transMat() +tmat <- mstate::transMat( + x = list(c(2, 3), c(4), c(), c()), + names = c("MDS", "AML", "death", "death_after_AML") +) + +# To expand covariates by transition using mstate::expand.covs, +# first set the class of 'mstate_data' as +class(mstate_data) <- c("data.frame", "msdata") + +# then add the transition matrix as attribute: +attr(mstate_data, "trans") <- tmat + +# expand covariates by transition: +outcome_covs <- c("id", "from", "to", "trans", + "Tstart", "Tstop", + "time", "status", + "strata" +) +covariates_expanded_123 <- mstate::expand.covs( + mstate_data, + covs = names(mstate_data)[ + !names(mstate_data) %in% outcome_covs + ], + append = FALSE +) + +# Columns `id ' and ` trans ' from ` mstate_data ' together with the first +# two expanded covariates (patients 77 and 78): +print( + cbind( + mstate_data, + covariates_expanded_123 + )[ + mstate_data$id %in% c(77, 78), + c("id", "trans", names(covariates_expanded_123)) + ] +) + + +# remove all covariates for transition 3 from 'covariates_expanded_123' +# to fit a fully non-parametric model on this 
transition: +covariates_expanded_12 <- covariates_expanded_123[ + ! grepl(".3", names(covariates_expanded_123), fixed = TRUE) +] + +# argument 'Z' of coxrfx +Z_12 <- data.frame( + covariates_expanded_12, + strata = mstate_data$trans, + trans = mstate_data$trans +) + +# argument 'surv' for a clock-forward model +surv <- survival::Surv( + mstate_data$Tstart, + mstate_data$Tstop, + mstate_data$status +) + +# argument 'surv' for a clock-reset model +surv <- survival::Surv( + mstate_data$time, + mstate_data$status +) + +# argument 'groups' of coxrfx +groups_12 <- paste0(rep("group", ncol(Z_12) - 2), c("_1", "_2")) + + +# fit random effects model +model_12 <- CoxRFX(Z = Z_12, surv = surv, groups = groups_12) + +## SECTION 5.3 + +# Build 'patient_data' data frame with the covariate values for which +# cumulative hazards are to be computed (covariate values of patient 78) +patient_data <- mstate_data[ + mstate_data$id == 78, + , + drop = FALSE][rep(1, 3), ] + +patient_data$strata <- patient_data$trans <- 1:3 + +patient_data <- mstate::expand.covs( + patient_data, + covs = names(patient_data)[ + ! names(patient_data) %in% outcome_covs + ], + append = TRUE +) + +patient_data <- patient_data[!grepl(".3", names(patient_data), fixed = TRUE)] + +# The 'patient_data' data frame has only 3 rows ( one for each transition ). 
+# The output below shows its 'id' and 'trans' columns +# and expanded covariates ASXL1 and DNMT3A: +print( + patient_data[ + , + names(patient_data) %in% c( + "id", + "trans", + "ASXL1.1", + "ASXL1.2", + "DNMT3A.1", + "DNMT3A.2" + ) + ] +) + +# compute cumulative hazards +msfit_object <- msfit_generic(model_12, patient_data, tmat) + + +## SECTION 5.4 + +# compute state occupation probabilities for patient 78: +probtrans_object <- probtrans_ebmstate( + "MDS", + msfit_object, + "clockreset", + max_time = 4000 +) + +# generate plot of state occupation probabilities: +plot(probtrans_object) + + +# compute state occupation probabilities for patient 78 using +# the fast Fourier transform: +probtrans_object <- probtrans_fft( + "MDS", + msfit_object, + max_time = 4000 +) + +# remake plot of state occupation probabilities: +plot(probtrans_object) + + +## INTERVAL ESTIMATES AND LEAVE-ONE-OUT PREDICTIONS + +# Creating the object arguments for boot_ebmstate() + +# 'groups' argument was already created, but we +# need to add names to it +names(groups_12) <- names(covariates_expanded_12) + +# 'mstate_data_expanded' argument +# (similar to 'covariates_expanded_12' +# but including outcome variables) +mstate_data_expanded <- cbind( + mstate_data[names(mstate_data) %in% outcome_covs], + covariates_expanded_12 +) + +# create the non-parametric bootstrap confidence intervals +boot_ebmstate_object <- boot_ebmstate( + mstate_data = mstate_data_expanded, + which_group = groups_12, + min_nr_samples = 100, + patient_data = patient_data, + tmat = tmat, + initial_state = "MDS", + time_model = "clockreset", + input_file = NULL, + coxrfx_args = list(max.iter = 200), + probtrans_args = list(max_time = 4000) +) + + +# #leave-one-out outcome predictions +# patient_IDs <- sample(unique(mstate_data$id), 14 * 14) +# loo_ebmstate_object <- loo_ebmstate( +# mstate_data, +# mstate_data_expanded, +# which_group = groups_12, +# patient_IDs = patient_IDs, +# initial_state = "MDS", +# tmat = tmat, +# 
input_file = NULL, +# time_model = "clockreset", +# coxrfx_args = list(max.iter = 200), +# probtrans_args = list(max_time = 4000) +# ) + +## SECTION 5.5 + +# All transitions semi-parametric + +# arguments 'groups' and 'Z' for fitting a Cox regression model on all transitions +Z_123 <- data.frame( + covariates_expanded_123, + strata = mstate_data$trans, + trans = mstate_data$trans +) +groups_123 <- paste0(rep("group", ncol(Z_123) - 2), c("_1", "_2", "_3")) + +# Fit a Cox regression model for all transitions +model_123 <- CoxRFX(Z = Z_123, surv = surv, groups = groups_123) + +# Concordance statistic for each model +concordance(model_12) +concordance(model_123) + +# BIC +model_12$BIC +model_123$BIC + + diff --git a/_articles/RJ-2024-002/web/RJwrapper.Rmd b/_articles/RJ-2024-002/web/RJwrapper.Rmd new file mode 100644 index 0000000000..8fdc6422d2 --- /dev/null +++ b/_articles/RJ-2024-002/web/RJwrapper.Rmd @@ -0,0 +1,1404 @@ +--- +title: 'ebmstate: An R Package For Disease Progression Analysis Under Empirical Bayes + Cox Models' +abstract: | + The new R package ebmstate is a package for multi-state survival + analysis. It is suitable for high-dimensional data and allows point + and interval estimation of relative transition hazards, cumulative + transition hazards and state occupation probabilities, under + clock-forward and clock-reset Cox models. Our package extends the + package mstate in a threefold manner: it transforms the Cox regression + model into an empirical Bayes model that can handle high-dimensional + data; it introduces an analytical, Fourier transform-based estimator + of state occupation probabilities for clock-reset models that is much + faster than the corresponding, simulation-based estimator in mstate; + and it replaces asymptotic confidence intervals meant for the + low-dimensional setting by non-parametric bootstrap confidence + intervals. 
Our package supports multi-state models of arbitrary + structure, but the estimators of state occupation probabilities are + valid for transition structures without cycles only. Once the input + data is in the required format, estimation is handled automatically. + The present paper includes a tutorial on how to use ebmstate to + estimate transition hazards and state occupation probabilities, as + well as a simulation study showing how it outperforms mstate in + higher-dimensional settings. +author: +- name: Rui J. Costa + affiliation: European Molecular Biology Laboratory + address: + - European Bioinformatics Institute (EMBL-EBI) + - Hinxton, CB10 1SD + - United Kingdom + - | + [ruibarrigana@hotmail.com](ruibarrigana@hotmail.com){.uri} +- name: Moritz Gerstung + affiliation: 'aff. 1: European Molecular Biology Laboratory' + address: + - European Bioinformatics Institute (EMBL-EBI) + - Hinxton, CB10 1SD + - United Kindom + - 'aff. 2: German Cancer Research Center (DKFZ)' + - Im Neuenheimer Feld 280 + - 69120 Heidelberg + - Germany + - | + [moritz.gerstung@dkfz.de](moritz.gerstung@dkfz.de){.uri} +date: '2025-01-10' +date_received: '2022-06-27' +journal: + firstpage: ~ + lastpage: ~ +volume: 16 +issue: 1 +slug: RJ-2022-122 +citation_url: https://rjournal.github.io/ +packages: + cran: + - msm + - SemiMarkov + - survival + - mstate + - mboost + - gamboostMSM + - penMSM + - ebmstate + bioc: [] +preview: preview.png +bibliography: costa-gerstung.bib +CTV: ~ +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + legacy_pdf: yes + mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js + md_extension: -tex_math_single_backslash + +--- + +::: article +## Introduction + +Multi-state models based on transition hazard functions are often used +in the statistical analysis of longitudinal data, in particular disease +progression data [@Hougaard1999]. 
The multi-state model framework is +particularly suitable to accommodate the growing level of detail of +modern clinical data: as long as a clinical history can be framed as a +random process which, at any moment in time, occupies one of a few +states, a multi-state model is applicable. Another strong point of this +framework is that it can incorporate a *regression model*, i.e., a set +of assumptions on how covariates, possibly time-dependent ones, affect +the risk of transitioning between any two states of the disease. Once +estimated, multi-state models with regression features allow the +stratification of patients according to their transition hazards. In +addition, it is possible, under some models, to generate disease outcome +predictions. These come in the form of *state occupation probability* +estimates, meaning estimates of the probability of being in each state +of the disease over a given time frame. + +The survival analysis 'task view' of the Comprehensive R Archive Network +lists seven R packages that are able to fit *general* multi-state models +and, at the same time, feature some kind of regression model or +algorithm: `flexsurv` [@flexsurv_package], +[**msm**](https://CRAN.R-project.org/package=msm) [@Jackson2011], +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) +[@Listwon2015], +[**survival**](https://CRAN.R-project.org/package=survival) +[@survival_package], +[**mstate**](https://CRAN.R-project.org/package=mstate) [@Wreede2010], +[**mboost**](https://CRAN.R-project.org/package=mboost) +[@mboost_package] -- as extended by +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM) +[@gamboostMSM_package] -- and +[**penMSM**](https://CRAN.R-project.org/package=penMSM) +[@penMSM_package]. All of them implement relative risk regression models +[as defined in @Aalen2008 p. 133]. 
The only exceptions are +[**survival**](https://CRAN.R-project.org/package=survival), which also +fits Aalen's additive regression model [@Aalen1989], and `flexsurv`, +which also implements accelerated failure time models . + +Recall that a Cox regression model is a semi-parametric model in which +every transition hazard is assumed to be the product of a baseline +hazard function of unspecified form (the non-parametric component) and +an exponential relative risk function (the parametric component) +[@Aalen2008 p. 133]. Generally, the relative risk regression models +implemented in these packages are Cox regression models. However, some +models in `flexsurv`, as well as those in +[**msm**](https://CRAN.R-project.org/package=msm) and +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov), also +restrict the baseline hazards to specific parametric families, i.e. they +are fully parametric. In +[**msm**](https://CRAN.R-project.org/package=msm) and +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov), the +stronger assumptions regarding the functional form of the hazard are +leveraged to do away with other common assumptions: +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) drops +the usual Markov property to implement homogeneous semi-Markov models; +[**msm**](https://CRAN.R-project.org/package=msm) is suitable for *panel +data*, i.e., data in which the state of each individual is known only at +a finite series of times. + +Packages [**penMSM**](https://CRAN.R-project.org/package=penMSM) and +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM) are +the best suited to deal with higher-dimensional covariate data. The +first of these packages relies on a structured fusion lasso method, +while the second implements (jointly with +[**mboost**](https://CRAN.R-project.org/package=mboost)) a boosting +algorithm. 
Both methods induce sparsity in the number of non-zero +covariate effects, as well as equality among the different transition +effects of each covariate, and are thus especially useful to reduce +complicated multi-state models to more interpretable ones. The remaining +packages assume standard, fixed effects relative risk regression models +and do not include regularisation or variable selection features. + +It is also illustrative to order the seven packages mentioned according +to how extensive their analysis workflow is. Packages +[**SemiMarkov**](https://CRAN.R-project.org/package=SemiMarkov) and +[**penMSM**](https://CRAN.R-project.org/package=penMSM) are intended for +the estimation of relative transition hazards only (i.e., for estimating +the impact of covariates on each transition hazard). With the package +[**mboost**](https://CRAN.R-project.org/package=mboost) (as extended by +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM)) it is +also possible to estimate the baseline transition hazards. Finally, a +more complete workflow including estimates of both relative and +cumulative transition hazards, as well as state occupation +probabilities, is implemented in `flexsurv`, +[**msm**](https://CRAN.R-project.org/package=msm) and +[**mstate**](https://CRAN.R-project.org/package=mstate), and has been +under implementation in +[**survival**](https://CRAN.R-project.org/package=survival) (version 3.0 +or later). + +The present paper provides an introduction to +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), a new R +package for multi-state survival analysis available for download on the +Comprehensive R Archive Network (CRAN). 
The main goal of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is to +provide an analysis framework for the Cox model that performs better +with higher-dimensional covariate data and is also complete, in the +sense of being able to generate point and interval estimates of relative +transition hazards, cumulative transition hazards and state occupation +probabilities, both under clock-forward and clock-reset models. A +fundamental characteristic of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is that it +re-implements and extends the analysis framework of +[**mstate**](https://CRAN.R-project.org/package=mstate), which is +complete in the sense just mentioned. In fact, to a large extent, our +package was built by importing, adapting and replacing functions from +the [**mstate**](https://CRAN.R-project.org/package=mstate) package. +This not only eliminates redundancies, but also makes our package more +accessible to the numerous users of +[**mstate**](https://CRAN.R-project.org/package=mstate) (the three +papers associated with +[**mstate**](https://CRAN.R-project.org/package=mstate) have jointly +over 2000 citations). + +To improve the performance of +[**mstate**](https://CRAN.R-project.org/package=mstate)'s multi-state +Cox model when dealing with higher-dimensional covariate data, a +ridge-type regularisation feature was added. We allow the regression +coefficients of the model to be partitioned into groups, with each group +having its own Gaussian prior. A group can gather, for example, all the +regression coefficients for a given transition. Or, within a given +transition, coefficients can be grouped according to the covariate type +they refer to (for example, demographic, clinical or genomic type). The +resulting hierarchical Bayes model is *empirical* in that a full prior +elicitation is not required (the mean and variance hyper-parameters of +the Gaussian are estimated from the data). 
Model fitting relies on the +iterative algorithm introduced by @Schall1991, which typically converges +after a small number of steps. A simulation study showing that Schall's +algorithm performance compares well with that of other algorithms for +ridge penalty optimisation, including one based on cross-validation, can +be found in @Perperoglou2014. + +The asymptotic confidence intervals generated by +[**mstate**](https://CRAN.R-project.org/package=mstate) are applicable +when the number of observations is much larger than the number of +parameters to be estimated (see section [3.3](#sec:interval_estimation) +below). To preserve the completeness of +[**mstate**](https://CRAN.R-project.org/package=mstate)'s framework in +higher-dimensional settings, we therefore implemented non-parametric +bootstrap intervals of regression coefficients, cumulative transition +hazards and state occupation probabilities. + +The high computational cost implied by the non-parametric bootstrap +motivated a third extension to +[**mstate**](https://CRAN.R-project.org/package=mstate). We developed an +estimator of state occupation probabilities under clock-reset Cox models +that is based on a convolution argument [as in @Spitoni2012] and the +Fast Fourier transform (FFT). At present, the estimation of such +probabilities for clock-forward Cox models can be carried out using the +efficient, product-limit based algorithm available in +[**mstate**](https://CRAN.R-project.org/package=mstate). However, for +clock-reset Cox models, only a simulation-based estimator is available +in this package (see also the `flexsurv` package for a similar, +simulation-based estimator). The FFT estimator in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) was +conceived as a faster alternative to this simulation-based estimator, +but its scope is currently restricted to multi-state models with +transition structures that have no cycles, i.e. 
in which a transition +between two states is either not possible or follows a unique sequence +of states. Figure \@ref(fig:figpackage-summary-figure) provides a short +graphical summary of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), with the +main inputs -- a genomic-clinical data set and an empirical Bayes +multi-state Cox model -- and the main outputs -- the estimates of +relative hazards and state occupation probabilities (cumulative +transition hazards are omitted). + +As already mentioned, our empirical Bayes method improves estimator +performance in models with larger numbers of covariates (see section +[4](#sec:estimator_performance) on estimator performance). Also, as a +ridge-type regression method, it can be used as an alternative to the +lasso method of [**penMSM**](https://CRAN.R-project.org/package=penMSM) +in two particular cases: when the levels of correlation between +covariates are high enough to compromise the stability of lasso-based +covariate selection; or simply to improve prediction accuracy when +interpretability is not essential and the number of covariates is not +greater than the number of observations [@Zou2005]. In addition, and +perhaps more importantly, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) goes beyond +the regularised estimation of transition hazards offered by +[**penMSM**](https://CRAN.R-project.org/package=penMSM) and +[**gamboostMSM**](https://CRAN.R-project.org/package=gamboostMSM): point +and interval estimates of state occupation probabilities under the +regularised Cox model can also be computed. + +## Models + +A multi-state Cox model is a continuous-time stochastic process with a +finite (and usually small) state space $\mathcal{S}$. To better describe +the models implemented in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), we define +the following notation. We let $t$ denote the time since some initiating +event (usually diagnosis or disease onset). 
For +$t \in \left[0, \infty\right)$, we define the following random +variables: $X(t)$ represents the disease state of the patient, $S(t)$ +the time spent in the current state, and $\vec{Z}\left(t\right)$ the +value of a covariate vector. The realisation of each component of the +process $\lbrace\vec{Z}\left(t\right)\rbrace$ is a step function, +possibly approximating the evolution in time of a continuous covariate. +In addition, $\lbrace\vec{Z}\left(t\right)\rbrace$ is assumed +not-adapted to the filtration generated by +$\lbrace X\left(t\right)\rbrace$ (an adapted covariate is one whose path +until $t$ is known once $\lbrace X \left(u\right)\rbrace$, $u \leq t$, +is known). The transition hazard rate of a patient from state $i$ to +state $j$ ($i\neq j$) at time $t$, conditional on the sojourn time and +the covariate vector, is defined as +$$\begin{aligned} + &\alpha_{ij}\left(t|\mathbf{z},s \right):=\lim_{h \downarrow 0}\frac{1}{h}\mathrm{P}\left[X(t+h)=j\,|\,X(t)=i,S(t)=s,\vec{Z}(t)=\mathbf{z} \right]\;, \;s\in \left[0,\infty\right)\;,\;t\in \left[s,\infty\right)\;. +\end{aligned}$$ +Independent right-censoring and left-truncation are assumed throughout +[@Aalen2008 p. 57]. The purpose of the present section is to give a (not +necessarily exhaustive) description of the scope of +[**mstate**](https://CRAN.R-project.org/package=mstate) and +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) with respect +to the multi-state Cox model. 
Using the terminology in @Putter2011, a +Cox model is termed a 'clock-reset' model when +$$\begin{aligned} +\label{eq:clock_reset_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}, s\right)&=\lambda_{ij}^{(0)}\left(s\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad, +\end{aligned} (\#eq:clock-reset-Cox)$$ +and it is termed a 'clock-forward' model when +$$\begin{aligned} +\label{eq:clock_forward_Cox} +\alpha_{ij}\left(t\,|\,\mathbf{z}\right)&=\alpha_{ij}^{(0)}\left(t\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right] \quad. +\end{aligned} (\#eq:clock-forward-Cox)$$ +In both cases, $i,j \in \mathcal{S}$, with $i\neq j$; +$\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is an unknown vector of +regression coefficient parameters, and both +$\lambda^{\scriptscriptstyle (0)}_{ij}(\cdot)$ and +$\alpha^{\scriptscriptstyle (0)}_{ij}(\cdot)$ are unknown (baseline +hazard) functions, non-negative on $\mathbb{R}^{+}$. When, as in +equation \@ref(eq:clock-reset-Cox), +$\alpha_{ij}\left(t|\mathbf{z},s\right)$ is the same for all $t\geq s$, +we simplify its notation to $\lambda_{ij}\left(s|\mathbf{z}\right)$. As +can be seen from equations \@ref(eq:clock-reset-Cox) and +\@ref(eq:clock-forward-Cox), the 'clock-reset' and 'clock-forward' +models are models for how the transition hazard rates are affected by +time. In the former case, the only relevant time scale is the time $s$ +spent in the current state, whereas in the latter only the time $t$ +since the initiating event matters. While the 'clock-forward' model is +arguably the default one in multi-state survival analysis +[@Andersen1993; @Aalen2008], in some cases the 'clock-reset' model is +more appropriate. For example, in some forms of cancer, it can be +sensible to assume that the transition hazards from the state of +complete remission depend on the sojourn time, rather than on the time +since the initial diagnosis. 
+ +### Relative transition hazards {#sec:models_relative_hazards} + +The parametric component of the transition hazard from $i$ to $j$, +written +$\exp\left[\boldsymbol{\beta}^{\intercal}_{ij} \,\mathbf{z}\right]$, is +termed the relative transition hazard. In +[**mstate**](https://CRAN.R-project.org/package=mstate) and +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), estimating +the relative transition hazard amounts to estimating the regression +coefficient vector $\boldsymbol{\beta}_{ij}\,$. In +[**mstate**](https://CRAN.R-project.org/package=mstate), these +parameters are assumed to be non-random. With +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +following prior distributions can be imposed. + +Define $\mathcal{P}$ as the set of all pairs of states between which a +direct transition is possible. Let +$\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ij} \rbrace$, for all +$(i, j) \in \mathcal{P}$, be a partition of $\boldsymbol \beta$, a +vector containing the regression coefficients for all direct transitions +allowed. Each $\boldsymbol{\beta}_{\scriptscriptstyle ij}$ is further +partitioned into +$\lbrace \boldsymbol{\beta}_{\scriptscriptstyle ijk} \rbrace$, for +$k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$. In +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the most +general model regarding the prior distribution of $\boldsymbol{\beta}$ +makes two assumptions: a) the scalar components of $\boldsymbol{\beta}$ +are independent and normally distributed; b) the scalar components of +$\boldsymbol{\beta}_{\scriptscriptstyle i j k}$ have a common (and +undetermined) mean $\mu_{\scriptscriptstyle ijk}$ and a common (and also +undetermined) variance $\sigma^{2}_{\scriptscriptstyle ijk}\;$. + +The purpose of the framework just described is to allow the clustering +of covariate effects according to their prior distribution. 
If there is +no prior knowledge about how this clustering should be done, a single +Gaussian prior can be imposed on all regression coefficients at once. If +prior knowledge allows the grouping of effects according to the +transition they refer to, a different Gaussian prior can be assigned to +the coefficients of each transition. Even within each transition, +different groups of coefficients can be assigned different prior +distributions. In the analysis of biomedical data, for example, there +can be a split between genes which are known to affect the transition +hazard, and other genes whose effect is unknown. + +### Cumulative transition hazard functions + +Our package imports from +[**mstate**](https://CRAN.R-project.org/package=mstate) a Breslow +estimator of two types of cumulative transition hazard: one on a global +time scale, defined as +$$\begin{aligned} +\mathrm{A}_{ij}\left(t\,|\,\mathbf{z}\right)&:=\int_{0}^{t}\alpha_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad, +\end{aligned}$$ +and another on a sojourn time scale, defined as +$$\begin{aligned} +&\Lambda_{ij}(s\,|\,\mathbf{z}):=\int_{0}^{s}\lambda_{ij}^{(0)}\left(u\right)\exp\left[ \boldsymbol{\beta}^{\intercal}_{ij}\,\mathbf{z}\right]\mathrm{d}u\quad. +\end{aligned}$$ +Note that, in either case, the covariate vector is assumed to remain +constant. + +### State occupation probabilities + +By state occupation probability, we mean the probability that a patient +in state $i$ at time $0$ finds herself in state $j$ at time $t$. The +estimates of these probabilities can be seen as functionals of the +estimated cumulative transition hazard functions. For this reason, the +restriction to models with time-fixed covariates, which was just seen to +be applicable to the estimators of cumulative transition hazards, +carries over to the estimation of state occupation probabilities. 
+ +When conditioning on a given covariate path (time-fixed or not), state +occupation probability estimates are not valid unless the covariates are +*external* [@Cortese2010; @Aalen2008 p. 142]. Note that a vector of +covariates $\lbrace \vec{Z}(u)\rbrace_{u\geq 0}$ is said to be +*external* if, for all $t \in \left[0,\infty\right)$, each transition +hazard at $t$, conditional on $\vec{Z}(t)$, is independent of +$\lbrace \vec{Z}(u)\rbrace_{u>t}$ (i.e. independent of the future path +of the covariate). Otherwise, it is said to be *internal* [for more +details on the distinction between internal and external covariates, see +@Kalbfleisch2002 chapter 6]. When one does not wish (or is not possible +due to $\vec{Z}$ being *internal*) to condition on a future covariate +path of the covariate process, the uncertainty introduced by this +process needs to be accounted for. This can be done by extending the +state space of the disease process, so that it includes information on +the disease *and* the covariate process [@Andersen1993 p. 170]. For +example, to include a dichotomous transplant covariate (an internal +covariate) in a simple survival model with two states, the state space +is expanded from $\lbrace$alive, deceased$\rbrace$ to $\lbrace$alive +without transplant, alive with transplant, deceased$\rbrace$. One can +then either assume that transplanted patients have a different baseline +death hazard or, more simply, that transplantation scales the death +hazard by some constant $\exp \left( \gamma\right)$. A similar but more +detailed example can be found in @Wreede2010 [section 2.3.2, 'model 3' +]. + +## Estimation + +In the current section, we present the estimation methods underlying the +extensions of [**mstate**](https://CRAN.R-project.org/package=mstate) +implemented in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate). 
+[]{#sec:estimation label="sec:estimation"} + +### Relative and cumulative hazard functions + +Let $\boldsymbol{\mu}_{\scriptscriptstyle ij}$, with +$\left(i,j\right) \in \mathcal{P}$ (the set of direct transitions +allowed), denote a vector whose scalar components are the parameters +$\mu_{\scriptscriptstyle ijk}$, +$k \in \left\lbrace 1,2,...,n_{\scriptscriptstyle ij} \right\rbrace$. +Similarly, let $\boldsymbol{\sigma}^{2}_{\scriptscriptstyle ij}$ be +composed of the parameters +$\left\lbrace \sigma^{2}_{\scriptscriptstyle ijk}\right\rbrace_{k}$. The +estimation of $\boldsymbol{\beta}$, +$\boldsymbol{\mu}:=\lbrace\boldsymbol{\mu}_{\scriptscriptstyle{ij}}\rbrace$ +and +$\boldsymbol{\sigma}^2:=\lbrace\boldsymbol{\sigma}^2_{\scriptscriptstyle ij }\rbrace$ +relies on the restricted maximum-likelihood (REML) type algorithm +described in [@Perperoglou2014], and introduced by [@Schall1991]. The +resulting estimate of $\boldsymbol{\beta}$ is a maximum *a posteriori* +estimate; the estimates of $\boldsymbol{\mu}$ and +$\boldsymbol{\sigma}^{2}$ are empirical Bayes estimates. In +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +estimator based on this algorithm is implemented in the function +`CoxRFX` . The results of a simulation study showing its consistency are +included in the Supporting Scripts and Data (file ESM_1.html, section +1). + +The computation of cumulative hazard rates for given covariate values +and an estimated regression coefficient vector relies on the function +`msfit_generic`, which is essentially a wrapper for the function +`mstate::msfit` (see section [5.3](#sec:computing_cumulative_hazards)). +For the mathematical details of this computation, we refer therefore the +reader to @Wreede2010. 
+ +### State occupation probabilities {#sec:trans_probs} + +The package [**mstate**](https://CRAN.R-project.org/package=mstate) +includes a simulation-based estimator that can take as input either +$\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ or +$\hat{\Lambda}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ to generate +estimates of state occupation probabilities under the clock-forward or +the clock-reset model respectively. Another available estimator, an +Aalen-Johansen-type estimator based on product integration, is far more +efficient computationally and takes as input +$\hat{\mathrm{A}}_{ij}\left(\cdot\,|\,\mathbf{z}\right)$ only. As the +scope of this estimator has been restricted to clock-forward Cox models +[@Andersen1993; @Aalen2008], in our package we implemented a +convolution-based estimator as a computationally efficient alternative +(for models with a transition structure that has no cycles). + +For convenience, let the sequence of states from $0$ to $n$ have the +labels $0,1,2,...,n\,$, where $0$ is the initial state by definition, +and $n$ is some state that might (eventually) be reached by the process. +In addition, define $X_{0}:=X(0)$ and $T_{0}:=0$, and let +$\left(X_{i},T_{i}\right)$, $i \in \left\lbrace 1,2,... \right\rbrace$, +denote the marked point process associated with +$\left\lbrace X(t)\right\rbrace$, so that $T_{i}$ is the time of the +$i^{th}$ transition and $X_{i}$ is the state the process jumps to at +time $T_{i}$. The inter-transition times are denoted by +$\tau_{ij}:=T_{j}-T_{i}$, for $j>i$. 
We can write the probability that a +patient in state $0$ at time $0$ finds herself in state $n$ at time $t$, +conditional on $\vec{Z}(u)=\mathbf{z}$ for all $u \geq 0$, as +$$\begin{aligned} + &\mathrm{P}\left[X(t)=n\,|\,X(0)=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right]\\ + &\,=\mathrm{P}\left[X_{n}=n,\tau_{0,n} < t,\tau_{n,n+1}\geq t- \tau_{0,n} |X_{0}=0\,, \vec{Z}(u)=\mathbf{z},\,u \geq 0 \right] \,.\nonumber +\end{aligned}$$ + +Recall that $\lambda_{i,i+1}\left(s\,|\, \mathbf{z}\right)$ denotes the +hazard rate of a transition to state $i+1$ at time $s$ since arrival in +state $i$, for a patient that has covariate vector $\mathbf{z}$. The +cumulative hazard for the same transition between sojourn times $0$ and +$s$, if the patient's covariate vector remains constant at $\mathbf{z}$, +is represented by +$\Lambda_{i,i+1}\left(s \,|\, \mathbf{z}\right):=\int_{0}^{s}\lambda_{i,i+1}\left(x\,|\, \mathbf{z}\right)\mathrm{d}x$. +Similarly, we let $\lambda_{i}\left(s\,|\, \mathbf{z}\right)$ represent +the hazard rate of going to any state that can be reached directly from +$i$, at time $s$ since arrival in state $i$, for a patient with +covariate vector $\mathbf{z}$. The cumulative hazard for the same event +between sojourn times $0$ and $s$, if the patient's covariate vector +remains constant at $\mathbf{z}$, is represented by +$\Lambda_{i}\left(s \,|\, \mathbf{z}\right)$. The expressions +$\hat{\Lambda}_{i}\left(s \,|\, \mathbf{z}\right)$ and +$\hat{\Lambda}_{i,i+1}\left(s \,|\, \mathbf{z}\right)$ denote the +Breslow estimators of the cumulative hazards just defined. In what +follows, all references to probabilities, hazard rates and cumulative +hazards are to be understood as conditional on +$\vec{Z}(u)=\mathbf{z}\,$, for $u\geq 0$: this condition is omitted to +simplify the notation. 
+ +In [**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +function `probtrans_ebmstate` generates a set of state occupation +probability estimates at equally spaced time points: +$$\begin{aligned} +&\left\lbrace \hat{p}_{0n}\left(k\right)\right\rbrace_{k} :=\left\lbrace \hat{\mathrm{P}}\left[X_{n}=n,\tau_{0,n} < t_{k},\tau_{n,n+1}\geq t_{k}- \tau_{0,n}\,|\, X_{0}=0 \right] \right\rbrace_{k}\;,\; k=0,1,2,...,K\,;\, t_{k}=k\times \Delta t \;. +\end{aligned}$$ +The number $K$ of time intervals is $10,000$ by default and $t_{K}$ is a +parameter set by the user. Defining the functions +$$\begin{aligned} +q_{ij}\left(k\right):=\mathrm{P}\left[X_{j}=j, \tau_{ij}\in \left[t_{k},t_{k+1}\right)\,|\,X_{i}=i\right] +\end{aligned}$$ +and +$$\begin{aligned} +r_{i}\left(k\right):=\mathrm{P}\left[\tau_{i,i+1} > t_{k} \,|\,X_{i}=i\right]\;, +\end{aligned}$$ +and the finite difference +$$\begin{aligned} + \Delta \hat{\Lambda}_{i,i+1}\left(t_{k}\right):=\hat{\Lambda}_{i,i+1}\left(t_{k+1}\right)-\hat{\Lambda}_{i,i+1}\left(t_{k}\right)\;, +\end{aligned}$$ +the algorithm behind `probtrans_ebmstate` can be described as follows: + +1. For $j=1,2,...,n$, compute + $$\begin{aligned} + \label{eq:est1} + \hat{q}_{j-1,j}\left(k\right)&:=\exp \left[-\hat{\Lambda}_{j-1}\left(t_{k}\right)\right]\Delta \hat{\Lambda}_{j-1,j}\left(t_{k}\right)&& + \end{aligned} (\#eq:est1)$$ + for $k=0,1,...,K-1$. + +2. For $j=2,3,...,n$, compute (iteratively) + $$\begin{aligned} + \label{eq:est2} + \hat{q}_{0j}\left(k\right):=&\sum_{l=0}^{k-1} \hat{q}_{j-1,j}\left(k-l-1\right) \hat{q}_{0,j-1} \left(l\right) && + \end{aligned} (\#eq:est2)$$ + for $k=0,1,...,K-1$. + +3. 
Finally, use the estimates obtained in the last iteration of step 2 + to compute + $$\begin{aligned} + \label{eq:est4} + \hat{p}_{0n}\left(k\right):=&\sum_{l=0}^{k-1} \hat{r}_{n}\left(k-l-1\right) \hat{q}_{0,n}\left(l\right)&& + \end{aligned} (\#eq:est4)$$ + for $k=0,1,...,K$, where + $\hat{r}_{n}\left(\cdot\right):=\exp \left[-\hat{\Lambda}_{n}\left(t_{\scriptscriptstyle\left(\cdot\right)}\right)\right]\,$. + +Substituting $\approx$ for $:=$ and removing the 'hats' in definitions +\@ref(eq:est1) to \@ref(eq:est4), we get the approximate equalities that +justify the algorithm. These approximate equalities are derived in the +Supporting Scripts and Data (file ESM_1.html, section 2). + +Apart from `probtrans_ebmstate`, the function `probtrans_fft` is also +based on the convolution argument just shown. However, this function +makes use of the convolution theorem, i.e., of the fact that the +convolution of two (vectorized) functions in the time domain is +equivalent to a pointwise product of the same functions in the frequency +domain. The estimation of state occupation probabilities is thus +simplified to +$$\begin{aligned} + \hat{p}_{0n}:=&\mathcal{F}^{\scriptscriptstyle -1}\left\lbrace \hat{\mathrm q}_{0,1} \boldsymbol{\cdot} \hat{\mathrm q}_{1,2}\boldsymbol{\cdot} \mathrm{...}\boldsymbol{\cdot}\hat{\mathrm q}_{n-1,n}\boldsymbol \cdot \hat{\mathrm r}_{n}\right\rbrace\;, +\end{aligned}$$ +where $\mathcal{F}$ denotes the discrete Fourier transform, +$\hat{\mathrm{q}}_{j-1,j}:=\mathcal{F}(\hat{q}_{j-1,j})$ and +$\hat{\mathrm{r}}_{n}:=\mathcal{F}(\hat{r}_{n})$. Conversion to and from +the frequency domain is carried out using the fast Fourier transform +algorithm implemented in the `fft` function of the base package `stats`. +The Supporting Scripts and Data contain a short simulation study +checking that state occupation probabilities can be accurately estimated +with `probtrans_ebmstate` and `probtrans_fft` (see file ESM_1.html, +sections 3 and 4). 
+ +Figure \@ref(fig:figmssample) consists of a grid of plots with estimated +curves of state occupation probabilities. It compares, in terms of speed +and accuracy, the estimator in `probtrans_fft` with an estimator in +`mstate::mssample` that has the same target, but is simulation-based. +Each plot contains a black curve and a superimposed red curve. The red +curves in any given column of the grid are all based on the same run of +a function: columns 1 to 3 are based on runs of `mssample` with the +number of samples $n$ equal to $100$, $1000$ and $10,000$ respectively, +while column 4 is based on a run of `probtrans_fft`. Each column in the +grid reproduces the same 4 black curves. These are based on a single run +of `mssample` with $n=100,000$ and serve as benchmark. All function runs +are based on the same input: a set of cumulative transition hazard +estimates for a multi-state model with the 'linear' transition structure +given in the leftmost diagram of figure +\@ref(fig:figtransition-structures). Plots in a given row refer to the +same state of the model. The running times on top of each column refer +to the estimation of red curves. The main conclusion suggested by this +analysis of simulated data is that `probtrans_fft` is as accurate as +`mssample` with $n=10,000$, but it is almost 100 times faster (columns 3 +and 4). With $n=1000$, `mssample` achieves a good approximation to the +true state occupation probabilities, but is still roughly 9 times +slower. The details on how figure \@ref(fig:figmssample) and its +underlying data were generated are given in the Supporting Scripts and +Data (file ESM_1.html, section 5). 
+ +### Interval estimation {#sec:interval_estimation} + +Under any model estimated by +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) -- as in +general under a Bayesian model --, one can, if the sample size is large +enough, approximate the posterior by a normal distribution with mean +equal to the maximum *a posteriori* estimate and covariance matrix equal +to the inverse of the generalised observed Fisher information [see, for +example, @Gelman2014 p. 83-84]. This approximation has first-order +accuracy and is thus outperformed by Laplace's method, which has +second-order accuracy [@Carlin2009 p. 110-111]. However, as @Carlin2009 +[p. 112] observe, "for moderate- to high-dimensional $\boldsymbol\theta$ +(say, bigger than 10), Laplace's method will rarely be of sufficient +accuracy\[\...\]". @Carlin2009 [p. 244-251] also describe three methods +of interval estimation in empirical Bayes settings, but all of them are +designed for fully parametric models. These reasons, along with the fact +that regularised methods such as the one implemented in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) are +typically used to fit models with more than a dozen covariates, led us +to choose the non-parametric bootstrap as the interval estimation method +in [**ebmstate**](https://CRAN.R-project.org/package=ebmstate). Note +that the non-parametric bootstrap can be given a Bayesian +interpretation. Its interval estimates are approximately the same as +those of a Bayesian model that assumes: a) a multinomial distribution +for the data; and b) a non-informative Dirichlet prior distribution for +the probability assigned to each category in the multinomial +distribution. This is a specific case of the so-called Bayesian +bootstrap [@Hastie2009 p. 272]. Further research is needed to determine +the theoretical properties of the non-parametric bootstrap in the +present setting, but this falls beyond the scope of the present paper. 
+Interval estimates of regression coefficients, cumulative hazards and +state occupation probabilities are implemented in the function +`boot_ebmstate`. + +## Estimator performance {#sec:estimator_performance} + +It is a well-documented fact in the statistical literature that standard +least-squares or maximum-likelihood estimators can often be improved by +regularisation or shrinkage [see, for example, @Samworth2012]. This +improvement comes about when the model dimensionality is high enough +that the bias introduced by regularisation is outweighed by the +reduction in the estimator variance. In the current setting, one might +therefore ask: what kind of dimensionality does a semi-parametric, +multi-state Cox model need to have to be outperformed by its empirical +Bayes counterpart? A simulation study we carried out offers a tentative +answer to this question, by comparing estimators under both Cox models +for an increasing number of covariates. The study also features a third +method, based on a fully non-parametric model, as a null model method. +This was included to give an idea of how many covariates the empirical +Bayes model can deal with before it becomes no better than a simple +non-regressive model. 
+ +### Simulation setup + +We assessed the performance of all estimators defined by the tuple +$\left[a,m, G, n,p(n)\right]$, where $a\in \lbrace$regression +coefficients, relative hazards, state occupation probabilities$\rbrace$ +is the target of estimation, $m\in \lbrace$standard Cox, empirical Bayes +Cox, null$\rbrace$ is the assumed hazard model, $G \in \lbrace$linear, +competing risks, 'm' structure$\rbrace$ is the transition structure of +the model (illustrated in figure \@ref(fig:figtransition-structures)) +and $n\in \lbrace 100,1000\rbrace$ is the number of patients/disease +histories in the training data set; the variable $p$ denotes the number +of coefficients/covariates per transition in the true model and its +range depends on $n$: +$p\left(100\right) \in \lbrace 10,40,70,100 \rbrace$ whereas +$p\left(1000\right) \in \lbrace 10,100,200,300,400,500\rbrace$. By +'relative hazards' and 'state occupation probabilities', we mean here +the relative transition hazards of an out-of-sample patient, and her +state occupation probabilities at 7 chosen time points. We generated a +batch of 300 independent absolute error observations ('NA' estimates +included) for each estimator, where each observation is recorded after +training the estimator on a newly simulated data set. Each boxplot in +figures \@ref(fig:figestimator-performance-boxplots-100patients) +($n=100$) and \@ref(fig:figestimator-performance-boxplots-1000patients) +($n=1000$) is based on one of these batches. As all estimators are +*vector* estimators, each absolute error is actually an *average* +absolute error, where the average is taken over the components of the +vector. + +All training data sets were simulated from clock-reset Cox models. Apart +from $G$ (the model transition structure), $n$ and $p$, also the true +baseline hazards are held fixed within each batch of 300 training data +sets. 
The coefficient vectors used in the simulation are always +non-sparse and are scaled by $\sqrt{\frac{10}{p}}$ to keep the +log-hazard variance constant when the dimensionality grows. All +covariates are dichotomous and mutually independent. To compute the +coefficient errors for the non-parametric (null) model method, we think +of it as a degenerate Cox model in which all regression coefficient +estimates are fixed at zero. The estimation of regression coefficients +under the standard Cox and the empirical Bayes Cox models was performed +with `survival::coxph` and `ebmstate::CoxRFX` respectively; the +estimation of state occupation probabilities is based on +`mstate::probtrans` for the null model and on `ebmstate::probtrans_fft` +for both the standard Cox and the empirical Bayes Cox models. + +The reason we did not consider simulation scenarios with more than 500 +covariates per transition, in data sets of 1000 patients, was simply +computational cost. For example, generating the data and error +observations for the scenario with $n=1000$, $p=100$ and $G=$'m' +structure took less than one hour using 20 CPU cores in +parallel; the same scenario but with $p=500$ took 6.5 days using 25 CPU +cores. More details about the simulation setup can be found in the +Supporting Scripts and Data (file ESM_1.html, section 6, subsection +'sample script'). + +### Missing values + +Whenever an estimator was able to compute a valid estimate of its target +for each training data set, i.e., when it did not return any 'NA' +estimates, its boxplots are based on 300 valid error observations. This +was always the case with non-parametric estimators: the estimates of +regression coefficients and relative hazards of this type of estimators +are trivial (fixed at zero and one respectively) and hence it is also +straightforward to compute absolute errors. 
It also happened that +non-parametric estimators of state occupation probabilities had no 'NA' +estimates (see file ESM_1.html, section 6, figure 6.3, in the Supporting +Scripts and Data). The situation was similar for the empirical Bayes Cox +model estimators, which showed no more than 5$\%$ missing estimates in +any of the simulation scenarios studied (ibid., figures 6.1 and 6.2). +However, for the standard Cox model ones, the number of 'NA' estimates +depends to a large extent on the number of patients in the data set, as +well as on the dimensionality and transition structure of the model +(figures \@ref(fig:figna-props-100patients-coxph) and +\@ref(fig:figna-props-1000patients-coxph)). In data sets of 100 +patients, it fares well in models with fewer than 10 covariates per +transition, or in models with up to 40 covariates, if the transition +structure is linear. Otherwise its failure rates range from roughly +25$\%$ to nearly 100$\%$. In data sets of 1000 patients, the proportion +of 'NA' estimates is never above 10$\%$, if the transition structure is +linear, but it can climb above 60$\%$ for other transition structures. + +### Comparison of estimators + +With respect to the performance of the three methods studied, the +boxplots in figures +\@ref(fig:figestimator-performance-boxplots-100patients) and +\@ref(fig:figestimator-performance-boxplots-1000patients) suggest the +following conclusions: + +- As $p/n$ grows, the empirical Bayes estimators quickly outperform + the standard Cox model ones. They already fare substantially better + at $p/n=0.1$ for both $n=100$ and $n=1000$ and for all estimation + targets. At the same time, the relative performance of the empirical + Bayes method with respect to the null model one decreases. At + $p/n=0.5$, the difference between these two methods is already + rather small for all simulation scenarios. 
+ +- The relative performance of the empirical Bayes method with respect + to the null method decreases as the number of co-occurring + transition hazards in the model grows. All other things equal, the + empirical Bayes method has the best performance under the 'linear' + structure model, which has no competing transitions; it performs + less well under the 'm' structure transition model, where two + transition hazards can co-occur; and has the worst relative + performance under the 'competing risks' model, where three + transition hazards co-occur. This trend is clearer for $n=100$ + (figure \@ref(fig:figestimator-performance-boxplots-100patients)) + but can also be detected in the relative hazard errors for $n=1000$ + (figure \@ref(fig:figestimator-performance-boxplots-1000patients)). + In any case, the empirical Bayes method seems to be far more robust + than the standard Cox model against increases in the number of + co-occurring transition hazards. + +- Having as target the regression coefficients or the state occupation + probabilities, instead of relative hazards, makes the empirical + Bayes method better in comparison to the null method. In fact, as + $p/n$ grows, the empirical Bayes method is never outperformed by the + null method except in the estimation of relative hazards. + +## Survival analysis workflow + +The features of `mstate` were illustrated in @Wreede2010 using a simple +workflow. The starting point of this workflow is a data set in 'long +format'. Such a data set can be fed into `survival::coxph` to obtain +estimates of the regression coefficients of a multi-state Cox model. The +resulting model fit object can be passed on to `mstate::msfit`, along +with a vector of covariates of a particular patient, to get personalised +estimates of the cumulative hazard functions. Finally, state occupation +probabilities for the same patient can be estimated if the object +created by `mstate::msfit` is fed into `mstate::probtrans`. 
In this +section, we describe how +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) extends the +scope of this workflow, i.e., how it uses the packages +[**survival**](https://CRAN.R-project.org/package=survival) and +[**mstate**](https://CRAN.R-project.org/package=mstate) to generate +estimates under a multi-state empirical Bayes Cox model. A diagram +summarising the extension is shown in figure \@ref(fig:figworkflow). In +the [5.5](#sec:model_assessment) subsection, we give some +recommendations on how to assess and compare models, but for more +detailed tutorials on how to analyse multi-state data using models +defined by transition hazards, we refer the reader to +@Putter2007tutorial and @Putter2011tutorial. + +The main steps of the +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) workflow are +here illustrated using a data set of patients with myelodysplastic +syndromes (MDS) which has been described and studied in +@Papaemmanuil2013. A myelodysplastic syndrome is a form of leukemia in +which the bone marrow is not able to produce enough mature blood cells, +and which sometimes develops into a cancer of white blood cells with a +quick and aggressive progression, i.e., into acute myeloid leukemia +(AML). Figure \@ref(fig:figtrans-diagrams)a illustrates an illness-death +type model for MDS patients and also gives a breakdown of the number of +transition events. The conversion to a model with a transition structure +that has no cycles (i.e., that can be handled by our convolution-based +estimators) is shown in figure \@ref(fig:figtrans-diagrams)b. The data +set used for model estimation, obtained after a number of pre-processing +steps, contains the disease history of 576 patients, as well as +measurements on 30 covariates. Of these 30 covariates, 11 are mutation +covariates and the remaining are clinical or demographic (see figure +\@ref(fig:figtrans-diagrams)c). 
The running time for the estimation of
+relative transition hazards does not exceed 10 seconds on a standard
+laptop computer. The same holds for the estimation of cumulative
+transition hazards or state occupation probabilities for a given
+patient. The complete R code underlying the data analysis in the current
+section can be found in the Supporting Scripts and Data (file
+ESM_2.html). To run only the R snippets shown below and reproduce
+their results, the best option is to use the R script in file ESM_3.R of
+the Supporting Scripts and Data.
+
+### Input data
+
+Table \@ref(table:long_format_data) shows a fragment of the MDS data
+set. The data is in 'long format', which means that each row refers to a
+period of risk for a given transition and patient. For example, row $i$
+tells us that, at time `Tstart[i]`, patient `id[i]` entered state
+`from[i]`, and thereby began to be at risk for transition `trans[i]`,
+i.e., at risk of going from state `from[i]` to state `to[i]`. If the
+first transition of patient `id[i]` after time `Tstart[i]` occurs before
+the last follow-up time for this patient, `Tstop[i]` records the time of
+this transition (regardless of whether the patient moved to state
+`to[i]` or not). Otherwise, `Tstop[i]` is set to the last follow-up
+time. The value of `status[i]` is set to 1 if and only if the first
+transition of patient `id[i]` after `Tstart[i]` is to state `to[i]` and
+occurs before the last follow-up (otherwise it is set to 0). The value
+of `time[i]` is defined simply as `Tstop[i]`$-$`Tstart[i]`, and
+`strata[i]` is the stratum of the baseline hazard for transition
+`trans[i]` (more about this variable in the following section). For `x`
+$\in \left\lbrace \right.$ `ASXL1`, `DNMT3A`,
+$\dots \left. \right \rbrace$, `x[i]` denotes the level of covariate `x`
+between `Tstart[i]` and `Tstop[i]` in patient `id[i]`. 
(In the MDS data +set, we assume that the relative hazard of a patient is determined by +her covariate vector at $t=0$, i.e., we assume all covariates to be +time-fixed.) If a patient enters a new state, and this state +communicates directly with $n$ other states, then, as long as the +patient actually spends time in the new state (i.e. the time of +transition is not the same as the last follow-up time), $n$ rows must be +added to the data set, with each row corresponding to a different +possible transition. + +From table \@ref(table:long_format_data), we know that patient 77 +entered state 1 ('MDS') at time 0 and remained in this state until time +2029, when she moved to state 3 ('death before AML'). There are no rows +to describe the evolution of patient 77 after entering state 3, as this +state is an absorbing state. As to patient 78, she remained in state 1 +until time 332, and moved from there to state 2 ('AML'). She lived with +AML for 1117 days and moved to state 4 ('death after AML') at time 1449. + +``` r +id from to trans Tstart Tstop time status strata ASXL1 DNMT3A [...] +77 1 2 1 0 2029 2029 0 1 0 0 . +77 1 3 2 0 2029 2029 1 2 0 0 . +78 1 2 1 0 332 332 1 1 1 0 . +78 1 3 2 0 332 332 0 2 1 0 . +78 2 4 3 332 1449 1117 1 3 1 0 . +``` + +### Fitting an empirical Bayes Cox model {#sec:fit_bayes_cox_model} + +Once the data is in 'long format', the estimation of an empirical Bayes +model can be carried out using the function `CoxRFX`. A simple example +of the first argument of `CoxRFX`, denoted '`Z`', is a data frame +gathering the `trans`, `strata` and covariate columns of the data in +long format: + +``` r +outcome_covs <- c("id","from","to","trans","Tstart","Tstop","time","status", + "strata") +Z <- mstate_data[!names(mstate_data) %in% outcome_covs] +#(`mstate_data' has the data in long format) +``` + +The `strata` column determines which baseline hazard functions are +assumed to be equal. 
In table \@ref(table:long_format_data), each
+transition is assumed to have a (potentially) different baseline hazard.
+The model's assumptions regarding how covariates affect the hazard are
+reflected in the format of the covariate columns of `Z`. When the `Z`
+argument is the one created in the previous block of code, `CoxRFX`
+returns a single regression coefficient estimate for each covariate. In
+other words, the impact of any covariate is assumed to be the same for
+every transition.
+
+There are however ways of relaxing this assumption. One can replace the
+`ASXL1` column in `Z` (or any other covariate column) by several
+'type-specific' `ASXL1` columns: the `ASXL1` column specific for type
+$i$ would show the mutation status of `ASXL1` in rows belonging to
+transition of type $i$, and show zero in all other rows. This would
+force `CoxRFX` to estimate a (potentially) different `ASXL1` coefficient
+for each transition type. This process of covariate expansion by type
+can be based on any partition of the set of transitions. When each type
+corresponds to a single transition, we refer to it simply as 'covariate
+expansion by transition'. The output shown below illustrates the effect
+of expanding the covariates in 'mstate_data' by transition.
+
+``` r
+# Columns `id' and `trans' from `mstate_data' together with the first
+# two expanded covariates (patients 77 and 78):
+ id trans ASXL1.1 ASXL1.2 ASXL1.3 DNMT3A.1 DNMT3A.2 DNMT3A.3 [...]
+ 77 1 0 0 0 0 0 0 .
+ 77 2 0 0 0 0 0 0 .
+ 78 1 1 0 0 0 0 0 .
+ 78 2 0 1 0 0 0 0 .
+ 78 3 0 0 1 0 0 0 .
+```
+
+The example code given below shows how to use
+[**mstate**](https://CRAN.R-project.org/package=mstate) to expand
+covariates by transition and how to create a `Z` argument that makes
+`CoxRFX` estimate a regression coefficient for each covariate for
+transitions 1 and 2, and assume a fully non-parametric hazard for
+transition 3. 
+ +``` r +# To expand covariates by transition using mstate::expand.covs, +# first set the class of `mstate_data' as +class(mstate_data) <- c("data.frame","msdata") + +# then add the transition matrix as attribute: +attr(mstate_data,"trans") <- tmat +#(`tmat' is the output of mstate::transMat) + +# Expand covariates by transition: +covariates_expanded_123 <- mstate::expand.covs( + mstate_data, + covs = names(mstate_data)[! names(mstate_data) %in% outcome_covs], + append = F +) + +# remove all covariates for transition 3 from `covariates_expanded_123' +# to fit a fully non-parametric model on this transition: +covariates_expanded_12 <- covariates_expanded_123[ + !grepl(".3",names(covariates_expanded_123),fixed = T) +] + +#argument `Z' of coxrfx +Z_12 <- data.frame(covariates_expanded_12,strata = mstate_data$trans, + trans = mstate_data$trans) +``` + +The second argument of `CoxRFX` ('`surv`') is a survival object that can +easily be built by feeding the outcome variable columns of the data to +the function `Surv` (from the package +[**survival**](https://CRAN.R-project.org/package=survival)). Whether +`CoxRFX` fits a clock-forward model or a clock-reset model depends on +the kind of survival object: + +``` r +#argument `surv' for a clock-forward model +surv <- Surv(mstate_data$Tstart,mstate_data$Tstop,mstate_data$status) + +#argument `surv' for a clock-reset model +surv <- Surv(mstate_data$time,mstate_data$status) +``` + +The argument `groups` of `CoxRFX` is a vector whose length equals the +number of covariates in the data. In other words, the length of `groups` +is `ncol(Z)-2`, since the argument `Z` must include both the covariate +data and the `strata` and `trans` columns. If, for $i \neq j$, +`groups[i]`=`groups[j]` $=\text{`foo'}$, this means that the regression +coefficients of the $i^{th}$ and $j^{th}$ covariates of `Z` both belong +to a group named 'foo' of coefficients with the same prior. 
For the `Z`
+object built above, the `groups` argument created in the following block
+of code embodies the assumption that all coefficients associated with a
+given transition have the same prior distribution. The final line of
+code fits the empirical Bayes model.
+
+``` r
+#argument `groups' of coxrfx
+groups_12 <- paste0(rep("group",ncol(Z_12)-2),c("_1","_2"))
+
+#fit random effects model
+model_12 <- CoxRFX(Z_12,surv,groups_12,tmat)
+```
+
+Figure \@ref(fig:figcoef-plots) shows regression coefficient point
+estimates for a clock-reset, empirical Bayes model fitted with the code
+above. Also shown are 95% non-parametric bootstrap confidence intervals
+computed using `ebmstate::boot_ebmstate`. The $x$-axis scale is
+logarithmic to allow estimates to be read as relative hazards more
+easily. For example, a mutation in *RUNX1* is associated with a twofold
+increase in the hazard of progression from MDS to AML, and treatment
+centre 4 is associated with a 3-fold increase in the hazard of dying
+before progressing to AML, when compared to the baseline value of
+'treatment centre' (treatment centre = 2 or 5). In covariates that have
+been log-transformed (age, platelet count and neutrophil count) or
+logit-transformed (proportions of myeloblasts and ring sideroblasts in
+the bone marrow), the interpretation of estimates is different. For
+example, an increase in age by a factor of $e$ ($\approx 2.72$) almost
+triples the hazard of dying before AML; the same increase in the ratio
+$bm\_blasts/(1-bm\_blasts)$ (where *bm_blasts* is the proportion of
+myeloblasts in the bone marrow) is associated with an increment in the
+hazard of dying before AML of approximately $16\%$. 
+ +### Computing cumulative transition hazard estimates {#sec:computing_cumulative_hazards} + +The function `msfit_generic` is the generic function in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) that +computes cumulative transition hazards for a given set of covariate +values and an estimated Cox model. It calls a different method according +to the class of its `object` argument. The default method corresponds to +the original `msfit` function of the +[**mstate**](https://CRAN.R-project.org/package=mstate) package and is +appropriate for objects of class `coxph`, i.e., objects that contain the +fit of a Cox model with fixed effects. The other available method for +`msfit_generic`, `msfit_generic.coxrfx`, is just the original `msfit` +function, (slightly) adapted to deal with objects generated by `CoxRFX`. +Quite importantly, `msfit_generic.coxrfx` does not allow the variance of +the cumulative hazards to be computed, as this computation relies on +asymptotic results which may not be valid for an empirical Bayes model. +As a result, it only has two other arguments apart from the object of +class `coxrfx`: a data frame with the covariate values of the patient +whose cumulative hazards we want to compute; and a transition matrix +describing the states and transitions in the model (such as the one that +can be generated using `transMat` from the package +[**mstate**](https://CRAN.R-project.org/package=mstate)). The following +block of code exemplifies how these objects can be built and generates +the `msfit` object containing the cumulative transition hazard estimates +for a sample patient. Note that the object with the patient data must +include a row for each transition, as well as a column specifying the +transition stratum of each row of covariates. 
+
+``` r
+# Build `patient_data' data frame with the covariate values for which
+# cumulative hazards are to be computed (covariate values of patient 78):
+patient_data <- mstate_data[mstate_data$id == 78,,drop = F][rep(1,3),]
+patient_data$strata <- patient_data$trans <- 1:3
+patient_data <- mstate::expand.covs(
+ patient_data,
+ covs = names(patient_data)[ ! names(patient_data) %in% outcome_covs],
+ append = T
+)
+patient_data <- patient_data[ ! grepl(".3",names(patient_data),fixed = T)]
+
+# The `patient_data' data frame has only 3 rows (one for each transition).
+# The output below shows its `id' and `trans' columns
+# and expanded covariates ASXL1 and DNMT3A:
+ id trans ASXL1.1 ASXL1.2 DNMT3A.1 DNMT3A.2 [...]
+ 78 1 1 0 0 0 .
+ 78 2 0 1 0 0 .
+ 78 3 0 0 0 0 .
+
+# compute cumulative hazards
+msfit_object_12 <- msfit_generic(model_12,patient_data,tmat)
+```
+
+Figure \@ref(fig:figpatient78-cumhaz) shows three plots of estimated
+cumulative transition hazards for the sampled patient, one for each
+transition in the model, along with $95\%$ non-parametric bootstrap
+confidence intervals (computed with `ebmstate::boot_ebmstate`).
+Throughout the plotted period, the 'slope' of the cumulative hazard
+(i.e., the hazard rate) for the MDS to AML transition is lower than the
+one for the MDS to death transition, and this in turn is lower than the
+one for the AML to death transition. It should be recalled that the
+cumulative hazard estimate is strictly non-parametric for this last
+transition, i.e., it is the same for all patients. The central plot of
+figure \@ref(fig:figpatient78-cumhaz) suggests that, as time since
+diagnosis goes by, the hazard of dying in MDS increases (possibly an
+effect of age). On the other hand, the hazard of dying in AML seems to
+decrease (slightly) with time (rightmost plot). 
Conclusions regarding +the evolution of the AML hazard are hard to draw, since the confidence +intervals for the corresponding cumulative hazard curve are very wide +(leftmost plot). + +If an object generated by `msfit_generic` is fed to `plot`, and the +package [**mstate**](https://CRAN.R-project.org/package=mstate) is +loaded, the method `mstate:::plot.msfit` will be called. This is an +efficient way of automatically plotting the cumulative hazard estimates +for all transitions, but confidence interval lines (separately +estimated) cannot be added. + +### Computing state occupation probability estimates {#sec:computing_transition_probs} + +The functions `probtrans_mstate`, `probtrans_ebmstate` and +`probtrans_fft` compute estimates of state occupation probabilities for +a given `msfit` object. All three functions generate objects of class +`probtrans` that can be fed to the `plot.probtrans` method from the +package [**mstate**](https://CRAN.R-project.org/package=mstate). The +first of these functions should only be used for clock-forward models, +as it relies on product-limit calculations. It calls the method +`probtrans_mstate.default`, if the `msfit` object was generated by +`msfit_generic.default`, or the method `probtrans_mstate.coxrfx`, if it +was generated by `msfit_generic.coxrfx`. Both methods are identical to +the function `probtrans` in the +[**mstate**](https://CRAN.R-project.org/package=mstate) package, with +the reserve that `probtrans_mstate.coxrfx` does not allow the +computation of the variances or covariances of the state occupation +probability estimator. + +The functions `probtrans_ebmstate` and `probtrans_fft` are the functions +in [**ebmstate**](https://CRAN.R-project.org/package=ebmstate) for the +computation of state occupation probability estimates under clock-reset +models with a transition structure that has no cycles. 
When using +`probtrans_fft` (the faster, but somewhat less stable, of these two +functions), three arguments must be supplied: the initial state of the +process whose state occupation probabilities one wishes to compute, the +`msfit` object, and the upper time limit for the generation of estimates +(`max_time`). Both functions are based on a discrete-time approximation +to a series of convolutions. The default argument `nr_steps` controls +the number of (equally spaced) time steps used in this approximation. +The arguments `max_time` and `nr_steps` should be increased until the +estimated curves become stable. + +The following line of code computes point estimates of state occupation +probabilities for the sample patient. + +``` r +probtrans_object_12 <- probtrans_fft("MDS",msfit_object_12, max_time = 4000) +``` + +Estimates are shown in figure \@ref(fig:figpatient78-transProbs), along +with $95\%$ non-parametric, bootstrap confidence intervals. For this +particular patient, the estimated probability of being dead after AML +remains below 0.4 throughout a period of 10 years from the MDS +diagnosis; if the patient does reach AML, death is expected to happen +quickly thereafter, as reflected in the very low estimates for the +probability of being in AML at any point in time. 
The following block of
+code shows how to compute confidence intervals with `boot_ebmstate`:
+
+``` r
+# Creating the object arguments for boot_ebmstate()
+
+# `groups' argument was already created, but we need to add names to it
+names(groups_12) <- names(covariates_expanded_12)
+
+# `mstate_data_expanded' argument (similar to `covariates_expanded' but
+# including outcome variables)
+mstate_data_expanded <- cbind(
+ mstate_data[names(mstate_data) %in% outcome_covs],
+ covariates_expanded_12
+)
+
+# create the non-parametric bootstrap confidence intervals
+boot_ebmstate_object <- boot_ebmstate(
+ mstate_data = mstate_data_expanded,
+ which_group = groups_12,
+ min_nr_samples = 100,
+ patient_data = patient_data,
+ tmat = tmat,
+ initial_state = "MDS",
+ time_model = "clockreset",
+ input_file = NULL,
+ coxrfx_args = list(max.iter = 200),
+ probtrans_args = list(max_time = 4000)
+)
+```
+
+### Model assessment {#sec:model_assessment}
+
+For any model fitted with
+[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), two
+performance metrics can be easily computed: the *concordance* statistic
+([@harrell1982evaluating]; see also the help page of
+`survival::concordance` for the definition of concordance) and the
+*Bayesian Information Criterion* (BIC) score [@schwarz1978estimating].
+As an example of how these two metrics can be obtained and used for
+model comparison, suppose we wish to compare 'model_12' fitted above --
+which consists of a Cox regression including all covariates for
+transitions 1 and 2 and a fully non-parametric model for transition 3 --
+with a model that combines Cox regressions of all covariates for each of
+the three transitions (denoted 'model_123' below). The following code
+snippet shows how to fit this second model. 
+
+``` r
+# arguments `groups' and `Z' for fitting a Cox regression model on all transitions
+Z_123 <- data.frame(
+ covariates_expanded_123,
+ strata = mstate_data$trans,
+ trans = mstate_data$trans
+)
+groups_123 <- paste0(rep("group", ncol(Z_123) - 2), c("_1", "_2", "_3"))
+
+# Fit a Cox regression model for all transitions
+model_123 <- CoxRFX(Z = Z_123, surv = surv, groups = groups_123)
+```
+
+Running the `concordance` function in the
+[**survival**](https://CRAN.R-project.org/package=survival) package for
+each model yields the following output:
+
+``` r
+> concordance(model_12)
+ Call:
+ concordance.coxph(object = model_12)
+
+ n= 1210
+ Concordance= 0.8131 se= 0.01314
+ concordant discordant tied.x tied.y tied.xy
+ strata=1 18040 2783 0 1 0
+ strata=2 37919 9678 0 7 0
+ strata=3 0 0 1052 0 4
+
+> concordance(model_123)
+ Call:
+ concordance.coxph(object = model_123)
+
+ n= 1210
+ Concordance= 0.8168 se= 0.01312
+ concordant discordant tied.x tied.y tied.xy
+ strata=1 18041 2782 0 1 0
+ strata=2 37920 9677 0 7 0
+ strata=3 784 268 0 4 0
+```
+
+The output shows that modelling transition 3 with a Cox model, instead
+of a fully non-parametric one, has a negligible impact on the overall
+concordance. However, this is due to the fact that there are far fewer
+observations for this transition. The concordance for transition 3 only,
+which corresponds to strata 3, is 0.5 under the fully non-parametric model
+(i.e., all patients are assigned the same transition hazard) and
+considerably higher under the Cox regression ($784/(784+268)=0.75$).
+Ideally, the comparison of models of different complexity should be
+carried out on a test sample rather than on the training data. For this
+purpose, the test data can be input to the `concordance` function
+(argument `newdata`). However, in the present case, only 61 patients
+were ever at risk of dying with AML (i.e. 
of undergoing transition 3), +and of these only 41 actually died, so we might prefer to keep all +patients in the training data, rather than saving a fraction of them for +testing purposes. Such an option will yield more accurate coefficient +estimates, at the expense of not allowing the computation of unbiased +estimates of model performance. If the goal is only to compare models, +we can make do without test data, by using an information score that +penalises model complexity, such as the BIC. To facilitate model +comparison, the BIC score is one of the attributes of the model fit +object: + +``` r +> model_12$BIC + [1] 2508.37 +> model_123$BIC + [1] 2483.49 +``` + +The best model is the one with the lowest score, so the choice of +'model_123' is confirmed. + +## Discussion + +We have shown that +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is suitable +for higher-dimensional, multi-state survival analysis, and that it is +both efficient and easy-to-use. To a significant extent, the +user-friendliness of +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) stems from +the fact that it was not built 'from the ground up'. Instead, we +produced a package that is more easily accessible to the many users of +[**mstate**](https://CRAN.R-project.org/package=mstate) by taking +advantage of whichever features of this package were useful to our +method and by eliminating redundancies. The connection between +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) and +[**mstate**](https://CRAN.R-project.org/package=mstate) is based on the +fact that the function `CoxRFX` takes the same type of input and +produces the same type of output as `coxph` from the package `survival`, +and the function `probtrans_fft` (or `probtrans_ebmstate`) has the same +type of input and output as `probtrans` from +[**mstate**](https://CRAN.R-project.org/package=mstate) (as shown in +figure \@ref(fig:figworkflow)). 
+ +We also sought to improve our package's user-friendliness by making it +as efficient as possible. The reduction of computational cost is based +on two features. First, our empirical Bayes method relies on an +expectation-maximisation algorithm that estimates both the parameters +and the hyper-parameters of the model, i.e., no further tuning of the +model is required. Second, in +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate), the +computation of state occupation probability estimates relies on +analytical results rather than on simulation: not only for clock-forward +models, where we import from +[**mstate**](https://CRAN.R-project.org/package=mstate) a product-limit +estimator, but also for clock-reset models, where we implement our own +estimator based on a convolution argument and the fast Fourier +transform. + +To our knowledge, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) is the first +R package to put together a framework for multi-state model estimation +that is complete and suitable for higher-dimensional data. It does so by +implementing point and interval estimators of regression coefficients, +cumulative transition hazards and state occupation probabilities, under +regularised multi-state Cox models. In section +[4](#sec:estimator_performance), the results of the simulation study +suggest that for data sets with 100 patients or more and a ratio of $p$ +(patients) to $n$ (coefficients per transition) greater than 0.1, the +standard Cox model estimator is clearly outperformed by the empirical +Bayes one when it comes to the estimation of relative hazards and state +occupation probabilities of an out-of-sample patient, or the regression +coefficients of the model. However, the same study suggests that using +an empirical Bayes method instead of a fully non-parametric one is of +limited or no value in settings where $p/n \geq 1$. 
This loss of +usefulness can already happen for $p/n\leq 1/2$ when it comes to the +estimation of the relative hazards of an out-of-sample patient, +especially for transition structures with multiple competing +transitions. + +As mentioned in previous sections, +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) imports a +product-limit estimator from +[**mstate**](https://CRAN.R-project.org/package=mstate) that targets the +state occupation probabilities of patients with *time-fixed* covariate +vectors. However, these estimators are extendible to models with +time-dependent covariates, as long as these are external and the +estimates are conditional on specific covariate paths [@Aalen2008 p. +142]. For piecewise constant covariates, it is likely that such an +adaptation could be obtained by combining transition probability +estimates obtained for each period in which the covariates are fixed. +While no significant theoretical obstacles are foreseen in this matter, +the computer implementation for more than a single piecewise constant +covariate is likely to be a laborious task. We have left it therefore +for future work. + +## Acknowledgements {#acknowledgements .unnumbered} + +The authors are supported by grant NNF17OC0027594 from the Novo Nordisk +Foundation. We thank an anonymous reviewer for their constructive +comments and helpful suggestions which led to a much-improved +manuscript. + +## Supporting Scripts and Data {#supporting-scripts-and-data .unnumbered} + +In the supporting Scripts and Data, the file `ESM_1.html` contains +additional simulation results and theoretical demonstrations. Additional +details on the analysis of the MDS data set are given in the file +`ESM_2.html`. The MDS data set is in files `MDS.TPD.20Nov2012.csv` and +`mds.paper.clin.txt`. The file `ESM_3.R` contains a simplified R script +to run the code snippets in the present paper. The +[**ebmstate**](https://CRAN.R-project.org/package=ebmstate) package is +available on CRAN. 
+ +## Conflict of interest + +The authors have declared no conflict of interest. + +**Figures** + +```{r figpackage-summary-figure, echo=FALSE , fig.cap="Summary of inputs and outputs of the package ebmstate. The input data set should be one that violates the assumption – commonly used in survival analysis – that the number of observations is much larger than the number of parameters to be estimated (a genomic-clinical data set is shown as a typical example). The input model is a multi-state Cox model defined by a transition structure and a prior distribution on the regression coefficients. This prior distribution is defined by partitioning the vector of regression coefficients into groups of regression coefficients, with each group having its own Gaussian prior with undetermined mean and variance. The outputs of ebmstate include estimates of the relative transition hazards associated with each covariate, as well as estimates of the probability that a specific patient (with specific covariate measurements) has of occupying each state of the model over some time period. Estimates of cumulative transition hazards are omitted from the figure.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/package_summary_figure.png")) +``` + +```{r figmssample, echo=FALSE , fig.cap="Comparison of running times and estimation accuracy of mssample and probtrans_fft. Each plot in the grid shows two estimated curves of state occupation probabilities. The black curves are based on a single run of mstate::mssample with n=100.000 observations (approximately 17 minutes of running time) and are the same across columns. They serve as benchmark for precision assessment. In columns 1 to 3 of the grid, the superimposed red curves are based on a run of mssample with respectively 100, 1000, and 10.000 observations. In the rightmost column, the red curves are based on a run of probtrans_fft. 
All functions have as input the same set of cumulative transition hazards. These were estimated using a non-parametric multi-state model and a data set of 1000 patients generated according to a clock-reset Cox model with a ‘linear’ transition structure (leftmost diagram of figure 3). Plots in the same row refer to the same state of the model, while those in the same column refer to the same run of a function. Running times and, where appropriate, number of simulations (n) are given on top of each column.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/mssample_and_probtrans_fft.png")) +``` + +```{r figtransition-structures, echo=FALSE , fig.cap="Model transition structures. We studied the performance of Cox model estimators, empirical Bayes Cox model estimators and fully non-parametric estimators with respect to these 3 transition structures.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/transition_structures.png")) +``` + +```{r figna-props-100patients-coxph, echo=FALSE , fig.cap="Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 6 (100 patients per simulated data set).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/na_props_100patients_coxph.png")) +``` + +```{r figna-props-1000patients-coxph, echo=FALSE , fig.cap="Proportions of valid, infinite and missing (‘NA’) estimates for the standard Cox model estimators in the simulation study of figure 7 (1000 patients per simulated data set).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/na_props_1000patients_coxph.png")) +``` + +```{r figestimator-performance-boxplots-100patients, echo=FALSE , fig.cap="Performance 
comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 100 observations each. In the figure grid there is a boxplot corresponding to every tuple (a,m, G, p) such that a\in \lbraceregression coefficients, relative hazards, state occupation probabilities\rbrace is the target of estimation, m\in \lbracestandard Cox, empirical Bayes Cox, null\rbrace is the hazard model, G \in \lbracelinear, competing risks, ‘m’ structure\rbrace is the transition structure of the model, and p \in \lbrace 10,40,70,100 \rbrace is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 4, together with figures 6.1 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, show the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. ", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/estimator_performance_boxplots_100patients.png")) +``` + +```{r figestimator-performance-boxplots-1000patients, echo=FALSE , fig.cap=" Performance comparison of standard Cox, empirical Bayes Cox, and fully non-parametric (null) estimators using training data sets with 1000 observations each. 
In the figure grid there is a boxplot corresponding to every tuple (a,m, G, p) such that a\in \lbraceregression coefficients, relative hazards, state occupation probabilities\rbrace is the target of estimation, m\in \lbracestandard Cox, empirical Bayes Cox, null\rbrace is the hazard model, G \in \lbracelinear, competing risks, ‘m’ structure\rbrace is the transition structure of the model, and p \in \lbrace 10,100,200,300,400,500 \rbrace is the number of coefficients/covariates per transition. Each boxplot is based on at most 300 average absolute error observations. Figure 5, together with figures 6.2 and 6.3 in file ESM_1.html of the Supporting Scripts and Data, show the proportion of valid, missing and infinite estimates for each estimator. In each simulation scenario, the upper limit of the plot’s y-axis defines a threshold above which observations are considered very large. Very large observations were replaced by the y-axis upper limit before the boxplots were built. ", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/estimator_performance_boxplots_1000patients.png")) +``` + +```{r figworkflow, echo=FALSE , fig.cap="Extension of the mstate analysis framework by ebmstate. Arrows correspond to functions. Boxes correspond to inputs or outputs of functions. Functions CoxRFX and probtrans_fft from ebmstate compute point estimates only. 
Interval estimates can be obtained using the non-parametric bootstrap algorithm implemented in the function ebmstate::boot_ebmstate.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/workflow0.png")) +``` + +```{r figtrans-diagrams, echo=FALSE , fig.cap="a: transition model implied by the data set of patients with myelodysplastic syndromes, together with transition event numbers; b: conversion to a transition structure without cycles; c: transformations applied to the MDS covariate data and summary statistics for the data before transformation. MDS stands for myelodysplastic syndromes; AML stands for acute myeloid leukemia.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/data_summary_figs2.png")) +``` + +```{r figcoef-plots, echo=FALSE , fig.cap="Point estimates of regression coefficients for the Cox model fitted to the MDS data, along with 95% non-parametric bootstrap confidence intervals. The x-axis scale is logarithmic so that coefficient estimates can be read as relative hazard estimates. If \gamma_{ij} is the element of \hat{\boldsymbol{\beta}}_{ij} associated with a given covariate, \exp\left(\gamma_{ij}\right) is the estimated relative hazard for this covariate in transition \left(i,j\right). In general, a relative hazard estimate r for a covariate z in transition \left(i,j\right) means that a one-unit increase in z is associated with an r-fold increase in the hazard of this transition. If z was obtained by log-transformation (as in age, platelet counts and neutrophil counts), a one-unit increase in z corresponds to scaling the original covariate by e\approx 2.72. 
In case z was obtained by logit-transformation (as in bone marrow blasts and sideroblasts proportions), the same one-unit increase corresponds to scaling the odds of the original covariate by e.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/coef_plots.png")) +``` + +```{r figpatient78-cumhaz, echo=FALSE , fig.cap="Point estimates of cumulative transition hazards for a sample patient with MDS (black curve), along with 95% non-parametric confidence intervals (dashed red lines).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/patient78_cumhaz_final.png")) +``` + +```{r figpatient78-transProbs, echo=FALSE , fig.cap="Point estimates of state occupation probabilities for a sample patient with MDS (black curve), along with 95% non-parametric confidence intervals (dashed red lines).", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/patient78_transProbs_final.png")) +``` +::: diff --git a/_articles/RJ-2024-002/web/costa-gerstung.R b/_articles/RJ-2024-002/web/costa-gerstung.R new file mode 100644 index 0000000000..e69de29bb2 diff --git a/_articles/RJ-2024-002/web/costa-gerstung.bib b/_articles/RJ-2024-002/web/costa-gerstung.bib new file mode 100644 index 0000000000..31b9356dc1 --- /dev/null +++ b/_articles/RJ-2024-002/web/costa-gerstung.bib @@ -0,0 +1,453 @@ +% An example bibliography .bib file. + +%This is a bibliography of extremes papers Version 11 January 2002 +%@PREAMBLE{"\newcommand{\noopsort}[1]{} " } + +% Note that spaces are needed to get all authors initials, +% ie D. R. Cox or Cox, D. R. are both ok. The full stops are NOT +% needed ie D R Cox and Cox, D R also both work! 
+ +@article{Aalen1989, + title={A linear regression model for the analysis of life times}, + author={Aalen, Odd O}, + journal={Statistics in Medicine}, + volume={8}, + number={8}, + pages={907--925}, + year={1989}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.4780080803} +} + +@BOOK{Aalen2008, + author="Aalen,O and Borgan, O and Gjessing, H ", + title="Survival and event history analysis", + year=2008, + publisher="Springer", + address="", + note="", + url={https://link.springer.com/book/10.1007/978-0-387-68560-1}} + +@BOOK{Andersen1993, + author="Andersen,PK and Borgan, O and Gill, RD and Keiding, N ", + title="Statistical Models Based On Counting Processes", + year=1993, + publisher="Springer", + address="", + note="", + url={https://link.springer.com/book/10.1007/978-1-4612-4348-9}} + +@article{Cortese2010, + title={Competing risks and time-dependent covariates}, + author={Cortese, Giuliana and Andersen, Per K}, + journal={Biometrical Journal}, + volume={52}, + number={1}, + pages={138--158}, + year={2010}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/bimj.200900076} +} + +@BOOK{Carlin2009, + author={Carlin,BP and Louis, TA}, + title={Bayesian Methods for Data Analysis}, + year={2009}, + publisher={CRC Press}, + url={https://doi.org/10.1201/b14884} + } + +@Article{flexsurv_package, + title = {{flexsurv}: A Platform for Parametric Survival Modeling in + {R}}, + author = {Christopher Jackson}, + journal = {Journal of Statistical Software}, + year = {2016}, + volume = {70}, + number = {8}, + pages = {1--33}, + doi = {10.18637/jss.v070.i08}, + } + +@article{Karoui2018, + title={Can we trust the bootstrap in high-dimensions? {T}he case of linear models}, + author={El Karoui, Noureddine and Purdom, Elizabeth}, + journal={The Journal of Machine Learning Research}, + volume={19}, + number={1}, + pages={170--235}, + year={2018}, + publisher={JMLR. 
org}, + url={https://jmlr.org/papers/v19/17-006.html} +} + +@article{gamboostMSM_package, + title={gamboostMSM}, + author={Reulen, Holger}, + journal={R package version}, + pages={1.1.87}, + year={2014}, + url={https://CRAN.R-project.org/package=gamboostMSM} +} + +@BOOK{Gelman2014, + author="Gelman,A and Carlin, JB and Stern, HS and Dunson, DB and Vehtari,A and Rubin, DB ", + title="Bayesian Data Analysis", + year=2014, + publisher="CRC Press", + address="", + note="", + url={https://doi.org/10.1201/b16018}} + +@article{Gerds2014, + title={Calibration plots for risk prediction models in the presence of competing risks}, + author={Gerds, Thomas A and Andersen, Per K and Kattan, Michael W}, + journal={Statistics in medicine}, + volume={33}, + number={18}, + pages={3191--3203}, + year={2014}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.6152} +} + + +@article{Grinfeld2018, +title = {Personalized Prognostic Predictions for Patients with Myeloproliferative Neoplasms through Integration of Comprehensive Genomic and Clinical Information}, +journal = {Blood}, +volume = {130}, +pages = {491}, +year = {2017}, +issn = {0006-4971}, +doi = {https://doi.org/10.1182/blood.V130.Suppl_1.491.491}, +url = {https://www.sciencedirect.com/science/article/pii/S000649711981008X}, +author = {Jacob Grinfeld and Jyoti Nangalia and E Joanna Baxter and Anna L. Godfrey and Paola Guglielmelli and Rob Cantrill and David Wedge and Nicos Angelopoulos and Gunes Gundem and Charlie Massie and Elli Papaemmanuil and Cathy MacLean and Julia Cook and Francesca Lauren Nice and Christen Lykkegaard Andersen and Hans Carl Hasselbalch and Mary Frances McMullin and Alessandro M. Vannucchi and Claire N. Harrison and Moritz Gerstung and Peter J Campbell and Anthony R Green}, +} + +@article{harrell1982evaluating, + author = {Harrell, Frank E., Jr and Califf, Robert M. and Pryor, David B. and Lee, Kerry L. 
and Rosati, Robert A.}, + title = "{Evaluating the Yield of Medical Tests}", + journal = {JAMA}, + volume = {247}, + number = {18}, + pages = {2543-2546}, + year = {1982}, + month = {05}, + issn = {0098-7484}, + doi = {10.1001/jama.1982.03320430047030}, + url = {https://doi.org/10.1001/jama.1982.03320430047030}, + eprint = {https://jamanetwork.com/journals/jama/articlepdf/372568/jama\_247\_18\_030.pdf}, +} + + +@book{Hastie2009, + title={The elements of statistical learning: data mining, inference, and prediction}, + author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H and Friedman, Jerome H}, + volume={2}, + year={2009}, + publisher={Springer}, + url={https://link.springer.com/book/10.1007/978-0-387-84858-7} +} + +@article{Hoff2019, + Author = {Hoff, Rune and Putter, Hein and Mehlum, Ingrid Sivesind and Gran, Jon Michael}, + Da = {2019/10/01}, + Date-Added = {2021-02-19 11:01:01 +0000}, + Date-Modified = {2021-02-19 11:01:01 +0000}, + Doi = {10.1007/s10985-019-09474-0}, + Id = {Hoff2019}, + Isbn = {1572-9249}, + Journal = {Lifetime Data Analysis}, + Number = {4}, + Pages = {660--680}, + Title = {Landmark estimation of transition probabilities in non-Markov multi-state models with covariates}, + Ty = {JOUR}, + Url = {https://doi.org/10.1007/s10985-019-09474-0}, + Volume = {25}, + Year = {2019}, + Bdsk-Url-1 = {https://doi.org/10.1007/s10985-019-09474-0}} + + +@article{Hougaard1999, + title={Multi-state models: a review}, + author={Hougaard, Philip}, + journal={Lifetime data analysis}, + volume={5}, + number={3}, + pages={239--264}, + year={1999}, + publisher={Springer}, + url={https://doi.org/10.1023/A:1009672031531} +} + +@Article{Jackson2011, + title = {Multi-state models for panel data: the {msm} package for {R}}, + author = {Christopher H. 
Jackson}, + journal = {Journal of Statistical Software}, + year = {2011}, + volume = {38}, + number = {8}, + pages = {1--29}, + url = {http://www.jstatsoft.org/v38/i08/}, + } + + @book{Kalbfleisch2002, + title={The statistical analysis of failure time data}, + author={Kalbfleisch, John D and Prentice, Ross L}, + year={2002}, + publisher={John Wiley \& Sons}, + doi={10.1002/9781118032985}, + } + +@article{Listwon2015, + TITLE = {{SemiMarkov: An R Package for Parametric Estimation in Multi-State Semi-Markov Models}}, + AUTHOR = {Listwon, Agnieszka and Saint-Pierre, Philippe}, + URL = {https://hal.archives-ouvertes.fr/hal-00860244}, + JOURNAL = {{Journal of Statistical Software}}, + PUBLISHER = {{University of California, Los Angeles}}, + VOLUME = {66}, + NUMBER = {6}, + PAGES = {784}, + YEAR = {2015}, + DOI = {10.18637/jss.v066.i06}, + KEYWORDS = {exponentiated Weibull distribution ; multi-state semi-Markov models ; parametric estimation ; asthma ; R package}, + PDF = {https://hal.archives-ouvertes.fr/hal-00860244/file/Listwon_SaintPierre_HAL.pdf}, + HAL_ID = {hal-00860244}, + HAL_VERSION = {v1}, +} + +@article{mboost_package, + title={mboost: Model-Based Boosting}, + author={Hothorn, Torsten and Buehlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin}, + journal={R package version}, + pages={2.9-3}, + year={2020}, + url={https://CRAN.R-project.org/package=mboost} +} + +@article{Morris1983, + title={Parametric empirical Bayes inference: theory and applications}, + author={Morris, Carl N}, + journal={Journal of the American Statistical Association}, + volume={78}, + number={381}, + pages={47--55}, + year={1983}, + publisher={Taylor \& Francis Group}, + url={https://doi.org/10.1080/01621459.1983.10477920} +} + +@article{Papaemmanuil2013, + title={Clinical and biological implications of driver mutations in myelodysplastic syndromes}, + author={Papaemmanuil, Elli and Gerstung, Moritz and Malcovati, Luca and Tauro, Sudhir and Gundem, Gunes and Van 
Loo, Peter and Yoon, Chris J and Ellis, Peter and Wedge, David C and Pellagatti, Andrea and others}, + journal={Blood}, + volume={122}, + number={22}, + pages={3616--3627}, + year={2013}, + publisher={Am Soc Hematology}, + url={https://doi.org/10.1182/blood-2013-08-518886} +} + +@BOOK{Pawitan2001, + author="Pawitan, Y", + title="In All Likelihood", + year=2001, + publisher="Oxford University Press", + address="", + note="", + url={https://global.oup.com/academic/product/in-all-likelihood-9780199671229?cc=gb&lang=en&#} + } + +@article{penMSM_package, + title={penMSM}, + author={Reulen, Holger}, + journal={R package version}, + pages={0.99}, + year={2015}, + url={https://CRAN.R-project.org/package=penMSM} +} + +@article{Perperoglou2014, + title={Cox models with dynamic ridge penalties on time-varying effects of the covariates}, + author={Perperoglou, Aris}, + journal={Statistics in Medicine}, + volume={33}, + number={1}, + pages={170--180}, + year={2014}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.5921} +} + +@Article{Putter2011, + title = {{mstate}: An {R} Package for the Analysis of Competing Risks and Multi-State Models}, + author = {Liesbeth C. 
{de Wreede} and Marta Fiocco and Hein Putter}, + journal = {Journal of Statistical Software}, + year = {2011}, + volume = {38}, + number = {7}, + pages = {1--30}, + url = {http://www.jstatsoft.org/v38/i07/}, + } + +@article{Putter2007tutorial, + title={Tutorial in biostatistics: competing risks and multi-state models}, + author={Putter, Hein and Fiocco, Marta and Geskus, Ronald B}, + journal={Statistics in Medicine}, + volume={26}, + number={11}, + pages={2389--2430}, + year={2007}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1002/sim.2712} +} + +@article{Putter2011tutorial, + title={Tutorial in biostatistics: Competing risks and multi-state models Analyses using the mstate package}, + author={Putter, Hein}, + journal={Companion file for the mstate package}, + year={2011}, + url = {https://mirror.las.iastate.edu/CRAN/web/packages/mstate/vignettes/Tutorial.pdf} +} + + @Manual{R_language, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2019}, + url = {https://www.R-project.org/}, + } + +@article{Rueda2019, + title={Dynamics of breast-cancer relapse reveal late-recurring ER-positive genomic subgroups}, + author={Rueda, Oscar M and Sammut, Stephen-John and Seoane, Jose A and Chin, Suet-Feung and Caswell-Jin, Jennifer L and Callari, Maurizio and Batra, Rajbir and Pereira, Bernard and Bruna, Alejandra and Ali, H Raza and others}, + journal={Nature}, + volume={567}, + number={7748}, + pages={399}, + year={2019}, + publisher={Nature Publishing Group}, + url={https://doi.org/10.1038/s41586-019-1007-8} +} + +@article{Samworth2012, + title={Stein's paradox}, + author={Samworth, Richard J}, + journal={Eureka}, + volume={62}, + pages={38--41}, + year={2012}, + publisher={The Archimedeans}, + url={https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=7eebd55f569395544f2b5d367d6aee614901d2c1} +} + 
+@article{Schall1991, +author = {Schall, Robert}, +title = {Estimation in generalized linear models with random effects}, +journal = {Biometrika}, +volume = {78}, +number = {4}, +pages = {719-727}, +year = {1991}, +doi = {10.1093/biomet/78.4.719}, +URL = {http://dx.doi.org/10.1093/biomet/78.4.719}, +eprint = {/oup/backfile/content_public/journal/biomet/78/4/10.1093/biomet/78.4.719/2/78-4-719.pdf} +} + +@article{schwarz1978estimating, + title={Estimating the dimension of a model}, + author={Schwarz, Gideon}, + journal={The annals of statistics}, + pages={461--464}, + year={1978}, + publisher={JSTOR}, + url={https://www.jstor.org/stable/2958889} +} + +@manual{shiny, + title={Easy web applications in R.}, + author={{RStudio, Inc}}, + year={2013}, + url={http://www.rstudio.com/shiny/} +} + + @Manual{survival_package, + title = {A Package for Survival Analysis in S}, + author = {Terry M Therneau}, + year = {2015}, + note = {version 2.38}, + url = {https://CRAN.R-project.org/package=survival}, + } + + +@article{Shu2007, + Author = {Shu, Youyi and Klein, John P. 
and Zhang, Mei-Jie}, + Da = {2007/03/01}, + Date-Added = {2021-02-09 10:55:17 +0000}, + Date-Modified = {2021-02-09 10:55:17 +0000}, + Doi = {10.1007/s10985-006-9018-9}, + Id = {Shu2007}, + Isbn = {1572-9249}, + Journal = {Lifetime Data Analysis}, + Number = {1}, + Pages = {91--117}, + Title = {Asymptotic theory for the Cox semi-Markov illness-death model}, + Ty = {JOUR}, + Url = {https://doi.org/10.1007/s10985-006-9018-9}, + Volume = {13}, + Year = {2007}, + Bdsk-Url-1 = {https://doi.org/10.1007/s10985-006-9018-9}} + +@article{Spitoni2012, +author = {Cristian Spitoni and Marion Verduijn and Hein Putter}, +doi = {doi:10.1515/1557-4679.1375}, +url = {https://doi.org/10.1515/1557-4679.1375}, +title = {Estimation and Asymptotic Theory for Transition Probabilities in Markov Renewal Multi-State Models}, +journal = {The International Journal of Biostatistics}, +number = {1}, +volume = {8}, +year = {2012} +} + + +@article{vanHouwelingen2007, + title={Dynamic prediction by landmarking in event history analysis}, + author={van Houwelingen, Hans C}, + journal={Scandinavian Journal of Statistics}, + volume={34}, + number={1}, + pages={70--85}, + year={2007}, + publisher={Wiley Online Library}, + url={https://doi.org/10.1111/j.1467-9469.2006.00529.x} +} + +@article{Wreede2010, +title = "The mstate package for estimation and prediction in non- and semi-parametric multi-state and competing risks models", +journal = "Computer Methods and Programs in Biomedicine", +volume = "99", +number = "3", +pages = "261 - 274", +year = "2010", +issn = "0169-2607", +doi = "https://doi.org/10.1016/j.cmpb.2010.01.001", +url = "http://www.sciencedirect.com/science/article/pii/S0169260710000027", +author = "Liesbeth C. 
de Wreede and Marta Fiocco and Hein Putter", +keywords = "Survival analysis, Multi-state models, Competing risks models, Markov models, Cox models, Software" +} + +@article{Zou2005, +author = {Zou, Hui and Hastie, Trevor}, +title = {Regularization and variable selection via the elastic net}, +journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, +volume = {67}, +number = {2}, +pages = {301-320}, +keywords = {Grouping effect, LARS algorithm, Lasso, Penalization, p≫n problem, Variable selection}, +doi = {https://doi.org/10.1111/j.1467-9868.2005.00503.x}, +url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-9868.2005.00503.x}, +eprint = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-9868.2005.00503.x}, +year = {2005} +} \ No newline at end of file diff --git a/_articles/RJ-2024-002/web/figures/coef_plots.png b/_articles/RJ-2024-002/web/figures/coef_plots.png new file mode 100644 index 0000000000..eb06c62756 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/coef_plots.png differ diff --git a/_articles/RJ-2024-002/web/figures/data_summary_figs2.png b/_articles/RJ-2024-002/web/figures/data_summary_figs2.png new file mode 100644 index 0000000000..3d1e1a925d Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/data_summary_figs2.png differ diff --git a/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_1000patients.png b/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_1000patients.png new file mode 100644 index 0000000000..75cdf29709 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_1000patients.png differ diff --git a/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_100patients.png b/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_100patients.png new file mode 100644 index 0000000000..95ff1abf44 Binary files /dev/null and 
b/_articles/RJ-2024-002/web/figures/estimator_performance_boxplots_100patients.png differ diff --git a/_articles/RJ-2024-002/web/figures/mssample_and_probtrans_fft.png b/_articles/RJ-2024-002/web/figures/mssample_and_probtrans_fft.png new file mode 100644 index 0000000000..4a5d062aa3 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/mssample_and_probtrans_fft.png differ diff --git a/_articles/RJ-2024-002/web/figures/na_props_1000patients_coxph.png b/_articles/RJ-2024-002/web/figures/na_props_1000patients_coxph.png new file mode 100644 index 0000000000..b374072256 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/na_props_1000patients_coxph.png differ diff --git a/_articles/RJ-2024-002/web/figures/na_props_100patients_coxph.png b/_articles/RJ-2024-002/web/figures/na_props_100patients_coxph.png new file mode 100644 index 0000000000..6b20c970e4 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/na_props_100patients_coxph.png differ diff --git a/_articles/RJ-2024-002/web/figures/package_summary_figure.png b/_articles/RJ-2024-002/web/figures/package_summary_figure.png new file mode 100644 index 0000000000..e39bbe27eb Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/package_summary_figure.png differ diff --git a/_articles/RJ-2024-002/web/figures/patient78_cumhaz_final.png b/_articles/RJ-2024-002/web/figures/patient78_cumhaz_final.png new file mode 100644 index 0000000000..f6dd1b3910 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/patient78_cumhaz_final.png differ diff --git a/_articles/RJ-2024-002/web/figures/patient78_transProbs_final.png b/_articles/RJ-2024-002/web/figures/patient78_transProbs_final.png new file mode 100644 index 0000000000..3c0fa7ae95 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/patient78_transProbs_final.png differ diff --git a/_articles/RJ-2024-002/web/figures/transition_structures.png b/_articles/RJ-2024-002/web/figures/transition_structures.png new file mode 
100644 index 0000000000..b63f3398d3 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/transition_structures.png differ diff --git a/_articles/RJ-2024-002/web/figures/workflow0.png b/_articles/RJ-2024-002/web/figures/workflow0.png new file mode 100644 index 0000000000..56c954c422 Binary files /dev/null and b/_articles/RJ-2024-002/web/figures/workflow0.png differ diff --git a/_articles/RJ-2024-003/RJ-2024-003.Rmd b/_articles/RJ-2024-003/RJ-2024-003.Rmd new file mode 100644 index 0000000000..b85e95d234 --- /dev/null +++ b/_articles/RJ-2024-003/RJ-2024-003.Rmd @@ -0,0 +1,1576 @@ +--- +title: 'bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models' +abstract: | + The Autoregressive Distributed Lag approach to cointegration or bound + testing, proposed by Pesaran in 2001, has become prominent in + empirical research. Although this approach has many advantages over + the classical cointegration tests, it is not exempt from drawbacks, + such as possible inconclusive inference and distortion in size. + Recently, Bertelli and coauthors developed a bootstrap approach to the + bound tests to overcome these drawbacks. This paper introduces the R + package bootCT, which implements this method by deriving the bootstrap + versions of the bound tests and of the asymptotic F-test on the + independent variables proposed by Sam and coauthors in 2019. As a + spinoff, a general method for generating random multivariate time + series following a given VECM/ARDL structure is provided in the + package. Empirical applications showcase the main functionality of the + package. +author: +- name: Gianmarco Vacca + affiliation: Department of Economic Policy. Università Cattolica del Sacro Cuore + address: + - Largo Gemelli, 1, Milan. + - Italy + - (0000-0002-8996-5524) + - | + [gianmarco.vacca@unicatt.it](gianmarco.vacca@unicatt.it){.uri} +- name: Maria Zoia + affiliation: Department of Economic Policy. 
Università Cattolica del Sacro Cuore + address: + - Largo Gemelli, 1, Milan. + - Italy + - (0000-0002-8169-781X) + - | + [maria.zoia@unicatt.it](maria.zoia@unicatt.it){.uri} +- name: Stefano Bertelli + affiliation: |- + CRO Area, Internal Validation and Controls Department, Operational + Risk and ICAAP Internal Systems, Intesa Sanpaolo, Milan + address: + - Viale Stelvio, 55/57, Milan. + - Italy + - | + [stefano.bertelli@intesasanpaolo.com](stefano.bertelli@intesasanpaolo.com){.uri} +date: '2025-01-10' +date_received: '2022-07-25' +journal: + firstpage: 39 + lastpage: 66 +volume: 16 +issue: 1 +slug: RJ-2024-003 +citation_url: https://rjournal.github.io/ +packages: + cran: + - bootCT + - dynamac + - magrittr + - gtools + - pracma + - Rcpp + - RcppArmadillo + - Rmisc + - ARDL + - aod + - vars + - urca + - aTSA + - tseries + - reshape2 + - ggplot2 + - stringr + - tidyverse + - dplyr + - ggplot + bioc: [] +preview: preview.png +bibliography: vacca-zoia-bertelli.bib +CTV: ~ +legacy_pdf: yes +legacy_converted: yes +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js + md_extension: -tex_math_single_backslash +draft: no + +--- + + +## Introduction {#sec:intro} + +Cointegration and error correction are fundamental concepts in the +analysis of economic data, insofar as they provide an appropriate +framework for testing economic hypotheses about growth and fluctuation. +Several approaches have been proposed in the literature to determine +whether two or more non-stationary time series are cointegrated, meaning +they share a common long-run relationship.\ +There are two basic types of tests for cointegration: single equation +tests and VAR-based tests. 
The former check the presence of unit roots +in cointegration residuals [see, e.g., +@engle1987co; @engleyoo87; @Mackinnon91; @gabriel2002; @cook2006power] +or test the significance of the error-correction (EC) term coefficient +[@kremers1992power; @maddala1998; @arranz2000; @ericsson2002]. The +latter, such as the @johansen1991 approach, tackle the problem of +detecting cointegrating relationships in a VAR model. This latter +approach, albeit having the advantage of avoiding the issue of +normalization, as well as allowing the detection of multiple +cointegrating vectors, is far from being perfect. In the VAR system all +variables are treated symmetrically, as opposed to the standard +univariate models that usually have a clear interpretation in terms of +exogenous and endogenous variables. Furthermore, in a VAR system all the +variables are estimated at the same time, which is problematic if the +relation between some variables is flawed, that is, affected by some +source of error. In this case a simultaneous estimation process tends to +propagate the error affecting one equation to the others. Furthermore, a +multidimensional VAR model employs plenty of degrees of freedom.\ +The recent cointegration approach, known as the Autoregressive Distributed +Lag (ARDL) approach to cointegration or bound testing, proposed by + @pesaran2001 (PSS), falls in the former strand of literature. It has +become prominent in empirical research because it shows several +advantages with respect to traditional methods for testing +cointegration. First, it is applicable also in cases of mixed order +integrated variables, albeit with integration not exceeding the first +order. Thus, it evades the necessity of pre-testing the variables and, +accordingly, avoids some common practices that may prevent finding +cointegrating relationships, such as dropping variables or transforming +them into stationary form  [see @mcnown2018bootstrapping]. 
Second, +cointegration bound tests are performed in an ARDL model that allows +different lag orders for each variable, thus providing a more flexible +framework than other commonly employed approaches. Finally, unlike other +cointegration techniques, which are sensitive to the sample size, the +ARDL approach provides robust and consistent results for small sample +sizes.\ +Notably, the ARDL bound testing methodology has quickly spread in +economics and econometrics to study the cointegrating relationships +between macroeconomic and financial variables, to evaluate the long-run +impact of energy variables, or to assess recent environmental policies +and their impact on the economy. Among the many applications, see for +instance @haseeb2019impact +[@reda2020using; @menegaki2019ardl; @yilanci2020brics; @hussain2019environmental; @abbasi2021energy].\ +The original bound tests proposed by @pesaran2001 are an $F$-test for +the significance of the coefficients of all lagged level variables +entering the error correction term ($F_{ov}$), and a $t$-test for the +coefficient of the lagged dependent variable. When either the dependent +or the independent variables do not appear in the long-run relationship, +a degenerate case arises. The bound $t$-test provides answers on the +occurrence of a degenerate case of second type, while the occurrence of +a degeneracy case of first type can be assessed by testing whether the +dependent variable is of integration order I(1). This type of check +violates the spirit and motivation of the bound tests, which are +supposed to be applicable in situations of unknown order of integration +for the variables.\ +Recently, @mcnown2018bootstrapping pointed out how, due to the low power +problem of unit root tests, investigating the presence of a first type +degeneracy by testing the integration order of the dependent variable +may lead to incorrect conclusions. 
Therefore, they suggested checking +for its occurrence by testing the significance of the lagged levels of +the independent variables via an extra $F$-test ($F_{ind}$), which was +also worked out in its asymptotic version [SMK; @sam2019augmented].\ +Besides problems in testing the occurrence of degenerate cases, in +general, the main drawback of the bound tests is the occurrence of +potentially inconclusive results, if the test statistic lies between the +bounds of the test distribution under the null. Furthermore, the +asymptotic distributions of the statistics may provide a poor +approximation of the true distributions in small samples. Finite sample +critical values, even if only for a subset of all possible model +specifications, have been worked out in the literature [see +@mills2001real; @narayan2004crime; @kanioura2005critical; @narayan2005saving], +while [@kripfganz2020response] provided the quantiles of the asymptotic +distributions of the tests as functions of the sample size, the lag +order and the number of long-run forcing variables. However, this +relevant improvement does not eliminate the uncertainty related to the +inconclusive regions, or the existence of other critical issues related +to the underlying assumptions of the bound test framework, such as the +(weak) exogeneity of the independent variables or the non-stationarity +of the dependent variable.\ +To overcome the mentioned bound test drawbacks, [@bertelli2022bootstrap] +proposed bootstrapping the ARDL cointegration test. Inference can always +be pursued with ARDL bootstrap tests, unlike what happens with both the +PSS tests and the SMK test on the independent variables. Bootstrap ARDL +tests were first put forward by [@mcnown2018bootstrapping] in an +unconditional ARDL model, which omits the instantaneous differences of +the exogenous variables in the ARDL equation, rather than a conditional +one, as originally proposed by [@pesaran2001]. 
The unconditional model +is often used, for reason of practical convenience, in empirical +research. Simulation results in [@bertelli2022bootstrap] have +highlighted the importance of employing the appropriate specification, +especially under degenerate cases. In fact, it has been pointed out that +a correct detection of these cases requires the comparison of the test +outcomes in both the conditional and unconditional settings. Erroneous +conclusions, based exclusively on one model specification, can thus be +avoided.\ +In this paper, bootstrap bound tests, thereby including the bootstrap +versions of the $F_{ov}$, $t$ and $F_{ind}$ bound tests, are carried out +in a conditional ARDL model setting. This approach allows to overcome +the problem of inconclusive regions of the standard bound tests. A +comparison with the outcomes engendered by the unconditional ARDL +bootstrap tests is nevertheless provided for the $F_{ind}$ test, to +avoid erroneous inference in presence of degenerate cases.\ +The paper is organized as follows. Section [2](#sec:cointegration) +introduces the theoretical results of the ARDL cointegration bound +tests. Section [3](#sec:boot) details the steps carried out by the +bootstrap procedure, which allows the construction of the (bootstrap) +distribution - under the null - for the $F_{ov}$, $t$, conditional +$F_{ind}$ and unconditional $F_{ind}$ tests. Section [4](#sec:pkg) +introduces the `R` package +[**bootCT**](https://CRAN.R-project.org/package=bootCT) [@bootCT] and +its functionalities: a method for the generation of random multivariate +time series that follow a user-specified VECM/ARDL structure, with some +examples, and the main function that carries out the aforementioned +bootstrap tests, while also computing the PSS and SMK bound tests. The +trade-off between accuracy and computational time of the bootstrap +procedure is also investigated, under several scenarios in terms of +sample size and number of replications. 
Notably, a function that +performs the PSS bound tests is already available in the +[**dynamac**](https://CRAN.R-project.org/package=dynamac) package +[@PKGDYNAMAC], while no `R` routine has so far been implemented for the +SMK test, to the best of our knowledge. Section [5](#sec:app) gives some +empirical applications that employ the core function of the package and +its possible outputs. Section [6](#sec:end) concludes. Appendix +[7](#sec:appendix) briefly delves into technical details of the +conditional ARDL model and its possible specifications [^1]. + +## Cointegration bound tests in ARDL models {#sec:cointegration} + +The starting point of the approach proposed by  [@pesaran2001] is a +$(K+1)$ VAR($p$) model +$$ +\mathbf{A}(L)(\mathbf{z}_t-\boldsymbol{\mu}-\boldsymbol{\eta}t)=\boldsymbol{\varepsilon}_t \enspace \enspace \enspace \boldsymbol{\varepsilon}_t\sim N(\mathbf{0}, \boldsymbol{\Sigma}),\qquad\mathbf{A}(L)=\left(\mathbf{I}_{K+1}- \sum_{j=1}^{p}\mathbf{A}_j\mathbf{L}^j\right) +\enspace \enspace \enspace t=1,2,\dots,T. (\#eq:var)$$ +Here, $\mathbf{A}_j$ are square $(K+1)$ matrices, $\mathbf{z}_t$ a +vector of $(K+1)$ variables, $\boldsymbol{\mu}$ and $\boldsymbol{\eta}$ +are $(K+1)$ vectors representing the drift and the trend respectively, +and $\det(\mathbf{A}(z))=0$ for $|z| \geq 1$. If the matrix +$\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}$ is +singular, the components of $\mathbf{z}_t$ turn out to be integrated and +possibly cointegrated.\ +The VECM representation of \@ref(eq:var) is given by (see Appendix +[7.1](#sec:appendixa) for details) +$$ +\Delta\mathbf{z}_t=\boldsymbol{\alpha}_{0}+\boldsymbol{\alpha}_{1}t-\mathbf{A}(1)\mathbf{z}_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta \mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_t. 
(\#eq:vecm)$$ +Now, to study the adjustment to the equilibrium of a single variable +$y_t$, given the other $\mathbf{x}_t$ variables, the vectors +$\mathbf{z}_t$ and $\boldsymbol{\varepsilon}_t$ are partitioned +$$ +\mathbf{z}_t=\begin{bmatrix} +\underset{(1,1)}{y_{t}} \\ \underset{(K,1)}{\mathbf{x}_{t}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\varepsilon}_t=\begin{bmatrix} +\underset{(1,1)}{\varepsilon_{yt}} \\ \underset{(K,1)}{\boldsymbol{\varepsilon}_{xt}} +\end{bmatrix}. (\#eq:vecpart)$$ +The matrix $\mathbf{A}(1)$, which is assumed to be singular to allow +cointegration, is partitioned conformably to $\mathbf{z}_{t}$ as [^2]\ + +$$\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}.$$ +Under the assumption +$$ +\boldsymbol{\varepsilon}_t \sim N\Bigg(\mathbf{0}, \begin{bmatrix} +\underset{(1,1)}{\sigma_{yy}}& \underset{(1,K)}{\boldsymbol{\sigma}_{yx}'} \\ \underset{(K,1)}{\boldsymbol{\sigma}_{xy}} & \underset{(K,K)}{\boldsymbol{\Sigma}_{xx}} \end{bmatrix}\Bigg), (\#eq:normerr)$$ +the following holds +$$ +\varepsilon_{yt}=\boldsymbol{\omega}'\boldsymbol{\varepsilon}_{xt}+\nu_{yt} \sim N(0,\sigma_{y.x}), (\#eq:epsilonx)$$ +where +$\sigma_{y.x}=\sigma_{yy}-\boldsymbol{\omega}'\boldsymbol{\sigma}_{xy}$ +with +$\boldsymbol{\omega}'=\boldsymbol{\sigma}'_{yx}\boldsymbol{\Sigma}^{-1}_{xx}$, +and $\nu_{yt}$ is independent of $\boldsymbol{\varepsilon}_{xt}$.\ +Substituting \@ref(eq:epsilonx) into \@ref(eq:vecm) and assuming that +the $\mathbf{x}_{t}$ variables are exogenous towards the ARDL parameters +(that is, setting $\mathbf{a}_{xy}=\mathbf{0}$ in $\mathbf{A}(1)$) +yields the system (see Appendix [7.1](#sec:appendixa) for details) +$$ + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt} (\#eq:ardl)$$ + 
+$$ +\Delta\mathbf{x}_{t} += \boldsymbol{\alpha}_{0x} +\boldsymbol{\alpha}_{1x}t+ \mathbf{A}_{(x)}\mathbf{z}_{t-1}+ \boldsymbol{\Gamma}_{(x)}(L)\Delta\mathbf{z}_t+ \boldsymbol{\varepsilon}_{xt}, (\#eq:marg)$$ +where +$$ +\boldsymbol\gamma_{y.x,j}'=\boldsymbol\gamma_{y,j}'-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x),j} (\#eq:ardlgamma)$$ + +$$ +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x}, (\#eq:ardldet)$$ +and where the error correction term, $EC_{t-1}$, expressing the long-run +equilibrium relationship between $y_{t}$ and $\mathbf{x}_{t}$, is given +by +$$ +EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}, (\#eq:ec)$$ +with +$$ +\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}, \enspace\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}'}_{y.x}}{a_{yy}}=-\frac{\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}}{a_{yy}}. (\#eq:const)$$ +Thus, no cointegration occurs when +$\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}$ or $a_{yy}=0$ . These two +circumstances are referred to as degenerate case of second and first +type, respectively. 
Degenerate cases imply no cointegration between +$y_{t}$ and $\mathbf{x}_{t}$.\ +To test the hypothesis of cointegration between $y_{t}$ and +$\mathbf{x}_{t}$, @pesaran2001 proposed an $F$-test, $F_{ov}$ hereafter, +based on the hypothesis system +\begin{align} +H_0: a_{yy}=0 \; \cap \;\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}\\ +H_1: a_{yy} \neq 0 \; \cup \;\widetilde{\mathbf{a}}_{y.x}\neq \mathbf{0}.(\#eq:h0sys) +\end{align} +Note that $H_{1}$ covers also the degenerate cases +\begin{align} +H_1^{y.x}: a_{yy}=0 \; , \;\widetilde{\mathbf{a}}_{y.x}\neq\mathbf{0}\\ +H_1^{yy}: a_{yy} \neq 0 \; , \;\widetilde{\mathbf{a}}_{y.x} = \mathbf{0}.(\#eq:h0deg) +\end{align} +The exact distribution of the $F$ statistic under the null is unknown, +but it is limited from above and below by two asymptotic distributions: +one corresponding to the case of stationary regressors, and another +corresponding to the case of first-order integrated regressors. As a +consequence, the test is called bound test and has an inconclusive area. +[^3]\ + @pesaran2001 worked out two sets of (asymptotic) critical values: one, +$\{\tau_{L,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(0)$ and +another, $\{\tau_{U,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(1)$. +These values vary in accordance with the number of regressors in the +ARDL equation, the sample size and the assumptions made about the +deterministic components (intercept and trend) of the data generating +process.\ +In this regard,  @pesaran2001 introduced five different specifications +for the ARDL model, depending on its deterministic components, which are +(see Appendix [7.2](#sec:appendixb) for details) + +I. *No intercept and no trend* + \begin{align} + \Delta y_t=-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},(\#eq:case1) + \end{align} + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$,\ + +II. 
*Restricted intercept and no trend* + \begin{align} + \Delta y_{t}= + -a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},(\#eq:case2) + \end{align} + where + $EC_{t-1}=y_{t-1}-\theta_{0}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + The intercept extracted from the EC term is + $\alpha_{0.y}^{EC} = a_{yy}\theta_0$. + +III. *Unrestricted intercept and no trend* + \begin{align} + \Delta y_{t} + =\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},(\#eq:case3) + \end{align} + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + +IV. *Unrestricted intercept, restricted trend* + \begin{align} + \Delta y_{t}= + \alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},(\#eq:case4) + \end{align} + where + $EC_{t-1}=y_{t-1}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + The trend extracted from the EC term is + $\alpha_{1.y}^{EC} = a_{yy}\theta_1$. + +V. *Unrestricted intercept, unrestricted trend* + \begin{align} + \Delta y_{t} + =\alpha_{0.y}+\alpha_{1.y}t + -a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, (\#eq:case5) + \end{align} + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + +The model in \@ref(eq:ardl) proposed by  @pesaran2001 represents the +correct framework in which to carry out bound tests. 
However, bound test +are often performed in an unconditional ARDL model setting, specified as +$$ + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{j}\Delta\mathbf{z}_{t-j}+\varepsilon_{yt}, (\#eq:ardluc)$$ +which omits the term $\boldsymbol{\omega}'\Delta\mathbf{x}_{t}$.\ +[@bertelli2022bootstrap] have highlighted that bootstrap tests performed +in these two ARDL specifications can lead to contrasting results. To +explain this divergence, note that the conditional model makes use of +the following vector in the EC term +$$\widetilde{\mathbf{a}}_{y.x}'=\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}$$ +(divided by $a_{yy}$, see \@ref(eq:const)) to carry out bound tests, +while the unconditional one only uses the vector $\mathbf{a}_{yx}'$, +(divided by $a_{yy}$), since it neglects the term +$\boldsymbol{\omega}'\mathbf{A}_{xx}$. [^4] This can lead to contrasting +inference in two instances. The first happens when a degeneracy of first +type occurs in the conditional model, that is +$$ +\widetilde{\mathbf{a}}_{y.x}'=\mathbf{0}, (\#eq:deg1cond)$$ +because +$$\mathbf{a}_{yx}'=\boldsymbol{\omega}'\mathbf{A}_{xx}.$$ +In this case, the conditional model rejects cointegration, while the +unconditional one concludes the opposite. The other case happens when a +degeneracy of first type occurs in the unconditional model, that is +$$ +\mathbf{a}_{yx}'=\mathbf{0}, (\#eq:deg1uc)$$ +but +$$\widetilde{\mathbf{a}}_{y.x}'=\boldsymbol{\omega}'\mathbf{A}_{xx} \neq \mathbf{0}.$$ +In this case, the unconditional model rejects cointegration, while the +conditional one concludes for the existence of cointegrating +relationships, which are however spurious. Only a comparison of the +outcomes of the $F_{ind}$ test performed in both the conditional and +unconditional ARDL equation can help to disentangle this problem. [^5]\ +In the following, bootstrap tests are carried out in the conditional +ARDL model \@ref(eq:ardl). 
However, when a degeneracy of first type +occurs in the unconditional model, the outcomes of the $F_{ind}$ +bootstrap test performed in both the conditional and unconditional +settings are provided. This, as previously outlined, is performed to +avoid the acceptance of spurious long-run relationships among the +dependent variable and the independent variables. + +## The new bootstrap procedure {#sec:boot} + +The bootstrap procedure here proposed focuses on a ARDL model specified +as in \@ref(eq:case1)-\@ref(eq:case5), depending on the assumptions on +the deterministic components.\ +The bootstrap procedure consists of the following steps: + +1. The ARDL model is estimated via OLS and the related test statistics + $F_{ov}$, $t$ or $F_{ind}$ are computed. + +2. In order to construct the distribution of each test statistic under + the corresponding null, the same model is re-estimated imposing the + appropriate restrictions on the coefficients according to the test + under consideration. + +3. Following [@mcnown2018bootstrapping], the ARDL restricted residuals + are then computed. For example, under Case III, the residuals are + $$ + \widehat{\nu}_{yt}^{F_{ov}}=\Delta y_{t}-\widehat{\alpha}_{0.y}-\sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} (\#eq:resfov)$$ + + $$ + \widehat{\nu}_{yt}^{t}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{\widetilde{\mathbf{a}}}'_{y.x}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} (\#eq:rest)$$ + + $$ + \widehat{\nu}_{yt}^{F_{ind}}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{a}_{yy}y_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t}. (\#eq:resfind)$$ + Here, the apex $"\widehat{\,\,.\,\,}"$ denotes the estimated + parameters. The other cases can be dealt with in a similar manner. 
+
+4. The VECM model
+
+    $$
+    \Delta\mathbf{z}_{t}=\boldsymbol{\alpha}_{0}-\mathbf{A}\mathbf{z}_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_{t} (\#eq:vecmhat)$$
+    is estimated as well (imposing weak exogeneity), and the residuals
+
+    $$
+    \widehat{\boldsymbol{\varepsilon}}_{xt}= \Delta\mathbf{x}_{t}-\widehat{\boldsymbol{\alpha}}_{0x}+\widehat{\mathbf{A}}_{xx}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\Gamma}}_{(x)j}\Delta\mathbf{z}_{t-j} (\#eq:resvecm)$$
+    are computed. This approach guarantees that the residuals
+    $\widehat{\boldsymbol{\varepsilon}}_{xt}$, associated to the
+    variables $\mathbf{x}_{t}$ explained by the marginal model
+    \@ref(eq:marg), are uncorrelated with the ARDL residuals
+    $\widehat{\nu}_{yt}^{.}$.
+
+5. A large set of $B$ bootstrap replicates are sampled from the
+    residuals calculated as in \@ref(eq:resfov),\@ref(eq:rest),
+    \@ref(eq:resfind) and \@ref(eq:resvecm). In each replication, the
+    following operations are carried out:
+
+    1. Each set of $(T-p)$ resampled residuals (with replacement)
+       $\widehat{\boldsymbol{\nu}}_{zt}^{(b)}=(\widehat{\nu}_{yt}^{(b)},\widehat{\boldsymbol{\varepsilon}}_{xt}^{(b)})$
+       is re-centered [see @davidson2005case]
+       \begin{align}
+       \dot{\widehat{\nu}}^{(b)}_{yt}&=\widehat{\nu}^{(b)}_{yt} -\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\nu}^{(b)}_{yt} (\#eq:recentery) \\
+       \dot{\widehat{\boldsymbol{\varepsilon}}}^{b}_{x_{i}t}&=\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}-\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}\qquad i=1,\dots,K.(\#eq:recenterx)
+       \end{align}
+
+    2. 
A sequential set of $(T-p)$ bootstrap observations,
+       $y^{*}_{t}\enspace, \mathbf{x}^{*}_{t}\enspace t=p+1,\dots,T$,
+       is generated as follows
+       $$y^{*}_{t}=y^{*}_{t-1}+\Delta y^{*}_{t}, \enspace \enspace \mathbf{x}^{*}_{t}=\mathbf{x}^{*}_{t-1}+\Delta \mathbf{x}^{*}_{t},$$
+       where $\Delta \mathbf{x}^{*}_{t}$ are obtained from
+       \@ref(eq:resvecm) and $\Delta y^{*}_{t}$ from either
+       \@ref(eq:resfov), \@ref(eq:rest) or \@ref(eq:resfind) after
+       replacing in each of these equations the original residuals with
+       the bootstrap ones.\
+       The initial conditions, that is the observations before $t=p+1$,
+       are obtained by drawing randomly $p$ observations in block from
+       the original data, so as to preserve the data dependence
+       structure.
+
+    3. An unrestricted ARDL model is estimated via OLS using the
+       bootstrap observations, and the statistics $F_{ov}^{(b),H_0}$,
+       $t^{(b),H_0}$ and $F_{ind}^{(b),H_0}$ are computed.
+
+6. The bootstrap distributions of
+    $\big\{F_{ov}^{(b),H_0}\big\}_{b=1}^B$,
+    $\big\{F_{ind}^{(b),H_0}\big\}_{b=1}^B$ and
+    $\big\{t^{(b),H_0}\big\}_{b=1}^B$ under the null are then employed
+    to determine the critical values of the tests. By denoting with
+    $M^*_b$ the ordered bootstrap test statistic, and with $\alpha$ the
+    nominal significance level, the bootstrap critical values are
+    determined as follows
+    $$
+    c^*_{\alpha,M}=\min\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{M^*_b >c\}} \leq\alpha\bigg\}
+    \qquad M\in\{F_{ov},F_{ind}\} (\#eq:bootf)$$
+    for the $F$ tests and
+    $$
+    c^*_{{\alpha,t}}=\max\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{t^*_b < c\}} \leq\alpha\bigg\} (\#eq:boott)$$
+    for the $t$ test. The coefficients of variation of the bootstrap
+    critical values tend to stabilize for $T>80$, at
+the 5% significance level. Therefore, a number of bootstrap replicates
+of at least $B=1000$ is recommended for larger sample sizes, or at
+least $B=2000$ for smaller samples. The analysis has been carried out
+using an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of
+RAM. 
+ +::: {#tab:exec} + ------------------------------------------------------------------------------------------------------ + $T$ $B$ Exec. Time (sec) $cv^{(F_{ov})}(5\%)$ $cv^{(F_{ov})}(2.5\%)$ $cv^{(F_{ov})}(1\%)$ + ----- ------ ------------------ ---------------------- ------------------------ ---------------------- + 50 200 23.38 8.648 10.925 13.392 + + 50 500 48.37 6.312 6.952 8.640 + + 50 1000 96.65 4.806 5.613 6.288 + + 50 2000 231.15 4.255 4.226 4.946 + + 80 200 23.46 7.251 8.936 11.263 + + 80 500 50.19 4.998 6.220 7.946 + + 80 1000 143.00 3.882 4.453 5.305 + + 80 2000 255.64 2.912 3.623 4.518 + + 100 200 37.89 7.707 8.583 10.955 + + 100 500 52.86 4.691 5.304 7.557 + + 100 1000 184.51 3.512 4.567 5.695 + + 100 2000 212.65 3.519 3.674 4.185 + + 200 200 35.46 6.644 7.173 10.365 + + 200 500 76.78 4.734 5.355 6.225 + + 200 1000 148.25 3.124 4.177 5.034 + + 200 2000 484.51 2.811 3.361 3.907 + + 500 200 54.47 6.641 8.694 10.414 + + 500 500 133.17 5.137 5.816 6.408 + + 500 1000 271.87 3.905 4.585 5.283 + + 500 2000 561.71 3.221 3.490 4.145 + ------------------------------------------------------------------------------------------------------ + + Table 1: Average execution times (in seconds) of the `boot_ardl` + function, for different combinations of sample size $T$ and bootstrap + replicates $B$. Coefficients of variation ($cv$) reported for the + $F_{ov}$ bootstrap critical values at level 5%, 2.5% and 1%. +::: + +## Empirical applications {#sec:app} + +This section provides two illustrative application which highlight the +performance of the bootstrap ARDL tests. + +### An application to the German macroeconomic dataset + +In the first example, the occurrence of a long-run relationship between +consumption \[C\], income \[INC\], and investment \[INV\] of Germany has +been investigated via a set of ARDL models, where each variable takes in +turn the role of dependent one, while the remaining are employed as +independent. 
The models have been estimated by employing the dataset of +@lutkepohl2005 which includes quarterly data of the series over the +years 1960 to 1982. The data have been employed in logarithmic form. +Figure \@ref(fig:figplotemp) displays these series over the sample +period.\ +Before applying the bootstrap procedure, the order of integration of +each series has been analyzed. Table [2](#tab:adf) shows the results of +ADF test performed on both the series and their first-differences ($k=3$ +maximum lags). The results confirm the applicability of the ARDL +framework as no series is integrated of order higher than one.\ +The following ARDL equations have been estimated: + +1. First ARDL equation (C | INC, INV): + \begin{align} + \Delta \log \text{C}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{C}_{t-1} - {a}_{y.x_1}\log \text{INC}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{INC}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + + \end{align} + +2. Second ARDL equation (INC | C, INV): + \begin{align} + \Delta \log \text{INC}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INC}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + + \end{align} + +3. 
Third ARDL equation (INV | C, INC): + \begin{align} + \Delta \log \text{INV}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INV}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INC}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INV}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INC}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INC}_{t}+\nu_{t}. + + \end{align} + +Table [3](#tab:est) shows the estimation results for each ARDL and VECM +model. It is worth noting that the instantaneous difference of the +independent variables are highly significant in each conditional ARDL +model. Thus, neglecting these variables in the ARDL equation, as happens +in the unconditional version of the model, may potentially lead to +biased estimates and incorrect inference. For the sake of completeness, +also the results of the marginal VECM estimation are reported for each +model.\ +The code to prepare the data, available in the package as the +`ger_macro` dataset, is: + +``` r + data("ger_macro") + LNDATA = apply(ger_macro[,-1], 2, log) + col_ln = paste0("LN", colnames(ger_macro)[-1]) + LNDATA = as.data.frame(LNDATA) + colnames(LNDATA) = col_ln +``` + +Then, the `boot_ardl` function is called, to perform the bootstrap +tests. In the code chunk below, Model I is considered. 
+
+``` r
+  set.seed(999)
+  BCT_res_CONS = boot_ardl(data = LNDATA,
+                           yvar = "LNCONS",
+                           xvar = c("LNINCOME", "LNINVEST"),
+                           maxlag = 5,
+                           a.ardl = 0.1,
+                           a.vecm = 0.1,
+                           nboot = 2000,
+                           case = 3,
+                           a.boot.H0 = c(0.05),
+                           print = T)
+```
+
+followed by a call to the `summary` function
+
+``` r
+  summary(BCT_res_CONS, out = "ARDL")
+  summary(BCT_res_CONS, out = "VECM")
+  summary(BCT_res_CONS, out = "cointVECM")
+  summary(BCT_res_CONS, out = "cointARDL")
+```
+
+The first summary line displays the output in the ARDL column of Table
+[3](#tab:est) and the second column of Table [4](#tab:cointbig), Model
+I. The second line corresponds to the VECM columns of Table
+[3](#tab:est), Model I - only for the independent variables. The
+information on the rank of the $\mathbf A_{xx}$ in Table [3](#tab:est)
+is inferred from the third line. Finally, the fourth summary line
+corresponds to the test results in Table [4](#tab:cointbig), Model I. A
+textual indication of the presence of spurious cointegration is
+displayed at the bottom of the `"cointARDL"` summary, if detected.\
+In this example, the bootstrap and bound testing procedures are in
+agreement only for model I, indicating the existence of a cointegrating
+relationship. Additionally, no spurious cointegration is detected for
+this model. As for models II and III, the null hypothesis is not
+rejected by the bootstrap tests, while the PSS and SMK bound tests fail
+to give a conclusive answer in the $F_{ind}$ test.\
+The running time of the entire analysis is roughly 11 minutes, using
+an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM. 
+ +:::: center +::: {#tab:adf} + ----------------------------------------------------------------------------------- + level variable first difference + -------------------- ----- ---------------- --------- ------------------ ---------- + Series lag ADF p.value ADF p-value + + $\log\text{C}_t$ 0 -1.690 0.450 -9.750 <0.01 + + 1 -1.860 0.385 -5.190 <0.01 + + 2 -1.420 0.549 -3.130 0.030 + + 3 -1.010 0.691 -2.720 0.080 + + $\log\text{INC}_t$ 0 -2.290 0.217 -11.140 <0.01 + + 1 -1.960 0.345 -7.510 <0.01 + + 2 -1.490 0.524 -5.120 <0.01 + + 3 -1.310 0.587 -3.290 0.020 + + $\log\text{INV}_t$ 0 -1.200 0.625 -8.390 <0.01 + + 1 -1.370 0.565 -5.570 <0.01 + + 2 -1.360 0.570 -3.300 0.020 + + 3 -1.220 0.619 -3.100 0.032 + ----------------------------------------------------------------------------------- + + : Table 2: ADF preliminary test (null hypothesis: random walk with + drift). +::: +:::: + +::: center +```{r figplotemp, echo=FALSE , fig.cap="log-consumption/investment/income graphs (level variables and first differences). 
Made with ggplot.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/tsgraph.png")) +``` +::: + +::: {#tab:est .l-screen-inset} +| | | Model I | | | Model II | | | Model III | | +|---------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------| +| | ARDL | VECM | | ARDL | VECM | | ARDL | VECM | | +| | $\Delta\log\text{C}_t$ | $\Delta\log\text{INV}_t$ | $\Delta\log\text{INC}_t$ | $\Delta\log\text{INC}_t$ | $\Delta\log\text{C}_t$ | $\Delta\log\text{INV}_t$ | $\Delta\log\text{INV}_t$ | $\Delta\log\text{C}_t$ | $\Delta\log\text{INC}_t$ | +| | | | | | | | | | | +| $\log\text{C}_{t-1}$ | -0.307 *** (0.055) | | | 0.168 * (0.081) | -0.0011 (0.0126) | 0.1286 * (0.0540) | 0.611 . (0.339) | -0.2727 *** (0.0704) | -0.0508 (0.0796) | +| $\log\text{INC}_{t-1}$ | 0.297 *** (0.055) | 0.124 * (0.054) | -0.017 (0.014) | -0.183 * (0.079) | | | -0.491 (0.340) | 0.2619 *** (0.0681) | 0.0464 (0.0772) | +| $\log\text{INV}_{t-1}$ | -0.001 (0.011) | -0.152 * (0.063) | 0.016 (0.017) | 0.0209 (0.0135) | -0.00107 (0.0142) | -0.1531 * (0.0607) | -0.1212 * (0.060) | | | +| | | | | | | | | | | +| $\Delta\log\text{C}_{t-1}$ | -0.248 ** (0.079) | 0.899 * (0.442) | 0.211 . (0.113) | 0.375 *** (0.1086) | | 0.9288 * (0.442) | 1.113 * (0.441) | | 0.2072 . (0.1142) | +| $\Delta\log\text{C}_{t-2}$ | | 0.744 (0.431) | | | | 0.8049 . (0.4345) | | | | +| $\Delta\log\text{INC}_{t-1}$ | | | | -0.1404 (0.1095) | | | | | | +| $\Delta\log\text{INC}_{t-2}$ | | | | | 0.2675 ** (0.0958) | | | 0.1522 . (0.0912) | | +| $\Delta\log\text{INV}_{t-1}$ | | -0.18 (0.111) | 0.035 (0.029) | | | -0.189 . (0.1097) | -0.175 (0.1075) | | 0.0479 . (0.0282) | +| $\Delta\log\text{INV}_{t-2}$ | | | 0.049 . 
(0.027) | | 0.0591 * (0.0245) | | | 0.0578 * (0.0223) | 0.0562 * (0.0266) | +| | | | | | | | | | | +| $\Delta\log\text{C}_t$ | | | | 0.7070 *** (0.1093) | | | 1.8540 *** (0.5425) | | | +| $\Delta\log\text{INC}_t$ | 0.471 *** (0.074) | | | | | | -0.445 *** (0.4726) | | | +| $\Delta\log\text{INV}_t$ | 0.065 ** (0.019) | | | -0.0230 (0.025) | | | | | | +| const. | 0.048 *** (0.013) | 0.036 (0.066) | 0.033 * (0.017) | 0.002 (0.018) | 0.0266 . (0.0155) | 0.023 (0.0666) | -0.056 (0.072) | 0.0517 ** (0.0157) | 0.0378 * (0.0177) | +| J-test | | $rk(\mathbf{A_{xx}})=2$ | | | $rk(\mathbf{A_{xx}})=2$ | | | $rk(\mathbf{A_{xx}})=2$ | | + + : Table 3: Conditional ARDL and VECM results for the + consumption/income/investment dataset, along with rank of the + $\mathbf A_{xx}$ matrix via the Johansen (J) test.\ + Significance codes: (\*\*\*) 1%; (\*\*) 5%; (.) 10%. +::: + +::: {#tab:cointbig} + ------------------------------------------------------------------------------------------------------------------- + PSS / SMG Threshold Outcome + ------- --------- ----------- ----------------------- --------------------- --------- ----------- --------- ------- + Model Lags Test Boot. Critical Values I(0) 5% I(1) 5% Statistic Boot Bound + + I (1,0,0) $F_{ov}$ 3.79 3.79 4.85 10.75 Y Y + + $t$ -2.88 -2.86 -3.53 -5.608 + + $F_{ind}$ 4.92 3.01 5.42 15.636 + + II (1,1,0) $F_{ov}$ 5.79 3.79 4.85 2.867 N U + + $t$ -3.69 -2.86 -3.53 -2.315 + + $F_{ind}$ 7.38 3.01 5.42 3.308 + + III (1,1,0) $F_{ov}$ 5.50 3.79 4.85 3.013 N U + + $t$ -3.32 -2.86 -3.53 -2.020 + + $F_{ind}$ 6.63 3.01 5.42 4.189 + ------------------------------------------------------------------------------------------------------------------- + + : Table 4: Cointegration analysis for the three ARDL equations in the + German macroeconomic data. 
The optimal number of ARDL lags in the + short-run - in the form $(y,x_1,x_2)$, matching the model definition - + bootstrap critical values, bound test thresholds and test statistics + for each test are shown (case III).\ + The outcome columns draw conclusions on each type of model (bootstrap + or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of + type 1, D2 = degenerate of type 2, U = inconclusive inference. +::: + +### An application on Italian Macroeconomic Data + +Following @bertelli2022bootstrap, the relationship between foreign +direct investment \[FDI\], exports \[EXP\], and gross domestic product +\[GDP\] in Italy is investigated. The data of these three yearly +variables have been retrieved from the World Bank Database and cover the +period from 1970 to 2020. In the analysis, the log of the variables has +been used and \[EXP\] and \[FDI\] have been adjusted using the GDP +deflator. Figure \@ref(fig:figplotemp2) displays these series over the +sample period. + +::: center +```{r figplotemp2, echo=FALSE , fig.cap="log-GDP/export/investment graphs (level variables and first differences). Made with ggplot.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/tsgraph2.png")) +``` +::: + +Table [5](#tab:gdp1) shows the outcomes of the ADF test performed on +each variable, which ensures that the integration order is not higher +than one for all variables. Table [6](#tab:cointbig2) shows the results +of bound and bootstrap tests performed in ARDL model by taking each +variable, in turn, as the dependent one. The following ARDL equations +have been estimated: + +1. 
First ARDL equation (GDP | EXP, FDI): + \begin{align} + \Delta \log \text{GDP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{GDP}_{t-1} - {a}_{y.x_1}\log \text{EXP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{EXP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}. + + \end{align} + For this model, a degenerate case of the first type can be + observed, while the simpler bound testing procedure does not signal + cointegration. + +2. Second ARDL equation (EXP | GDP, FDI): + \begin{align} + \Delta \log \text{EXP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{EXP}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{GDP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}. + + \end{align} + For this model, the ARDL bootstrap test indicates absence of + cointegration, while the bound testing approach is inconclusive for + the $F_{ind}$ test. + +3. Third ARDL equation (FDI | GDP, EXP): + \begin{align} + \Delta \log \text{FDI}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{FDI}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{EXP}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{FDI}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{EXP}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{GDP}_{t}+ + \omega_2 \Delta\log \text{EXP}_{t}+\nu_{t}. + + \end{align} + For this model, the long-run cointegrating relationship is confirmed + using both boostrap and bound testing. 
No spurious cointegration is + detected. + +The code to load the data and perform the analysis (e.g. for Model I) +is: + +``` r + data("ita_macro") + BCT_res_GDP = boot_ardl(data = ita_macro, + yvar = "LGDP", + xvar = c("LEXP", "LFI"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) +``` + +For the sake of simplicity, the conditional ARDL and VECM marginal +models outputs included in each cointegrating analysis is omitted. The +summary for the cointegration tests for Model I is called via + +``` r + summary(BCT_res_GDP, out = "ARDL") # extract lags + summary(BCT_res_GDP, out ="cointARDL") # ARDL cointegration +``` + +This empirical application further highlights the importance of dealing +with inconclusive inference via the bootstrap procedure, while naturally +including the effect of conditioning in the ARDL model, as highlighted +in @bertelli2022bootstrap. + +::: {#tab:gdp1 .l-page} +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- + No Drift, No Trend Drift, No Trend Drift and Trend +--------------------------- -------------------- ---------------------------- --------- --------- ----------------- --------- --------- --------- ----------------- --------- --------- --------- +Variable Lag = 0 Lag = 1 Lag = 2 Lag = 3 Lag = 0 Lag = 1 Lag = 2 Lag = 3 Lag = 0 Lag = 1 Lag = 2 Lag = 3 + +$\log \text{GDP}_t$ 0.99 0.974 0.941 0.796 <0.01 <0.01 <0.01 0.084 0.99 0.99 0.99 0.99 + +$\log \text{FDI}_t$ 0.572 0.599 0.675 0.725 <0.01 0.0759 0.3199 0.5174 <0.01 0.013 0.151 0.46 + +$\log \text{EXP}_t$ 0.787 0.71 0.698 0.684 0.479 0.288 0.467 0.433 0.629 0.35 0.463 0.379 + +$\Delta\log \text{GDP}_t$ <0.01 <0.0164 0.0429 0.0402 <0.01 0.0861 0.3989 0.4267 <0.01 <0.01 0.0166 0.017 + +$\Delta\log \text{FDI}_t$ <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 + +$\Delta\log \text{EXP}_t$ 
<0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 <0.01 0.0336 0.0315 +--------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 5: ADF preliminary test for the second example. +::: + +::: {#tab:cointbig2 .l-page} + ------------------------------------------------------------------------------------------------------------------- + PSS / SMG Threshold Outcome + ------- --------- ----------- ----------------------- --------------------- --------- ----------- --------- ------- + Model Lags Test Boot. Critical Values I(0) 5% I(1) 5% Statistic Boot Bound + + I (1,1,0) $F_{ov}$ 3.730 4.070 5.190 9.758 D1 N + + $t$ -2.020 -2.860 -3.530 -2.338 + + $F_{ind}$ 3.710 3.220 5.620 2.273 + + II (1,0,0) $F_{ov}$ 5.400 4.070 5.190 2.649 N U + + $t$ -3.380 -2.860 -3.530 -1.889 + + $F_{ind}$ 5.630 3.220 5.620 3.481 + + III (1,0,0) $F_{ov}$ 5.360 4.070 5.190 6.716 Y Y + + $t$ -3.550 -2.860 -3.530 -4.202 + + $F_{ind}$ 6.500 3.220 5.620 7.017 + ------------------------------------------------------------------------------------------------------------------- + + : Table 6: Cointegration analysis for the three ARDL equations in the + Italian macroeconomic data. The optimal number of ARDL lags in the + short-run - in the form $(y,x_1,x_2)$, matching the model definition - + bootstrap critical values, bound test thresholds and test statistics + for each test are shown (case III).\ + The outcome columns draw conclusions on each type of model (bootstrap + or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of + type 1, D2 = degenerate of type 2, U = inconclusive inference. +::: + +## Conclusion {#sec:end} + +The [**bootCT**](https://CRAN.R-project.org/package=bootCT) package +allows the user to perform bootstrap cointegration tests in ARDL models +by overcoming the problem of inconclusive inference which is a +well-known drawback of standard bound tests. 
The package makes use of +different functions. The function `boot_ardl` performs the bootstrap +tests, and it acts as a wrapper of both the bootstrap and the standard +bound tests, including also the Johansen test on the independent +variables of the model. Finally, it also performs the bound $F$-test on +the lagged independent variables, so far not available in other extant +`R` packages. The function `sim_vecm_ardl`, which allows the simulation +of multivariate time series data following a user-defined DGP, enriches +the available procedures for multivariate data generation, while the +function `lag_mts` provides a supporting tool in building datasets of +lagged variables for any practical purpose. Finally, the use of Rcpp +functions gives a technical advantage in terms of computational speed, +performing the bootstrap analysis within an acceptable time frame. + +## Appendix {#sec:appendix} + +### Section A - the methodological framework of (conditional) VECM and ARDL models {#sec:appendixa} + +Expanding the matrix polynomial $\mathbf{A}(z)$ about $z=1$, yields +$$ +\mathbf{A}(z)=\mathbf{A}(1)z+(1-z)\boldsymbol{\Gamma}(z), (\#eq:polyamat)$$ +where +$$\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}$$ + +$$ +\boldsymbol{\Gamma}(z)=\mathbf{I}_{K+1}-\sum_{i=1}^{p-1}\boldsymbol{\Gamma}_{i}z^i, \enspace \enspace \boldsymbol{\Gamma}_{i}=-\sum_{j=i+1}^{p}\mathbf{A}_j. (\#eq:polygamma)$$ +The VECM model \@ref(eq:vecm) follows accordingly, and + +$$ +\boldsymbol{\alpha}_0=\mathbf{A}(1)\boldsymbol{\mu}+(\boldsymbol{\Gamma}(1)-\mathbf{A}(1))\boldsymbol{\eta}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\mathbf{A}(1)\boldsymbol{\eta}. (\#eq:vecmint)$$ +Assuming that $\mathbf{A}(1)$ is singular and that the variables +$\mathbf{x}_{t}$ are cointegrated. 
This entails the following +\begin{align} + \mathbf{A}(1)=&\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}=\underset{(K+1,r+1)}{\mathbf{B}}\underset{(r+1,K+1)}{\mathbf{C}'}=\begin{bmatrix}b_{yy} & \mathbf{b}_{yx}'\\ \mathbf{b}_{xy} & \mathbf{B}_{xx} \end{bmatrix}\begin{bmatrix}c_{yy} & \mathbf{c}_{yx}'\\ \mathbf{c}_{xy} & \mathbf{C}_{xx}'\end{bmatrix}= \nonumber\\ +=&\begin{bmatrix}b_{yy}c_{yy}+\mathbf{b}_{yx}'\mathbf{c}_{xy} & b_{yy}\mathbf{c}_{yx}'+\mathbf{b}_{yx}'\mathbf{C}_{xx}'\\ +\mathbf{b}_{xy}c_{yy}+\mathbf{B}_{xx}\mathbf{c}_{xy} & \mathbf{b}_{xy}\mathbf{c}_{yx}'+ \mathbf{A}_{xx} \end{bmatrix}, \enspace \enspace \enspace rk(\mathbf{A}(1))=rk(\mathbf{B})=rk(\mathbf{C}),(\#eq:factt) +\end{align} +where $\mathbf{B}$ and $\mathbf{C}$ are full column rank matrices +arising from the rank-factorization of +$\mathbf{A}(1)=\mathbf{B}\mathbf{C}'$ with $\mathbf{C}$ matrix of the +long-run relationships of the process and $\mathbf{B}_{xx}$, +$\mathbf{C}_{xx}$ arising from the rank factorization of +$\mathbf{A}_{xx}=\mathbf{B}_{xx}\mathbf{C}_{xx}'$, with +$rk(\mathbf{A}_{xx})=rk(\mathbf{B}_{xx})=rk(\mathbf{C}_{xx})=r$ [^6].\ +By partitioning the vectors $\boldsymbol{\alpha}_{0}$, +$\boldsymbol{\alpha}_{1}$, the matrix $\mathbf{A}(1)$ and the polynomial +matrix $\boldsymbol{\Gamma}(L)$ conformably to $\mathbf{z}_{t}$, as +follows + +$$ +\boldsymbol{\alpha}_0=\begin{bmatrix} +\underset{(1,1)}{\alpha_{0y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{0x}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\begin{bmatrix} +\underset{(1,1)}{\alpha_{1y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{1x} } +\end{bmatrix} (\#eq:alphapart)$$ + +$$ +\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,K+1)}{\mathbf{a}'_{(y)}} \\ \underset{(K,K+1)}{\mathbf{A}_{(x)}} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}'_{yx}} \\ 
\underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx} } +\end{bmatrix}, +\enspace \enspace \enspace +\boldsymbol{\Gamma}(L)=\begin{bmatrix} +\underset{(1,K+1)}{\boldsymbol{\gamma}'_{y}(L)} \\ \underset{(K,K+1)}{\boldsymbol{\Gamma}_{(x)}(L)} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{\gamma_{yy}(L)} & \underset{(1,K)}{\boldsymbol{\gamma}'_{yx}(L)} \\ \underset{(K,1)}{\boldsymbol{\gamma}_{xy}(L)} & \underset{(K,K)}{\boldsymbol{\Gamma}_{xx}(L) } +\end{bmatrix} (\#eq:coeffpart)$$ +, and substituting \@ref(eq:epsilonx) into \@ref(eq:vecm) yields + +$$ +\Delta\mathbf{z}_t=\begin{bmatrix} +\Delta y_{t} \\ \Delta\mathbf{x}_{t} +\end{bmatrix}=\begin{bmatrix} +\alpha_{0.y} \\ \boldsymbol{\alpha}_{0x} +\end{bmatrix} + \begin{bmatrix} +\alpha_{1.y} \\ \boldsymbol{\alpha}_{1x} +\end{bmatrix}t- \begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}\begin{bmatrix} +y_{t-1} \\ \mathbf{x}_{t-1} +\end{bmatrix} + \begin{bmatrix} +\boldsymbol{\gamma}'_{y.x}(L) \\ \boldsymbol{\Gamma}_{(x)}(L) +\end{bmatrix}\Delta\mathbf{z}_t+\begin{bmatrix} +\boldsymbol{\omega}'\Delta\mathbf{x}_{t} \\ \mathbf{0} +\end{bmatrix}+\begin{bmatrix} +{\nu}_{yt} \\ \boldsymbol{\varepsilon}_{xt} +\end{bmatrix} (\#eq:condsys)$$ +, where + +$$ +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x} (\#eq:condintt)$$ + +$$ +\mathbf{a}'_{(y).x}=\mathbf{a}'_{(y)}-\boldsymbol{\omega}'\mathbf{A}_{(x)}, \enspace \enspace \enspace \boldsymbol{\gamma}'_{y.x}(L)=\boldsymbol{\gamma}_{y}'(L)-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x)}(L). 
(\#eq:condAmat)$$ + +According to \@ref(eq:condsys), the long-run relationships of the VECM +turn out to be now included in the matrix + +$$ +\begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}=\begin{bmatrix} +a_{yy}-\boldsymbol{\omega}'\mathbf{a}_{xy} & \mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{a}_{xy}&\mathbf{A}_{xx} +\end{bmatrix}. (\#eq:condAmat2)$$ + +To rule out the presence of long-run relationships between $y_{t}$ and +$\mathbf{x}_{t}$ in the marginal model, the $\mathbf{x}_{t}$ variables +are assumed to be exogenous with respect to the ARDL parameters, that is +$\mathbf{a}_{xy}$ is assumed to be a null vector. Accordingly, the +long-run matrix in \@ref(eq:condAmat2) becomes + +$$ +\widetilde{\mathbf{A}}=\begin{bmatrix}a_{yy} & \mathbf{a}'_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{0} & \mathbf{A}_{xx} +\end{bmatrix}=\begin{bmatrix} +a_{yy} & \widetilde{\mathbf{a}}_{y.x}' \\ \mathbf{0}&\mathbf{A}_{xx}\end{bmatrix} =\begin{bmatrix} +b_{yy}c_{yy} & b_{yy}\mathbf c_{yx}'+(\mathbf{b}_{yx}'-\boldsymbol{\omega}'\mathbf{B}_{xx})\mathbf{C}_{xx}' \\ \mathbf{0}& \mathbf{B}_{xx}\mathbf{C}_{xx}'\end{bmatrix}. 
(\#eq:cond)$$ + +After these algebraic transformations, the ARDL equation for +$\Delta y_{t}$ can be rewritten as in \@ref(eq:ardl).\ +In light of the factorization \@ref(eq:factt) of the matrix +$\mathbf{A}(1)$, the long-run equilibrium vector $\boldsymbol{\theta}$ +can be expressed as + +$$ +\boldsymbol{\theta}'= +-\frac{1}{a_{yy}}\underset{(1,r+1)}{\left[b_{yy}\enspace\enspace(\mathbf{b}_{yx}-\boldsymbol{\omega}'\mathbf{B}_{xx})\right]} +\underset{(r+1,K)}{\begin{bmatrix} \mathbf{c}'_{yx}\\ \mathbf{C}'_{xx} \end{bmatrix}}, (\#eq:thetat)$$ + +where + +$\widetilde{\mathbf{a}}_{y.x}=\mathbf{a}_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx}$.\ +Bearing in mind that $\mathbf{C}'_{xx}$ is the cointegrating matrix for +the variables $\mathbf{x}_t$, the equation \@ref(eq:thetat) leads to the +following conclusion + +$$ +rk\begin{bmatrix}\mathbf{c}'_{yx}\\ \mathbf{C}'_{xx}\end{bmatrix}=\begin{cases} +r \to \enspace y_{t} \sim I(0) \\ +r+1 \to \enspace y_{t} \sim I(1) +\end{cases}, (\#eq:rank)$$ +where $r=rk(\mathbf{A}_{xx})$ and $0 \leq r\leq K$.\ + +### Section B - Intercept and trend specifications {#sec:appendixb} + +@pesaran2001 introduced five different specifications for the ARDL +model, which depend on the deterministic components that can be absent +or restricted to the values they assume in the parent VAR model. 
In this +connection, note that, in light of \@ref(eq:vecmint), the drift and the +trend coefficient in the conditional VECM \@ref(eq:condsys) are defined +as +$$\boldsymbol{\alpha_{0}}^{c}=\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} , \enspace \enspace +\boldsymbol{\alpha_{1}}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},$$ +where $\widetilde{\mathbf{A}}(1)$ is as in \@ref(eq:cond) and +$\widetilde{\boldsymbol{\Gamma}}(1)=\begin{bmatrix} \boldsymbol{\gamma}_{y.x}'(1) \\ \boldsymbol{\Gamma}_{(x)}(1) \end{bmatrix}$.\ +Accordingly, after partitioning the mean and the drift vectors as +$$\underset{(1,K+1)}{\boldsymbol{\mu}'}=[\underset{(1,1)}{\mu_{y}},\underset{(1,K)}{\boldsymbol{\mu}_x'}], \enspace \underset{(1,K+1)}{\boldsymbol{\eta}'}=[\underset{(1,1)}{\eta_{y}},\underset{(1,K)}{\boldsymbol{\eta}_{x}'}],$$ +the intercept and the coefficient of the trend of the ARDL equation +\@ref(eq:ardl) are defined as +$$\alpha_{0.y}^{EC} += \mathbf{e}_{1}'\boldsymbol{\alpha_{0}}^{c} +=a_{yy}\mu_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\mu}_{x}+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}=a_{yy}(\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x})+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}, \enspace +\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}}'_{y.x}}{a_{yy}}$$ + +$$\enspace \enspace \alpha_{1.y}^{EC}=\mathbf{e}_{1}'\boldsymbol{\alpha_{1}}^{c}= +a_{yy}\eta_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\eta}_{x}=a_{yy}(\eta_{y}-\boldsymbol{\theta'}\boldsymbol{\eta}_{x}),$$ +where $\mathbf{e}_{1}$ is the $K+1$ first elementary vector.\ +In the error correction term +$$EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}$$ +the parameters that partake in the calculation of intercept and trend +are +$$\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}.$$ +In particular, these latter are not null 
only when they are assumed to +be restricted in the model specification.\ +The five specifications proposed by  @pesaran2001 are + +1. *No intercept and no trend*: + $$\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf{0}.$$ + It follows that + $$\theta_{0}=\theta_{1}=\alpha_{0.y}=\alpha_{1.y}=0.$$ + Accordingly, the model is as in \@ref(eq:case1). + +2. *Restricted intercept and no trend*: + $$\boldsymbol{\alpha}_{0}^{c}= \widetilde{\mathbf{A}}(1)\boldsymbol{\mu},\enspace \enspace \boldsymbol{\eta}=\mathbf{0},$$ + which entails + $$\theta_0 \neq 0 \enspace\enspace\alpha_{0.y}^{EC}=a_{yy}\theta_{0}, \enspace \enspace + \alpha_{0.y}=\theta_{1}=\alpha_{1.y}=0.$$ + Therefore, the intercept stems from the EC term of the ARDL + equation. The model is specified as in \@ref(eq:case2) + +3. *Unrestricted intercept and no trend*: + $$\boldsymbol{\alpha}_{0}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\mu}, \enspace \enspace \boldsymbol{\eta}=\mathbf{0}.$$ + Thus, + $$\alpha_{0.y}\neq 0,\enspace \enspace \theta_{0}=\theta_{1}=\alpha_{1.y}=0.$$ + Accordingly, the model is as in \@ref(eq:case3). + +4. *Unrestricted intercept, restricted trend*: + $$\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta}\enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},$$ + which entails + $$\alpha_{0.y} \neq 0,\enspace \enspace + \theta_{0}=0 \enspace \enspace + \theta_{1}\neq 0\enspace\enspace + \alpha_{1.y}^{EC}=a_{yy}\theta_1\enspace\enspace + \alpha_{1.y}=0.$$ + Accordingly, the trend stems from the EC term of the ARDL equation. + The model is as in \@ref(eq:case4). + +5. 
*Unrestricted intercept, unrestricted trend*: + $$\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} \enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}.$$ + Accordingly, + $$\alpha_{0.y} \neq 0 \enspace \enspace\alpha_{1.y} \neq 0, \enspace \enspace\theta_{0}=\theta_{1}=0.$$ + The model is as in \@ref(eq:case5). + +[^1]: The `R` packages, either used in the creation of + [**bootCT**](https://CRAN.R-project.org/package=bootCT) or employed + in the analyses presented in this paper, are + [**magrittr**](https://CRAN.R-project.org/package=magrittr) + [@magrittr], [**gtools**](https://CRAN.R-project.org/package=gtools) + [@gtools], [**pracma**](https://CRAN.R-project.org/package=pracma) + [@pracma], [**Rcpp**](https://CRAN.R-project.org/package=Rcpp) + [@RCPP], + [**RcppArmadillo**](https://CRAN.R-project.org/package=RcppArmadillo) + [@RcppArmadillo2023], + [**Rmisc**](https://CRAN.R-project.org/package=Rmisc) [@Rmisc], + [**dynamac**](https://CRAN.R-project.org/package=dynamac) + [@PKGDYNAMAC], [**ARDL**](https://CRAN.R-project.org/package=ARDL) + [@PKGARDL], [**aod**](https://CRAN.R-project.org/package=aod) + [@aod], [**vars**](https://CRAN.R-project.org/package=vars) and + [**urca**](https://CRAN.R-project.org/package=urca) + [@PKGVARS; @urca], + [**aTSA**](https://CRAN.R-project.org/package=aTSA) [@PKGATSA], + [**tseries**](https://CRAN.R-project.org/package=tseries) + [@tseries], + [**reshape2**](https://CRAN.R-project.org/package=reshape2), + [**ggplot2**](https://CRAN.R-project.org/package=ggplot2) and + [**stringr**](https://CRAN.R-project.org/package=stringr) + [@reshape2; @ggplot; @stringr], + [**tidyverse**](https://CRAN.R-project.org/package=tidyverse) and + [**dplyr**](https://CRAN.R-project.org/package=dplyr) + [@tidyverse; @dplyr]. 
+ +[^2]: If the explanatory variables are stationary $\mathbf{A}_{xx}$ is + non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are + integrated but without cointegrating relationship $\mathbf{A}_{xx}$ + is a null matrix. + +[^3]: The knowledge of the rank of the cointegrating matrix is necessary + to overcome this impasse. + +[^4]: The latter is introduced in the ARDL equation by the operation of + conditioning $y_t$ on the other variables $\mathbf{x}_t$ of the + model + +[^5]: In fact, as + $\boldsymbol{\omega}'\mathbf{A}_{xx}\mathbf{x}_{t} \approx I(0)$, + the conclusion that $y_{t}\approx I(0)$ must hold. This in turn + entails that no cointegration occurs between $y_t$ and + $\mathbf{x}_{t}$. + +[^6]: If the explanatory variables are stationary $\mathbf{A}_{xx}$ is + non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are + integrated but without cointegrating relationship $\mathbf{A}_{xx}$ + is a null matrix diff --git a/_articles/RJ-2024-003/RJ-2024-003.html b/_articles/RJ-2024-003/RJ-2024-003.html new file mode 100644 index 0000000000..79e09c963a --- /dev/null +++ b/_articles/RJ-2024-003/RJ-2024-003.html @@ -0,0 +1,11990 @@ + + + + + + + + + + + + + + + + + + + + + + bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models

    + + + +

    The Autoregressive Distributed Lag approach to cointegration or bound +testing, proposed by Pesaran in 2001, has become prominent in +empirical research. Although this approach has many advantages over +the classical cointegration tests, it is not exempt from drawbacks, +such as possible inconclusive inference and distortion in size. +Recently, Bertelli and coauthors developed a bootstrap approach to the +bound tests to overcome these drawbacks. This paper introduces the R +package bootCT, which implements this method by deriving the bootstrap +versions of the bound tests and of the asymptotic F-test on the +independent variables proposed by Sam and coauthors in 2019. As a +spinoff, a general method for generating random multivariate time +series following a given VECM/ARDL structure is provided in the +package. Empirical applications showcase the main functionality of the +package.

    +
    + + + +
    +

    1 Introduction

    +

    Cointegration and error correction are fundamental concepts in the +analysis of economic data, insofar as they provide an appropriate +framework for testing economic hypotheses about growth and fluctuation. +Several approaches have been proposed in the literature to determine +whether two or more non-stationary time series are cointegrated, meaning +they share a common long-run relationship.
+There are two basic types of tests for cointegration: single equation +tests and VAR-based tests. The former check the presence of unit roots +in cointegration residuals (see, e.g., Engle and Granger 1987; Engle and Yoo 1987; Mackinnon 1991; Gabriel et al. 2002; Cook 2006) +or test the significance of the error-correction (EC) term coefficient +(Kremers et al. 1992; Maddala and Kim 1998; Arranz and Escribano 2000; Ericsson and MacKinnon 2002). The +latter, such as the Johansen (1991) approach, tackle the problem of +detecting cointegrating relationships in a VAR model. This latter +approach, albeit having the advantage of avoiding the issue of +normalization, as well as allowing the detection of multiple +cointegrating vectors, is far from being perfect. In the VAR system all +variables are treated symmetrically, as opposed to the standard +univariate models that usually have a clear interpretation in terms of +exogenous and endogenous variables. Furthermore, in a VAR system all the +variables are estimated at the same time, which is problematic if the +relation between some variables is flawed, that is, affected by some +source of error. In this case a simultaneous estimation process tends to +propagate the error affecting one equation to the others. Furthermore, a +multidimensional VAR model employs plenty of degrees of freedom.
    +The recent cointegration approach, known as Autoregressive Distributed +Lag (ARDL) approach to cointegration or bound testing, proposed by + Pesaran et al. (2001) (PSS), falls in the former strand of literature. It has +become prominent in empirical research because it shows several +advantages with respect to traditional methods for testing +cointegration. First, it is applicable also in cases of mixed order +integrated variables, albeit with integration not exceeding the first +order. Thus, it evades the necessity of pre-testing the variables and, +accordingly, avoids some common practices that may prevent finding +cointegrating relationships, such as dropping variables or transforming +them into stationary form  (see McNown et al. 2018). Second, +cointegration bound tests are performed in an ARDL model that allows +different lag orders for each variable, thus providing a more flexible +framework than other commonly employed approaches. Finally, unlike other +cointegration techniques, which are sensitive to the sample size, the +ARDL approach provides robust and consistent results for small sample +sizes.
    +Notably, the ARDL bound testing methodology has quickly spread in +economics and econometrics to study the cointegrating relationships +between macroeconomic and financial variables, to evaluate the long-run +impact of energy variables, or to assess recent environmental policies +and their impact on the economy. Among the many applications, see for +instance Haseeb et al. (2019; Hussain et al. 2019; Menegaki 2019; Reda and Nourhan 2020; Yilanci et al. 2020; Abbasi et al. 2021).
+The original bound tests proposed by Pesaran et al. (2001) are an \(F\)-test for +the significance of the coefficients of all lagged level variables +entering the error correction term (\(F_{ov}\)), and a \(t\)-test for the +coefficient of the lagged dependent variable. When either the dependent +or the independent variables do not appear in the long-run relationship, +a degenerate case arises. The bound \(t\)-test provides answers on the +occurrence of a degenerate case of second type, while the occurrence of +a degenerate case of first type can be assessed by testing whether the +dependent variable is of integration order I(1). This type of check +violates the spirit and motivation of the bound tests, which are +supposed to be applicable in situations of unknown order of integration +for the variables.
    +Recently, McNown et al. (2018) pointed out how, due to the low power +problem of unit root tests, investigating the presence of a first type +degeneracy by testing the integration order of the dependent variable +may lead to incorrect conclusions. Therefore, they suggested checking +for its occurrence by testing the significance of the lagged levels of +the independent variables via an extra \(F\)-test (\(F_{ind}\)), which was +also worked out in its asymptotic version [SMK; Sam et al. (2019)].
    +Besides problems in testing the occurrence of degenerate cases, in +general, the main drawback of the bound tests is the occurrence of +potentially inconclusive results, if the test statistic lies between the +bounds of the test distribution under the null. Furthermore, the +asymptotic distributions of the statistics may provide a poor +approximation of the true distributions in small samples. Finite sample +critical values, even if only for a subset of all possible model +specifications, have been worked out in the literature (see Mills and Pentecost 2001; Narayan and Smyth 2004; Kanioura and Turner 2005; Narayan 2005), +while (Kripfganz and Schneider 2020) provided the quantiles of the asymptotic +distributions of the tests as functions of the sample size, the lag +order and the number of long-run forcing variables. However, this +relevant improvement does not eliminate the uncertainty related to the +inconclusive regions, or the existence of other critical issues related +to the underlying assumptions of the bound test framework, such as the +(weak) exogeneity of the independent variables or the non-stationarity +of the dependent variable.
    +To overcome the mentioned bound test drawbacks, (Bertelli et al. 2022) +proposed bootstrapping the ARDL cointegration test. Inference can always +be pursued with ARDL bootstrap tests, unlike what happens with both the +PSS tests and the SMK test on the independent variables. Bootstrap ARDL +tests were first put forward by (McNown et al. 2018) in an +unconditional ARDL model, which omits the instantaneous differences of +the exogenous variables in the ARDL equation, rather than a conditional +one, as originally proposed by (Pesaran et al. 2001). The unconditional model +is often used, for reason of practical convenience, in empirical +research. Simulation results in (Bertelli et al. 2022) have +highlighted the importance of employing the appropriate specification, +especially under degenerate cases. In fact, it has been pointed out that +a correct detection of these cases requires the comparison of the test +outcomes in both the conditional and unconditional settings. Erroneous +conclusions, based exclusively on one model specification, can thus be +avoided.
+In this paper, bootstrap bound tests, thereby including the bootstrap +versions of the \(F_{ov}\), \(t\) and \(F_{ind}\) bound tests, are carried out +in a conditional ARDL model setting. This approach allows one to overcome +the problem of inconclusive regions of the standard bound tests. A +comparison with the outcomes engendered by the unconditional ARDL +bootstrap tests is nevertheless provided for the \(F_{ind}\) test, to +avoid erroneous inference in the presence of degenerate cases.
    +The paper is organized as follows. Section 2 +introduces the theoretical results of the ARDL cointegration bound +tests. Section 3 details the steps carried out by the +bootstrap procedure, which allows the construction of the (bootstrap) +distribution - under the null - for the \(F_{ov}\), \(t\), conditional +\(F_{ind}\) and unconditional \(F_{ind}\) tests. Section 4 +introduces the R package +bootCT (Vacca and Bertelli 2023) and +its functionalities: a method for the generation of random multivariate +time series that follow a user-specified VECM/ARDL structure, with some +examples, and the main function that carries out the aforementioned +bootstrap tests, while also computing the PSS and SMK bound tests. The +trade-off between accuracy and computational time of the bootstrap +procedure is also investigated, under several scenarios in terms of +sample size and number of replications. Notably, a function that +performs the PSS bound tests is already available in the +dynamac package +(Jordan and Philips 2020), while no R routine has so far been implemented for the +SMK test, to the best of our knowledge. Section 5 gives some +empirical applications that employ the core function of the package and +its possible outputs. Section 6 concludes. Appendix +7 briefly delves into technical details of the +conditional ARDL model and its possible specifications 1.

    +

    2 Cointegration bound tests in ARDL models

    +

    The starting point of the approach proposed by  (Pesaran et al. 2001) is a +\((K+1)\) VAR(\(p\)) model +\[ +\mathbf{A}(L)(\mathbf{z}_t-\boldsymbol{\mu}-\boldsymbol{\eta}t)=\boldsymbol{\varepsilon}_t \enspace \enspace \enspace \boldsymbol{\varepsilon}_t\sim N(\mathbf{0}, \boldsymbol{\Sigma}),\qquad\mathbf{A}(L)=\left(\mathbf{I}_{K+1}- \sum_{j=1}^{p}\mathbf{A}_j\mathbf{L}^j\right) +\enspace \enspace \enspace t=1,2,\dots,T. \tag{1}\] +Here, \(\mathbf{A}_j\) are square \((K+1)\) matrices, \(\mathbf{z}_t\) a +vector of \((K+1)\) variables, \(\boldsymbol{\mu}\) and \(\boldsymbol{\eta}\) +are \((K+1)\) vectors representing the drift and the trend respectively, +and \(\det(\mathbf{A}(z))=0\) for \(|z| \geq 1\). If the matrix +\(\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}\) is +singular, the components of \(\mathbf{z}_t\) turn out to be integrated and +possibly cointegrated.
    +The VECM representation of (1) is given by (see Appendix +7.1 for details) +\[ +\Delta\mathbf{z}_t=\boldsymbol{\alpha}_{0}+\boldsymbol{\alpha}_{1}t-\mathbf{A}(1)\mathbf{z}_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta \mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_t. \tag{2}\] +Now, to study the adjustment to the equilibrium of a single variable +\(y_t\), given the other \(\mathbf{x}_t\) variables, the vectors +\(\mathbf{z}_t\) and \(\boldsymbol{\varepsilon}_t\) are partitioned +\[ +\mathbf{z}_t=\begin{bmatrix} +\underset{(1,1)}{y_{t}} \\ \underset{(K,1)}{\mathbf{x}_{t}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\varepsilon}_t=\begin{bmatrix} +\underset{(1,1)}{\varepsilon_{yt}} \\ \underset{(K,1)}{\boldsymbol{\varepsilon}_{xt}} +\end{bmatrix}. \tag{3}\] +The matrix \(\mathbf{A}(1)\), which is assumed to be singular to allow +cointegration, is partitioned conformably to \(\mathbf{z}_{t}\) as 2
    +

    +

    \[\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}.\] +Under the assumption +\[ +\boldsymbol{\varepsilon}_t \sim N\Bigg(\mathbf{0}, \begin{bmatrix} +\underset{(1,1)}{\sigma_{yy}}& \underset{(1,K)}{\boldsymbol{\sigma}_{yx}'} \\ \underset{(K,1)}{\boldsymbol{\sigma}_{xy}} & \underset{(K,K)}{\boldsymbol{\Sigma}_{xx}} \end{bmatrix}\Bigg), \tag{4}\] +the following holds +\[ +\varepsilon_{yt}=\boldsymbol{\omega}'\boldsymbol{\varepsilon}_{xt}+\nu_{yt} \sim N(0,\sigma_{y.x}), \tag{5}\] +where +\(\sigma_{y.x}=\sigma_{yy}-\boldsymbol{\omega}'\boldsymbol{\sigma}_{xy}\) +with +\(\boldsymbol{\omega}'=\boldsymbol{\sigma}'_{yx}\boldsymbol{\Sigma}^{-1}_{xx}\), +and \(\nu_{yt}\) is independent of \(\boldsymbol{\varepsilon}_{xt}\).
    +Substituting (5) into (2) and assuming that +the \(\mathbf{x}_{t}\) variables are exogenous towards the ARDL parameters +(that is, setting \(\mathbf{a}_{xy}=\mathbf{0}\) in \(\mathbf{A}(1)\)) +yields the system (see Appendix 7.1 for details) +\[ + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt} \tag{6}\]

    +

    \[ +\Delta\mathbf{x}_{t} += \boldsymbol{\alpha}_{0x} +\boldsymbol{\alpha}_{1x}t+ \mathbf{A}_{(x)}\mathbf{z}_{t-1}+ \boldsymbol{\Gamma}_{(x)}(L)\Delta\mathbf{z}_t+ \boldsymbol{\varepsilon}_{xt}, \tag{7}\] +where +\[ +\boldsymbol\gamma_{y.x,j}'=\boldsymbol\gamma_{y,j}'-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x),j} \tag{8}\]

    +

    \[ +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x}, \tag{9}\] +and where the error correction term, \(EC_{t-1}\), expressing the long-run +equilibrium relationship between \(y_{t}\) and \(\mathbf{x}_{t}\), is given +by +\[ +EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}, \tag{10}\] +with +\[ +\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}, \enspace\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}'}_{y.x}}{a_{yy}}=-\frac{\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}}{a_{yy}}. \tag{11}\] +Thus, no cointegration occurs when +\(\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}\) or \(a_{yy}=0\) . These two +circumstances are referred to as degenerate case of second and first +type, respectively. Degenerate cases imply no cointegration between +\(y_{t}\) and \(\mathbf{x}_{t}\).
    +To test the hypothesis of cointegration between \(y_{t}\) and +\(\mathbf{x}_{t}\), Pesaran et al. (2001) proposed an \(F\)-test, \(F_{ov}\) hereafter, +based on the hypothesis system +\[\begin{align} +H_0: a_{yy}=0 \; \cap \;\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}\\ +H_1: a_{yy} \neq 0 \; \cup \;\widetilde{\mathbf{a}}_{y.x}\neq \mathbf{0}.\tag{12} +\end{align}\]
    +Note that \(H_{1}\) covers also the degenerate cases +\[\begin{align} +H_1^{y.x}: a_{yy}=0 \; , \;\widetilde{\mathbf{a}}_{y.x}\neq\mathbf{0}\\ +H_1^{yy}: a_{yy} \neq 0 \; , \;\widetilde{\mathbf{a}}_{y.x} = \mathbf{0}.\tag{13} +\end{align}\]
    +The exact distribution of the \(F\) statistic under the null is unknown, +but it is limited from above and below by two asymptotic distributions: +one corresponding to the case of stationary regressors, and another +corresponding to the case of first-order integrated regressors. As a +consequence, the test is called a bound test and has an inconclusive area.
    Pesaran et al. (2001) worked out two sets of (asymptotic) critical values: one, +\(\{\tau_{L,F}\}\), for the case when \(\mathbf{x}_{t}\sim{I}(0)\) and +another, \(\{\tau_{U,F}\}\), for the case when \(\mathbf{x}_{t}\sim{I}(1)\). +These values vary in accordance with the number of regressors in the +ARDL equation, the sample size and the assumptions made about the +deterministic components (intercept and trend) of the data generating +process.
    +In this regard,  Pesaran et al. (2001) introduced five different specifications +for the ARDL model, depending on its deterministic components, which are +(see Appendix 7.2 for details)

    +
      +
    1. No intercept and no trend +\[\begin{align} +\Delta y_t=-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},\tag{14} +\end{align}\]
      +where \(EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}\),
      +

    2. +
    3. Restricted intercept and no trend +\[\begin{align} +\Delta y_{t}= +-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},\tag{15} +\end{align}\]
      +where +\(EC_{t-1}=y_{t-1}-\theta_{0}-\boldsymbol{\theta}'\mathbf{x}_{t-1}\). +The intercept extracted from the EC term is +\(\alpha_{0.y}^{EC} = a_{yy}\theta_0\).

    4. +
    5. Unrestricted intercept and no trend +\[\begin{align} +\Delta y_{t} +=\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},\tag{16} +\end{align}\]
      +where \(EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}\).

    6. +
    7. Unrestricted intercept, restricted trend +\[\begin{align} +\Delta y_{t}= +\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt},\tag{17} +\end{align}\]
      +where +\(EC_{t-1}=y_{t-1}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}\). +The trend extracted from the EC term is +\(\alpha_{1.y}^{EC} = a_{yy}\theta_1\).

    8. +
    9. Unrestricted intercept, unrestricted trend +\[\begin{align} +\Delta y_{t} +=\alpha_{0.y}+\alpha_{1.y}t +-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, \tag{18} +\end{align}\]
      +where \(EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}\).

    10. +
    +

    The model in (6) proposed by  Pesaran et al. (2001) represents the +correct framework in which to carry out bound tests. However, bound test +are often performed in an unconditional ARDL model setting, specified as +\[ + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{j}\Delta\mathbf{z}_{t-j}+\varepsilon_{yt}, \tag{19}\] +which omits the term \(\boldsymbol{\omega}'\Delta\mathbf{x}_{t}\).
    +(Bertelli et al. 2022) have highlighted that bootstrap tests performed +in these two ARDL specifications can lead to contrasting results. To +explain this divergence, note that the conditional model makes use of +the following vector in the EC term +\[\widetilde{\mathbf{a}}_{y.x}'=\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}\] +(divided by \(a_{yy}\), see (11)) to carry out bound tests, +while the unconditional one only uses the vector \(\mathbf{a}_{yx}'\), +(divided by \(a_{yy}\)), since it neglects the term +\(\boldsymbol{\omega}'\mathbf{A}_{xx}\). 4 This can lead to contrasting +inference in two instances. The first happens when a degeneracy of first +type occurs in the conditional model, that is +\[ +\widetilde{\mathbf{a}}_{y.x}'=\mathbf{0}, \tag{20}\] +because +\[\mathbf{a}_{yx}'=\boldsymbol{\omega}'\mathbf{A}_{xx}.\] +In this case, the conditional model rejects cointegration, while the +unconditional one concludes the opposite. The other case happens when a +degeneracy of first type occurs in the unconditional model, that is +\[ +\mathbf{a}_{yx}'=\mathbf{0}, \tag{21}\] +but +\[\widetilde{\mathbf{a}}_{y.x}'=\boldsymbol{\omega}'\mathbf{A}_{xx} \neq \mathbf{0}.\] +In this case, the unconditional model rejects cointegration, while the +conditional one concludes for the existence of cointegrating +relationships, which are however spurious. Only a comparison of the +outcomes of the \(F_{ind}\) test performed in both the conditional and +unconditional ARDL equation can help to disentangle this problem. 5
    +In the following, bootstrap tests are carried out in the conditional +ARDL model (6). However, when a degeneracy of first type +occurs in the unconditional model, the outcomes of the \(F_{ind}\) +bootstrap test performed in both the conditional and unconditional +settings are provided. This, as previously outlined, is performed to +avoid the acceptance of spurious long-run relationships among the +dependent variable and the independent variables.

    +

    3 The new bootstrap procedure

    +

    The bootstrap procedure here proposed focuses on an ARDL model specified +as in (14)-(18), depending on the assumptions on +the deterministic components.
    +The bootstrap procedure consists of the following steps:

    +
      +
    1. The ARDL model is estimated via OLS and the related test statistics +\(F_{ov}\), \(t\) or \(F_{ind}\) are computed.

    2. +
    3. In order to construct the distribution of each test statistic under +the corresponding null, the same model is re-estimated imposing the +appropriate restrictions on the coefficients according to the test +under consideration.

    4. +
    5. Following (McNown et al. 2018), the ARDL restricted residuals +are then computed. For example, under Case III, the residuals are +\[ +\widehat{\nu}_{yt}^{F_{ov}}=\Delta y_{t}-\widehat{\alpha}_{0.y}-\sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} \tag{22}\]

      +

      \[ +\widehat{\nu}_{yt}^{t}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{\widetilde{\mathbf{a}}}'_{y.x}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} \tag{23}\]

      +

      \[ +\widehat{\nu}_{yt}^{F_{ind}}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{a}_{yy}y_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t}. \tag{24}\] +Here, the apex \("\widehat{\,\,.\,\,}"\) denotes the estimated +parameters. The other cases can be dealt with in a similar manner.

    6. +
    7. The VECM model

      +

      \[ +\Delta\mathbf{z}_{t}=\boldsymbol{\alpha}_{0}-\mathbf{A}\mathbf{z}_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_{t} \tag{25}\] +is estimated as well (imposing weak exogeneity), and the residuals

      +

      \[ +\widehat{\boldsymbol{\varepsilon}}_{xt}= \Delta\mathbf{x}_{t}-\widehat{\boldsymbol{\alpha}}_{0x}+\widehat{\mathbf{A}}_{xx}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\Gamma}}_{(x)j}\Delta\mathbf{z}_{t-j} \tag{26}\] +are computed. This approach guarantees that the residuals +\(\widehat{\boldsymbol{\varepsilon}}_{xt}\), associated with the +variables \(\mathbf{x}_{t}\) explained by the marginal model +(7), are uncorrelated with the ARDL residuals +\(\widehat{\nu}_{yt}^{.}\).

    8. +
    9. A large set of \(B\) bootstrap replicates are sampled from the +residuals calculated as in (22),(23), +(24) and (26). In each replication, the +following operations are carried out:

      +
        +
      1. Each set of \((T-p)\) resampled residuals (with replacement) +\(\widehat{\boldsymbol{\nu}}_{zt}^{(b)}=(\widehat{\nu}_{yt}^{(b)},\widehat{\boldsymbol{\varepsilon}}_{xt}^{(b)})\) +is re-centered (see Davidson and MacKinnon 2005) +\[\begin{align} +\dot{\widehat{\nu}}^{(b)}_{yt}&=\widehat{\nu}^{(b)}_{yt} -\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\nu}^{(b)}_{yt} \tag{27} \\ +\dot{\widehat{\boldsymbol{\varepsilon}}}^{b}_{x_{i}t}&=\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}-\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}\qquad i=1,\dots,K.\tag{28} +\end{align}\]

      2. +
      3. A sequential set of \((T-p)\) bootstrap observations, +\(y^{*}_{t}\enspace, \mathbf{x}^{*}_{t}\enspace t=p+1,\dots,T\), +is generated as follows +\[y^{*}_{t}=y^{*}_{t-1}+\Delta y^{*}_{t}, \enspace \enspace \mathbf{x}^{*}_{t}=\mathbf{x}^{*}_{t-1}+\Delta \mathbf{x}^{*}_{t},\] +where \(\Delta \mathbf{x}^{*}_{t}\) are obtained from +(26) and \(\Delta y^{*}_{t}\) from either +(22), (23) or (24) after +replacing in each of these equations the original residuals with +the bootstrap ones.
        +The initial conditions, that is the observations before \(t=p+1\), +are obtained by drawing randomly \(p\) observations in block from +the original data, so as to preserve the data dependence +structure.

      4. +
      5. An unrestricted ARDL model is estimated via OLS using the +bootstrap observations, and the statistics \(F_{ov}^{(b),H_0}\), +\(t^{(b),H_0}\) and \(F_{ind}^{(b),H_0}\) are computed.

      6. +
    10. +
    11. The bootstrap distributions of +\(\big\{F_{ov}^{(b),H_0}\big\}_{b=1}^B\), +\(\big\{F_{ind}^{(b),H_0}\big\}_{b=1}^B\) and +\(\big\{t^{(b),H_0}\big\}_{b=1}^B\) under the null are then employed +to determine the critical values of the tests. By denoting with +\(M^*_b\) the ordered bootstrap test statistic, and with \(\alpha\) the +nominal significance level, the bootstrap critical values are +determined as follows +\[ +c^*_{\alpha,M}=\min\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{M^*_b >c\}} \leq\alpha\bigg\} +\qquad M\in\{F_{ov},F_{ind}\} \tag{29}\] +for the \(F\) tests and +\[ +c^*_{{\alpha,t}}=\max\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{t^*_b<c\}} \leq {\alpha}\bigg\} \tag{30}\] +for the \(t\) test.
      +Here, \(\mathbf{1}_{\{x \in A\}}\) is the indicator function, which is +equal to one if the condition in subscript is satisfied and zero +otherwise.

    12. +
    +

    The null hypothesis is rejected if the \(F\) statistic computed at step 1, +\(F_{ov}\) or \(F_{ind}\), is greater than the respective \(c^*_{\alpha,M}\), +or if the \(t\) statistic computed at the same step is lower than +\(c^*_{{\alpha,t}}\).

    +

    4 Illustration of the bootCT package

    +

    This section describes the main functionalities of the +bootCT package. The +functions included in the package are essentially of two types. The +function sim_vecm_ardl generates data according to a given data +generating process (DGP), assuming either the presence or the absence of +cointegrating relationships between variables, or degenerate cases. The +function boot_ardl tests the presence of cointegrating relationships +employing the Pesaran ARDL bound tests (\(F_{ov}\) and \(t\)), the SMK bound +test on lagged independent variables (\(F_{ind}\)), and the novel ARDL +bootstrap testing procedure.

    +

    Generating a multivariate time series: the sim_vecm_ardl function

    +

    The function sim_vecm_ardl allows the user to simulate a multivariate time +series from a given conditional ARDL specification for a dependent +variable \(y_t\) and a VAR/VECM specification for the remaining +independent variables \(\mathbf{x}_t\). In this regard, it represents an +interesting addition to extant data generating procedures for VAR/VECM +models. The arguments of this function can be divided into two +subgroups.
    +A group of parameters pertains to the VECM model (6) and +(7), with \(\mathbf{A}_{xx}\) identifying the matrix of the +long-run relationships among the \(\mathbf x_t\) variables, and +\(\boldsymbol\Gamma_j\)’s, \(j=1,...,p-1\) the short-run matrices of the +system variables. Additionally, the parameter \(a_{yy}\) weighs the EC +term for \(y_t\), while \(\mathbf{a}_{yx}'\) is the parameter vector +weighting the variables \(\mathbf{x}_{t}\) in the ARDL equation. The +vector \(\mathbf{a}_{yx}'\), after conditioning \(y_t\) on the other +variables (\(\mathbf{x}_t\), see model (6)) becomes +\(\widetilde{\mathbf{a}}_{y.x}' = \mathbf{a}_{yx}' - \boldsymbol\omega'\mathbf{A}_{xx}\).
    +The second group of parameters concerns the model intercept and trend of +the VAR specification, \(\boldsymbol\mu\) and \(\boldsymbol\eta\), which in +the VECM representation become +\(\boldsymbol\alpha_0 = \mathbf A \boldsymbol\mu + (\mathbf I_{K+1} - \sum_{j=1}^{p-1} \boldsymbol\Gamma_j-\mathbf A)\boldsymbol\eta\) +and \(\boldsymbol{\alpha}_1 = \mathbf A \boldsymbol\eta\) and in the +conditional ARDL become +\(\alpha_{0.y}^{EC}= a_{yy}(\mu_{y}-\widetilde{\mathbf{ a}}_{y.x}'\boldsymbol \mu_{x})+\boldsymbol\gamma_{y.x}'(1)\boldsymbol\eta\) +and +\(\alpha_{1.y}^{EC}=a_{yy}(\eta_y-\widetilde{\mathbf{ a}}_{y.x}'\boldsymbol\eta_x)\). +As explained in Appendix 7.2, intercept and trend +appear in the error correction (EC) term of the ARDL equation only when +restricted. Accordingly, they both do not appear in the EC in the case +I, the intercept does not appear in the EC term in cases III, IV and V +(it is freely set to \(\alpha_{0.y}\)) while the trend appears in the EC +term only in the case IV (it is freely set to \(\alpha_{1.y}\) for case +V). Accordingly, when these terms are not restricted, they need to be +supplied by the user.
    +The approach used to specify the function inputs offers great control to +the user, in terms of generating specific (conditional) ARDL-based +cointegration structures.
    +The function sim_vecm_ardl takes the following arguments:

    +
      +
    • nobs: number of observations to generate;

    • +
    • case: indicates the conditional ARDL specification in terms of +deterministic component (intercept and trend) among the five +specifications proposed by Pesaran et al. (2001), given in +(14)-(18).

    • +
    • sigma.in: covariance matrix, \(\boldsymbol{\Sigma}\), of the error +term \(\boldsymbol{\varepsilon}_{t}\);

    • +
    • gamma.in: list of short-run parameter matrices +\(\boldsymbol\Gamma_j\);

    • +
    • axx.in: cointegrating relationships, \(\mathbf{A}_{xx}\), pertaining to +the independent variables in the marginal VECM model;

    • +
    • ayx.uc.in: vector of parameters, as in \(\mathbf{a}_{yx}\);

    • +
    • ayy.in: the \(a_{yy}\) term, weighting the EC term in the ARDL +equation;

    • +
    • mu.in: mean vector, \(\boldsymbol\mu\), in the starting VAR +specification, used to define the VECM intercept for CASE II;

    • +
    • eta.in: trend vector, \(\boldsymbol\eta\), in the starting VAR +specification, used to define the VECM trend for case IV;

    • +
    • azero.in: unrestricted intercept of the VECM specification (valid +only for cases III, IV and V), when the intercept is not involved in +the EC term;

    • +
    • aone.in: unrestricted coefficient of the trend in the VECM +specification (valid only for case V), when the trend is not +involved in the EC term;

    • +
    • burn.in: additional burn-in observations to be +generated. A total of burn.in + nobs observations are generated, +but only the last nobs are kept in the data;

    • +
    • seed.in: seed number for the generation of +\(\boldsymbol{\varepsilon}_{t}\sim N(\mathbf 0,\boldsymbol\Sigma)\).

    • +
    +

    If parameter values for mu.in, eta.in, azero.in, or aone.in and +case number turn out to be in contradiction, an error message is +displayed.
    +As output, the function gives out a list containing the data, both in +level and first difference, along with all the parameter values given as +input. Additionally, all intermediate transformation of parameters via +VECM transformation or as a by-product of conditioning \(y_{t}\) on +\(\mathbf{x}_{t}\) are included in the output.
    +Figure 1 depicts three time series, dep_1_0, +ind_1_0 and ind_2_0, generated using this function and +affected by a cointegrating relationship, one panel for each case, from +I to V. The variable dep_1_0 represents the dependent variable +\(y_t\) of the ARDL equation, while ind_1_0 and ind_2_0 +represent the independent ones, \(x_{1t}\) and \(x_{2t}\).
    +The code used to generate the data for case I is the following:

    +
        corrm = matrix(c(   0,     0, 0,
    +                     0.25,     0, 0,
    +                      0.4, -0.25, 0), nrow = 3, ncol = 3, byrow = T)
    +
    +    Corrm = (corrm + t(corrm)) + diag(3)
    +
    +    sds = diag(c(1.3, 1.2, 1))
    +
    +    sigma.in = (sds %*% Corrm %*% t(sds))
    +
    +    gamma1 = matrix(c(0.6,    0, 0.2,
    +                      0.1, -0.3,   0,
    +                        0, -0.3, 0.2), nrow = 3, ncol = 3,byrow=T)
    +    gamma2= gamma1 * 0.3
    +
    +    omegat = sigma.in[1, -1] %*% solve(sigma.in[-1, -1])
    +    axx.in = matrix(c( 0.3, 0.5,
    +                      -0.4, 0.3), nrow = 2, ncol = 2, byrow = T)
    +    ayx.uc.in = c(0.4, 0.4)
    +    ayy.in = 0.6
    +
    +    data.vecm.ardl_1 =
    +    sim_vecm_ardl(nobs = 200,
    +                  case = 1,
    +                  sigma.in = sigma.in,
    +                  gamma.in = list(gamma1, gamma2),
    +                  axx.in = axx.in,
    +                  ayx.uc.in = ayx.uc.in,
    +                  ayy.in = ayy.in,
    +                  mu.in = rep(0, 3),
    +                  eta.in = rep(0, 3),
    +                  azero.in = rep(0, 3),
    +                  aone.in = rep(0, 3),
    +                  burn.in = 100,
    +                  seed.in = 999)
    +

    Additionally, Figure 2 displays three other time +series, dep_1_0 (\(y_t\)), ind_1_0 (\(x_{1t}\)) and +ind_2_0 (\(x_{2t}\)), when a degeneracy of second type occurs +(\(a_{yy} = 0\)) in the long-run relationship in the ARDL equation of +dep_1_0 on ind_1_0, ind_2_0. The five panels +represent the behavior of these series in the Cases from I to V. It is +worth noting the different scenario implied by these cases: case III +depicts a trend for the \(y_t\) variable, case IV highlights the inclusion +of a trend in the cointegrating relationship, and case V exhibits a +quadratic trend in the \(y_t\) variable.
    +Finally, the flowchart in Figure 3 details the +internal steps of the function sim_vecm_ardl and the data generation +workflow. There, it is specified how the parameters of the VAR, VECM and +ARDL equation are introduced. Attention is paid on whether the error +correction mechanism involves either intercept or trend (or both) via +the internal computation of the parameters \(\theta_0\) and \(\theta_1\) +(and thus \(\alpha_{0.y}^{EC}\) and \(\alpha_{1.y}^{EC}\)). When the EC term +does not involve intercept and/or trend, \(\boldsymbol\alpha_0\) and +\(\boldsymbol\alpha_1\) are supplied by the user, depending on the case +under study.

    +
    +
    +graphic without alt text +

    +Figure 1: Simulated data from the VECM / conditional ARDL specifications, for every case. Made with ggplot . +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 2: Simulated data from the VECM / conditional ARDL specifications (degenerate case of type 2, a_{yy}=0), for every case. Made with ggplot. +

    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Figure 3: Flowchart of the sim_vecm_ardl function inner steps. When applying (7) and (8), \(y_{t_j}=0, \Delta y_{t_j}=0, \mathbf x_{t_j}=\mathbf 0, \Delta \mathbf x_{t_j}= \mathbf 0\) for any \(t_j < 1\). Boxes denote parameter definitions and transformations. Circles denote crucial actions. Empty nodes denote function inputs. +

    +
    +
    +
    +

    Bootstrapping the ARDL bound tests: the boot_ardl function

    +

    This function develops the bootstrap procedure detailed previously. As +an option in the initial estimation phase, it offers the possibility of +automatically choosing the best order for the lagged differences of all +the variables in the ARDL and VECM models. This is done by using several +criteria. In particular, AIC, BIC, AICc, \(R^2\) and \(R^2_{adj}\) are used +as lag selection criteria for the ARDL model, while the overall minimum +between AIC, HQIC, SC and FPE is used for the lag selection for the +VECM.
    +In particular, the auto_ardl function in the package +ARDL (Natsiopoulos and Tzeremes 2021) selects +the best ARDL order in terms of the short-run parameter vectors +\(\boldsymbol\gamma_{y.x,j}\), while the VARselect function in the +package vars (Pfaff 2008b) +selects the best VECM order in terms of the short-run parameter matrices +\(\boldsymbol\Gamma_{(x),j}\). Furthermore, the user can input a +significance threshold for the retention of single parameters in the +\(\boldsymbol\Gamma_j\) and in the \(\boldsymbol\gamma_{y.x,j}\) vectors.
    +The function boot_ardl takes the following arguments:

    +
      +
    • data: input dataset. Must contain a dependent variable and a set +of independent variables;

    • +
    • yvar: name of the dependent variable enclosed in quotation marks. +If unspecified, the first variable in the dataset is used;

    • +
    • xvar: vector of names of the independent variables, each enclosed +in quotation marks. If unspecified, all variables in the dataset +except the first are used;

    • +
    • fix.ardl: vector \((j_1,\dots,j_K)\), containing the maximum orders +of the lagged differences (i.e., +\(\Delta y_{t-j_1}, \Delta x_{1,t-j_2},\dots,\) \(\Delta x_{1,t-j_K}\)) +for the short term part of the ARDL equation, chosen in advance;

    • +
    • info.ardl: (alternatively to fix.ardl) the information criterion +used to choose the best lag order for the short term part of the +ARDL equation. It must be one among AIC (default), AICc, +BIC, R2, adjR2;

    • +
    • fix.vecm: scalar \(m\) containing the maximum order of the lagged +differences (i.e., \(\Delta\mathbf z_{t-m}\)) for the short term part +of the VECM equation, chosen in advance;

    • +
    • info.vecm: (alternatively to fix.vecm) the information criterion +used to choose the best lag order for the short term part of the +VECM equation. Must be one among AIC (default), HQIC, SC, +FPE;

    • +
    • maxlag: (in conjunction with info.ardl / info.vecm) maximum +number of lags for the auto_ardl function in the package +ARDL, and for the +VARselect function in the package +vars;

    • +
    • a.ardl: significance threshold for the short-term ARDL +coefficients (\(\boldsymbol\gamma_{y.x,j}\)) in the ARDL model +estimation;

    • +
    • a.vecm: significance threshold for the short-term VECM +coefficients (in \(\boldsymbol\Gamma_j\)) in the VECM model +estimation;

    • +
    • nboot: number of bootstrap replications;

    • +
    • case: type of the specification for the conditional ARDL in terms +of deterministic components (intercept and trend) among the five +proposed by (Pesaran et al. 2001), given in +(14)-(18);

    • +
    • a.boot.H0: probability/ies \(\alpha\) by which the critical +quantiles of the bootstrap distribution(s) \(c^{*}_{\alpha,F_{ov}}\), +\(c^{*}_{\alpha,t}\) and \(c^{*}_{\alpha,F_{ind}}\) must be calculated;

    • +
    • print: if set to TRUE, shows the progress bar.

    • +
    +

    boot_ardl makes use of the lag_mts function which produces lagged +versions of a given matrix of time series, each column with a separate +order. lag_mts takes as parameters the data included in a matrix X +and the lag orders in a vector k, with the addition of a boolean +parameter last.only, which allows to specify whether only the \(k\)-th +order lags have to be retained, or all the lag orders from the first to +the \(k\)-th.
    +boot_ardl also acts as a wrapper for the most common methodologies +detecting cointegration, offering a comprehensive view on the testing +procedures involved in the analysis. The resulting object, of class +bootCT, contains all the information about

    +
      +
    • The conditional ARDL model estimates, and the unconditional VECM +model estimates;

    • +
    • the bootstrap tests performed in the conditional ARDL model;

    • +
    • the Pesaran, Shin and Smith bound testing procedure (\(F_{ov}\) and +\(t\)-test, when applicable);

    • +
    • the Sam, McNown and Goh bound testing procedure for \(F_{ind}\), when +applicable;

    • +
    • the Johansen rank and trace cointegration tests on the independent +variables.

    • +
    +

    Internally, the bootstrap data generation under the null is executed via an Rcpp function, employing the Rcpp and RcppArmadillo packages (Eddelbuettel 2013), so as to greatly speed up computation. As explained in the previous section, cointegration tests in the unconditional ARDL model are performed in order to uncover the presence of spurious cointegrating relationships.
    +To this end, the function provides

    +
      +
    • the bootstrap critical values of the \(F_{ov}\), \(t\) and \(F_{ind}\) +tests in the conditional model, at level a.boot.H0, along with the +same statistics computed in the conditional model.

    • +
    • a flag, called fakecoint, that indicates divergence between the +outcomes of the \(F_{ind}\) test performed in both the conditional and +unconditional model. In this circumstance, as explained before, +there is no cointegration (see Bertelli et al. 2022).

    • +
    +

    A summary method has been implemented to present the results in a +visually clear manner. It accepts the additional argument "out" that +lets the user choose which output(s) to visualize: ARDL prints the +conditional ARDL model summary, VECM prints the VECM model summary, +cointARDL prints the summary of the bound tests and the bootstrap +tests, cointVECM prints the summary of the Johansen test on the +independent variables.
    +A detailed flowchart showing the function’s workflow is displayed in +Figure 4. There, the expressions "C ARDL" +and "UC ARDL" stand for conditional and unconditional ARDL model, +respectively.
    +

    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Figure 4: Flowchart of the boot_ardl function inner steps. Boxes denote parameter definitions and transformations. Diamonds denote function outputs. Dashed diamonds denote intermediate output (not shown after function call). Empty nodes denote function inputs. The first p+1 rows of \(\mathbf z_t^{(b)}\) are set equal to the first p+1 rows of the original data. The best lag order for each difference variable in the ARDL model is determined via auto_ardl(). It is reported as a unique value p in \(\boldsymbol{\gamma}_{y.x,j}\) for brevity in the flowchart. +

    +
    +
    +
    +

    Execution time and technical remarks

    +

    In order to investigate the sensitivity of the procedure to different +sample sizes and number of bootstrap replicates, an experiment has been +run using a three-dimensional time series of length +\(T=\{50,80,100,200,500\}\), generating 100 datasets for each sample size +with the sim_vecm_ardl function (Case II, with cointegrated variables, +and 2 lags in the short-run section of the model).
    +Then, the boot_ardl function has been called

    +
    boot_ardl(data = df_sim,
    +          nboot = bootr,
    +          case = 2,
    +          fix.ardl = rep(2, 3),
    +          fix.vecm = 2)
    +

    In the code above, bootr has been set equal to +\(B=\{200,500,1000,2000\}\), the number of lags has been assumed known +(fix.ardl and fix.vecm), while default values have been used for +every other argument (such as a.ardl, a.vecm and a.boot.H0).
    +Table 1 shows the average running time per replication +together with the coefficient of variation (%) of the bootstrap critical +values of the \(F_{ov}\) test, for each value of \(T\) and \(B\), across 100 +replications for each scenario.
    Naturally, the running time increases as both the sample size and the number of bootstrap replicates increase. However, it can be noticed how the coefficients of variation tend to stabilize for \(B \geq 1000\), especially for \(T>80\), at the 5% significance level. Therefore, a number of bootstrap replicates of at least \(B=1000\) is recommended for larger sample sizes, or at least \(B=2000\) for smaller samples. The analysis has been carried out using an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM.

    +
    + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    \(T\)\(B\)Exec. Time (sec)\(cv^{(F_{ov})}(5\%)\)\(cv^{(F_{ov})}(2.5\%)\)\(cv^{(F_{ov})}(1\%)\)
    5020023.388.64810.92513.392
    5050048.376.3126.9528.640
    50100096.654.8065.6136.288
    502000231.154.2554.2264.946
    8020023.467.2518.93611.263
    8050050.194.9986.2207.946
    801000143.003.8824.4535.305
    802000255.642.9123.6234.518
    10020037.897.7078.58310.955
    10050052.864.6915.3047.557
    1001000184.513.5124.5675.695
    1002000212.653.5193.6744.185
    20020035.466.6447.17310.365
    20050076.784.7345.3556.225
    2001000148.253.1244.1775.034
    2002000484.512.8113.3613.907
    50020054.476.6418.69410.414
    500500133.175.1375.8166.408
    5001000271.873.9054.5855.283
    5002000561.713.2213.4904.145
    +

    Table 1: Average execution times (in seconds) of the boot_ardl +function, for different combinations of sample size \(T\) and bootstrap +replicates \(B\). Coefficients of variation (\(cv\)) reported for the +\(F_{ov}\) bootstrap critical values at level 5%, 2.5% and 1%.

    +
    +

    5 Empirical applications

    +

    This section provides two illustrative applications which highlight the performance of the bootstrap ARDL tests.

    +

    An application to the German macroeconomic dataset

    +

    In the first example, the occurrence of a long-run relationship between +consumption [C], income [INC], and investment [INV] of Germany has +been investigated via a set of ARDL models, where each variable takes in +turn the role of dependent one, while the remaining are employed as +independent. The models have been estimated by employing the dataset of +Lütkepohl (2005) which includes quarterly data of the series over the +years 1960 to 1982. The data have been employed in logarithmic form. +Figure 5 displays these series over the sample +period.
    +Before applying the bootstrap procedure, the order of integration of +each series has been analyzed. Table 2 shows the results of +ADF test performed on both the series and their first-differences (\(k=3\) +maximum lags). The results confirm the applicability of the ARDL +framework as no series is integrated of order higher than one.
    +The following ARDL equations have been estimated:

    +
      +
    1. First ARDL equation (C | INC, INV): +\[\begin{align} + \Delta \log \text{C}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{C}_{t-1} - {a}_{y.x_1}\log \text{INC}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{INC}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + +\end{align}\]

    2. +
    3. Second ARDL equation (INC | C, INV): +\[\begin{align} + \Delta \log \text{INC}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INC}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + +\end{align}\]

    4. +
    5. Third ARDL equation (INV | C, INC): +\[\begin{align} + \Delta \log \text{INV}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INV}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INC}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INV}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INC}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INC}_{t}+\nu_{t}. + +\end{align}\]

    6. +
    +

    Table 3 shows the estimation results for each ARDL and VECM model. It is worth noting that the instantaneous differences of the independent variables are highly significant in each conditional ARDL model. Thus, neglecting these variables in the ARDL equation, as happens in the unconditional version of the model, may potentially lead to biased estimates and incorrect inference. For the sake of completeness, the results of the marginal VECM estimation are also reported for each model.
    +The code to prepare the data, available in the package as the +ger_macro dataset, is:

    +
        data("ger_macro")
    +    LNDATA = apply(ger_macro[,-1], 2, log)
    +    col_ln = paste0("LN", colnames(ger_macro)[-1])
    +    LNDATA = as.data.frame(LNDATA)
    +    colnames(LNDATA) = col_ln
    +

    Then, the boot_ardl function is called, to perform the bootstrap +tests. In the code chunk below, Model I is considered.

    +
        set.seed(999)
    +    BCT_res_CONS = boot_ardl(data = LNDATA,
    +                         yvar = "LNCONS",
    +                         xvar = c("LNINCOME", "LNINVEST"),
    +                         maxlag = 5,
    +                         a.ardl = 0.1,
    +                         a.vecm = 0.1,
    +                         nboot = 2000,
    +                         case = 3,
    +                         a.boot.H0 = c(0.05),
    +                         print = T)
    +

    to which follows the call to the summary function

    +
        summary(BCT_res_CONS, out = "ARDL")
    +    summary(BCT_res_CONS, out = "VECM")
    +    summary(BCT_res_CONS, out = "cointVECM")
    +    summary(BCT_res_CONS, out = "cointARDL")
    +

    The first summary line displays the output in the ARDL column of Table +3 and the second column of Table 4, Model +I. The second line corresponds to the VECM columns of Table +3, Model I - only for the independent variables. The +information on the rank of the \(\mathbf A_{xx}\) in Table 3 +is inferred from the third line. Finally, the fourth summary line +corresponds to the test results in Table 4, Model I. A +textual indication of the presence of spurious cointegration is +displayed at the bottom of the "cointARDL" summary, if detected.
    +In this example, the bootstrap and bound testing procedures are in +agreement only for model I, indicating the existence of a cointegrating +relationship. Additionally, no spurious cointegration is detected for +this model. As for models II and III, the null hypothesis is not +rejected by the bootstrap tests, while the PSS and SMG bound tests fail +to give a conclusive answer in the \(F_{ind}\) test.
    +The running time of the entire analysis is of roughly 11 minutes, using +an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM.

    +
    +
    + + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 2: ADF preliminary test (null hypothesis: random walk with +drift).
    level variablefirst difference
    SerieslagADFp.valueADFp-value
    \(\log\text{C}_t\)0-1.6900.450-9.750<0.01
    1-1.8600.385-5.190<0.01
    2-1.4200.549-3.1300.030
    3-1.0100.691-2.7200.080
    \(\log\text{INC}_t\)0-2.2900.217-11.140<0.01
    1-1.9600.345-7.510<0.01
    2-1.4900.524-5.120<0.01
    3-1.3100.587-3.2900.020
    \(\log\text{INV}_t\)0-1.2000.625-8.390<0.01
    1-1.3700.565-5.570<0.01
    2-1.3600.570-3.3000.020
    3-1.2200.619-3.1000.032
    +
    +
    +
    +
    +
    +graphic without alt text +

    +Figure 5: log-consumption/investment/income graphs (level variables and first differences). Made with ggplot. +

    +
    +
    +
    +
    + + ++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 3: Conditional ARDL and VECM results for the +consumption/income/investment dataset, along with rank of the +\(\mathbf A_{xx}\) matrix via the Johansen (J) test.
    +Significance codes: (***) 1%; (**) 5%; (.) 10%.
    Model IModel IIModel III
    ARDLVECMARDLVECMARDLVECM
    \(\Delta\log\text{C}_t\)\(\Delta\log\text{INV}_t\)\(\Delta\log\text{INC}_t\)\(\Delta\log\text{INC}_t\)\(\Delta\log\text{C}_t\)\(\Delta\log\text{INV}_t\)\(\Delta\log\text{INV}_t\)\(\Delta\log\text{C}_t\)\(\Delta\log\text{INC}_t\)
    \(\log\text{C}_{t-1}\)-0.307 *** (0.055)0.168 * (0.081)-0.0011 (0.0126)0.1286 * (0.0540)0.611 . (0.339)-0.2727 *** (0.0704)-0.0508 (0.0796)
    \(\log\text{INC}_{t-1}\)0.297 *** (0.055)0.124 * (0.054)-0.017 (0.014)-0.183 * (0.079)-0.491 (0.340)0.2619 *** (0.0681)0.0464 (0.0772)
    \(\log\text{INV}_{t-1}\)-0.001 (0.011)-0.152 * (0.063)0.016 (0.017)0.0209 (0.0135)-0.00107 (0.0142)-0.1531 * (0.0607)-0.1212 * (0.060)
    \(\Delta\log\text{C}_{t-1}\)-0.248 ** (0.079)0.899 * (0.442)0.211 . (0.113)0.375 *** (0.1086)0.9288 * (0.442)1.113 * (0.441)0.2072 . (0.1142)
    \(\Delta\log\text{C}_{t-2}\)0.744 (0.431)0.8049 . (0.4345)
    \(\Delta\log\text{INC}_{t-1}\)-0.1404 (0.1095)
    \(\Delta\log\text{INC}_{t-2}\)0.2675 ** (0.0958)0.1522 . (0.0912)
    \(\Delta\log\text{INV}_{t-1}\)-0.18 (0.111)0.035 (0.029)-0.189 . (0.1097)-0.175 (0.1075)0.0479 . (0.0282)
    \(\Delta\log\text{INV}_{t-2}\)0.049 . (0.027)0.0591 * (0.0245)0.0578 * (0.0223)0.0562 * (0.0266)
    \(\Delta\log\text{C}_t\)0.7070 *** (0.1093)1.8540 *** (0.5425)
    \(\Delta\log\text{INC}_t\)0.471 *** (0.074)-0.445 *** (0.4726)
    \(\Delta\log\text{INV}_t\)0.065 ** (0.019)-0.0230 (0.025)
    const.0.048 *** (0.013)0.036 (0.066)0.033 * (0.017)0.002 (0.018)0.0266 . (0.0155)0.023 (0.0666)-0.056 (0.072)0.0517 ** (0.0157)0.0378 * (0.0177)
    J-test\(rk(\mathbf{A_{xx}})=2\)\(rk(\mathbf{A_{xx}})=2\)\(rk(\mathbf{A_{xx}})=2\)
    +
    +
    + + +++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 4: Cointegration analysis for the three ARDL equations in the +German macroeconomic data. The optimal number of ARDL lags in the +short-run - in the form \((y,x_1,x_2)\), matching the model definition - +bootstrap critical values, bound test thresholds and test statistics +for each test are shown (case III).
    +The outcome columns draw conclusions on each type of model (bootstrap +or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of +type 1, D2 = degenerate of type 2, U = inconclusive inference.
    PSS / SMG ThresholdOutcome
    ModelLagsTestBoot. Critical ValuesI(0) 5%I(1) 5%StatisticBootBound
    I(1,0,0)\(F_{ov}\)3.793.794.8510.75YY
    \(t\)-2.88-2.86-3.53-5.608
    \(F_{ind}\)4.923.015.4215.636
    II(1,1,0)\(F_{ov}\)5.793.794.852.867NU
    \(t\)-3.69-2.86-3.53-2.315
    \(F_{ind}\)7.383.015.423.308
    III(1,1,0)\(F_{ov}\)5.503.794.853.013NU
    \(t\)-3.32-2.86-3.53-2.020
    \(F_{ind}\)6.633.015.424.189
    +
    +

    An application on Italian Macroeconomic Data

    +

    Following Bertelli et al. (2022), the relationship between foreign +direct investment [FDI], exports [EXP], and gross domestic product +[GDP] in Italy is investigated. The data of these three yearly +variables have been retrieved from the World Bank Database and cover the +period from 1970 to 2020. In the analysis, the log of the variables has +been used and [EXP] and [FDI] have been adjusted using the GDP +deflator. Figure 6 displays these series over the +sample period.

    +
    +
    +
    +graphic without alt text +

    +Figure 6: log-GDP/export/investment graphs (level variables and first differences). Made with ggplot. +

    +
    +
    +
    +

    Table 5 shows the outcomes of the ADF test performed on +each variable, which ensures that the integration order is not higher +than one for all variables. Table 6 shows the results +of bound and bootstrap tests performed in ARDL model by taking each +variable, in turn, as the dependent one. The following ARDL equations +have been estimated:

    +
      +
    1. First ARDL equation (GDP | EXP, FDI): +\[\begin{align} + \Delta \log \text{GDP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{GDP}_{t-1} - {a}_{y.x_1}\log \text{EXP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{EXP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}. + +\end{align}\] +For this model, a degenerate case of the first type can be +observed, while the simpler bound testing procedure does not signal +cointegration.

    2. +
    3. Second ARDL equation (EXP | GDP, FDI): +\[\begin{align} + \Delta \log \text{EXP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{EXP}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{GDP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}. + +\end{align}\] +For this model, the ARDL bootstrap test indicates absence of +cointegration, while the bound testing approach is inconclusive for +the \(F_{ind}\) test.

    4. +
    5. Third ARDL equation (FDI | GDP, EXP):
       \[\begin{align}
       \Delta \log \text{FDI}_{t}&=\alpha_{0.y} - a_{yy} \log \text{FDI}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{EXP}_{t-1} \\\nonumber
       &+\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{FDI}_{t-j} + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{EXP}_{t-j} \\\nonumber
       &+\omega_1 \Delta\log \text{GDP}_{t} + \omega_2 \Delta\log \text{EXP}_{t}+\nu_{t}.
       \end{align}\]
       For this model, the long-run cointegrating relationship is confirmed using both bootstrap and bound testing. No spurious cointegration is detected.

    6. +
    +

    The code to load the data and perform the analysis (e.g. for Model I) +is:

    +
        data("ita_macro")
    +    BCT_res_GDP = boot_ardl(data = ita_macro,
    +                         yvar = "LGDP",
    +                         xvar = c("LEXP", "LFI"),
    +                         maxlag = 5,
    +                         a.ardl = 0.1,
    +                         a.vecm = 0.1,
    +                         nboot = 2000,
    +                         case = 3,
    +                         a.boot.H0 = c(0.05),
    +                         print = T)
    +

    For the sake of simplicity, the conditional ARDL and VECM marginal model outputs included in each cointegrating analysis are omitted. The summary for the cointegration tests for Model I is called via

    +
        summary(BCT_res_GDP, out = "ARDL") # extract lags
    +    summary(BCT_res_GDP, out ="cointARDL") # ARDL cointegration
    +

    This empirical application further highlights the importance of dealing +with inconclusive inference via the bootstrap procedure, while naturally +including the effect of conditioning in the ARDL model, as highlighted +in Bertelli et al. (2022).

    +
    + + +++++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 5: ADF preliminary test for the second example.
    No Drift, No TrendDrift, No TrendDrift and Trend
    VariableLag = 0Lag = 1Lag = 2Lag = 3Lag = 0Lag = 1Lag = 2Lag = 3Lag = 0Lag = 1Lag = 2Lag = 3
    \(\log \text{GDP}_t\)0.990.9740.9410.796<0.01<0.01<0.010.0840.990.990.990.99
    \(\log \text{FDI}_t\)0.5720.5990.6750.725<0.010.07590.31990.5174<0.010.0130.1510.46
    \(\log \text{EXP}_t\)0.7870.710.6980.6840.4790.2880.4670.4330.6290.350.4630.379
    \(\Delta\log \text{GDP}_t\)<0.01<0.01640.04290.0402<0.010.08610.39890.4267<0.01<0.010.01660.017
    \(\Delta\log \text{FDI}_t\)<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01
    \(\Delta\log \text{EXP}_t\)<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.01<0.010.03360.0315
    +
    +
    + + +++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 6: Cointegration analysis for the three ARDL equations in the +Italian macroeconomic data. The optimal number of ARDL lags in the +short-run - in the form \((y,x_1,x_2)\), matching the model definition - +bootstrap critical values, bound test thresholds and test statistics +for each test are shown (case III).
    +The outcome columns draw conclusions on each type of model (bootstrap +or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of +type 1, D2 = degenerate of type 2, U = inconclusive inference.
    PSS / SMG ThresholdOutcome
    ModelLagsTestBoot. Critical ValuesI(0) 5%I(1) 5%StatisticBootBound
    I(1,1,0)\(F_{ov}\)3.7304.0705.1909.758D1N
    \(t\)-2.020-2.860-3.530-2.338
    \(F_{ind}\)3.7103.2205.6202.273
    II(1,0,0)\(F_{ov}\)5.4004.0705.1902.649NU
    \(t\)-3.380-2.860-3.530-1.889
    \(F_{ind}\)5.6303.2205.6203.481
    III(1,0,0)\(F_{ov}\)5.3604.0705.1906.716YY
    \(t\)-3.550-2.860-3.530-4.202
    \(F_{ind}\)6.5003.2205.6207.017
    +
    +

    6 Conclusion

    +

    The bootCT package allows the user to perform bootstrap cointegration tests in ARDL models by overcoming the problem of inconclusive inference, which is a well-known drawback of standard bound tests. The package makes use of different functions. The function boot_ardl performs the bootstrap tests, and it acts as a wrapper of both the bootstrap and the standard bound tests, including also the Johansen test on the independent variables of the model. Moreover, it performs the bound \(F\)-test on the lagged independent variables, so far not available in other extant R packages. The function sim_vecm_ardl, which allows the simulation of multivariate time series data following a user-defined DGP, enriches the available procedures for multivariate data generation, while the function lag_mts provides a supporting tool in building datasets of lagged variables for any practical purpose. Finally, the use of Rcpp functions gives a technical advantage in terms of computational speed, performing the bootstrap analysis within an acceptable time frame.

    +

    7 Appendix

    +

    Section A - the methodological framework of (conditional) VECM and ARDL models

    +

    Expanding the matrix polynomial \(\mathbf{A}(z)\) about \(z=1\), yields +\[ +\mathbf{A}(z)=\mathbf{A}(1)z+(1-z)\boldsymbol{\Gamma}(z), \tag{31}\] +where +\[\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}\]

    +

    \[ +\boldsymbol{\Gamma}(z)=\mathbf{I}_{K+1}-\sum_{i=1}^{p-1}\boldsymbol{\Gamma}_{i}z^i, \enspace \enspace \boldsymbol{\Gamma}_{i}=-\sum_{j=i+1}^{p}\mathbf{A}_j. \tag{32}\] +The VECM model (2) follows accordingly, and

    +

    \[ +\boldsymbol{\alpha}_0=\mathbf{A}(1)\boldsymbol{\mu}+(\boldsymbol{\Gamma}(1)-\mathbf{A}(1))\boldsymbol{\eta}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\mathbf{A}(1)\boldsymbol{\eta}. \tag{33}\] +Assuming that \(\mathbf{A}(1)\) is singular and that the variables +\(\mathbf{x}_{t}\) are cointegrated. This entails the following +\[\begin{align} + \mathbf{A}(1)=&\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}=\underset{(K+1,r+1)}{\mathbf{B}}\underset{(r+1,K+1)}{\mathbf{C}'}=\begin{bmatrix}b_{yy} & \mathbf{b}_{yx}'\\ \mathbf{b}_{xy} & \mathbf{B}_{xx} \end{bmatrix}\begin{bmatrix}c_{yy} & \mathbf{c}_{yx}'\\ \mathbf{c}_{xy} & \mathbf{C}_{xx}'\end{bmatrix}= \nonumber\\ +=&\begin{bmatrix}b_{yy}c_{yy}+\mathbf{b}_{yx}'\mathbf{c}_{xy} & b_{yy}\mathbf{c}_{yx}'+\mathbf{b}_{yx}'\mathbf{C}_{xx}'\\ +\mathbf{b}_{xy}c_{yy}+\mathbf{B}_{xx}\mathbf{c}_{xy} & \mathbf{b}_{xy}\mathbf{c}_{yx}'+ \mathbf{A}_{xx} \end{bmatrix}, \enspace \enspace \enspace rk(\mathbf{A}(1))=rk(\mathbf{B})=rk(\mathbf{C}),\tag{34} +\end{align}\]
    +where \(\mathbf{B}\) and \(\mathbf{C}\) are full column rank matrices +arising from the rank-factorization of +\(\mathbf{A}(1)=\mathbf{B}\mathbf{C}'\) with \(\mathbf{C}\) matrix of the +long-run relationships of the process and \(\mathbf{B}_{xx}\), +\(\mathbf{C}_{xx}\) arising from the rank factorization of +\(\mathbf{A}_{xx}=\mathbf{B}_{xx}\mathbf{C}_{xx}'\), with +\(rk(\mathbf{A}_{xx})=rk(\mathbf{B}_{xx})=rk(\mathbf{C}_{xx})=r\) 6.
    +By partitioning the vectors \(\boldsymbol{\alpha}_{0}\), +\(\boldsymbol{\alpha}_{1}\), the matrix \(\mathbf{A}(1)\) and the polynomial +matrix \(\boldsymbol{\Gamma}(L)\) conformably to \(\mathbf{z}_{t}\), as +follows

    +

    \[ +\boldsymbol{\alpha}_0=\begin{bmatrix} +\underset{(1,1)}{\alpha_{0y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{0x}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\begin{bmatrix} +\underset{(1,1)}{\alpha_{1y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{1x} } +\end{bmatrix} \tag{35}\]

    +

    \[ +\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,K+1)}{\mathbf{a}'_{(y)}} \\ \underset{(K,K+1)}{\mathbf{A}_{(x)}} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}'_{yx}} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx} } +\end{bmatrix}, +\enspace \enspace \enspace +\boldsymbol{\Gamma}(L)=\begin{bmatrix} +\underset{(1,K+1)}{\boldsymbol{\gamma}'_{y}(L)} \\ \underset{(K,K+1)}{\boldsymbol{\Gamma}_{(x)}(L)} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{\gamma_{yy}(L)} & \underset{(1,K)}{\boldsymbol{\gamma}'_{yx}(L)} \\ \underset{(K,1)}{\boldsymbol{\gamma}_{xy}(L)} & \underset{(K,K)}{\boldsymbol{\Gamma}_{xx}(L) } +\end{bmatrix} \tag{36}\] +, and substituting (5) into (2) yields

    +

    \[ +\Delta\mathbf{z}_t=\begin{bmatrix} +\Delta y_{t} \\ \Delta\mathbf{x}_{t} +\end{bmatrix}=\begin{bmatrix} +\alpha_{0.y} \\ \boldsymbol{\alpha}_{0x} +\end{bmatrix} + \begin{bmatrix} +\alpha_{1.y} \\ \boldsymbol{\alpha}_{1x} +\end{bmatrix}t- \begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}\begin{bmatrix} +y_{t-1} \\ \mathbf{x}_{t-1} +\end{bmatrix} + \begin{bmatrix} +\boldsymbol{\gamma}'_{y.x}(L) \\ \boldsymbol{\Gamma}_{(x)}(L) +\end{bmatrix}\Delta\mathbf{z}_t+\begin{bmatrix} +\boldsymbol{\omega}'\Delta\mathbf{x}_{t} \\ \mathbf{0} +\end{bmatrix}+\begin{bmatrix} +{\nu}_{yt} \\ \boldsymbol{\varepsilon}_{xt} +\end{bmatrix} \tag{37}\] +, where

    +

    \[ +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x} \tag{38}\]

    +

    \[ +\mathbf{a}'_{(y).x}=\mathbf{a}'_{(y)}-\boldsymbol{\omega}'\mathbf{A}_{(x)}, \enspace \enspace \enspace \boldsymbol{\gamma}'_{y.x}(L)=\boldsymbol{\gamma}_{y}'(L)-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x)}(L). \tag{39}\]

    +

    According to (37), the long-run relationships of the VECM +turn out to be now included in the matrix

    +

    \[ +\begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}=\begin{bmatrix} +a_{yy}-\boldsymbol{\omega}'\mathbf{a}_{xy} & \mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{a}_{xy}&\mathbf{A}_{xx} +\end{bmatrix}. \tag{40}\]

    +

    To rule out the presence of long-run relationships between \(y_{t}\) and +\(\mathbf{x}_{t}\) in the marginal model, the \(\mathbf{x}_{t}\) variables +are assumed to be exogenous with respect to the ARDL parameters, that is +\(\mathbf{a}_{xy}\) is assumed to be a null vector. Accordingly, the +long-run matrix in (40) becomes

    +

    \[ +\widetilde{\mathbf{A}}=\begin{bmatrix}a_{yy} & \mathbf{a}'_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{0} & \mathbf{A}_{xx} +\end{bmatrix}=\begin{bmatrix} +a_{yy} & \widetilde{\mathbf{a}}_{y.x}' \\ \mathbf{0}&\mathbf{A}_{xx}\end{bmatrix} =\begin{bmatrix} +b_{yy}c_{yy} & b_{yy}\mathbf c_{yx}'+(\mathbf{b}_{yx}'-\boldsymbol{\omega}'\mathbf{B}_{xx})\mathbf{C}_{xx}' \\ \mathbf{0}& \mathbf{B}_{xx}\mathbf{C}_{xx}'\end{bmatrix}. \tag{41}\]

    +

    After these algebraic transformations, the ARDL equation for +\(\Delta y_{t}\) can be rewritten as in (6).
    +In light of the factorization (34) of the matrix +\(\mathbf{A}(1)\), the long-run equilibrium vector \(\boldsymbol{\theta}\) +can be expressed as

    +

    \[ +\boldsymbol{\theta}'= +-\frac{1}{a_{yy}}\underset{(1,r+1)}{\left[b_{yy}\enspace\enspace(\mathbf{b}_{yx}-\boldsymbol{\omega}'\mathbf{B}_{xx})\right]} +\underset{(r+1,K)}{\begin{bmatrix} \mathbf{c}'_{yx}\\ \mathbf{C}'_{xx} \end{bmatrix}}, \tag{42}\]

    +

    where

    +

    \(\widetilde{\mathbf{a}}_{y.x}=\mathbf{a}_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx}\).
    +Bearing in mind that \(\mathbf{C}'_{xx}\) is the cointegrating matrix for +the variables \(\mathbf{x}_t\), the equation (42) leads to the +following conclusion

    +

    \[ +rk\begin{bmatrix}\mathbf{c}'_{yx}\\ \mathbf{C}'_{xx}\end{bmatrix}=\begin{cases} +r \to \enspace y_{t} \sim I(0) \\ +r+1 \to \enspace y_{t} \sim I(1) +\end{cases}, \tag{43}\] +where \(r=rk(\mathbf{A}_{xx})\) and \(0 \leq r\leq K\).
    +

    +

    Section B - Intercept and trend specifications

    +

    Pesaran et al. (2001) introduced five different specifications for the ARDL +model, which depend on the deterministic components that can be absent +or restricted to the values they assume in the parent VAR model. In this +connection, note that, in light of (33), the drift and the +trend coefficient in the conditional VECM (37) are defined +as +\[\boldsymbol{\alpha_{0}}^{c}=\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} , \enspace \enspace +\boldsymbol{\alpha_{1}}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},\] +where \(\widetilde{\mathbf{A}}(1)\) is as in (41) and +\(\widetilde{\boldsymbol{\Gamma}}(1)=\begin{bmatrix} \boldsymbol{\gamma}_{y.x}'(1) \\ \boldsymbol{\Gamma}_{(x)}(1) \end{bmatrix}\).
    +Accordingly, after partitioning the mean and the drift vectors as +\[\underset{(1,K+1)}{\boldsymbol{\mu}'}=[\underset{(1,1)}{\mu_{y}},\underset{(1,K)}{\boldsymbol{\mu}_x'}], \enspace \underset{(1,K+1)}{\boldsymbol{\eta}'}=[\underset{(1,1)}{\eta_{y}},\underset{(1,K)}{\boldsymbol{\eta}_{x}'}],\] +the intercept and the coefficient of the trend of the ARDL equation +(6) are defined as +\[\alpha_{0.y}^{EC} += \mathbf{e}_{1}'\boldsymbol{\alpha_{0}}^{c} +=a_{yy}\mu_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\mu}_{x}+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}=a_{yy}(\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x})+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}, \enspace +\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}}'_{y.x}}{a_{yy}}\]

    +

    \[\enspace \enspace \alpha_{1.y}^{EC}=\mathbf{e}_{1}'\boldsymbol{\alpha_{1}}^{c}= +a_{yy}\eta_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\eta}_{x}=a_{yy}(\eta_{y}-\boldsymbol{\theta'}\boldsymbol{\eta}_{x}),\] +where \(\mathbf{e}_{1}\) is the \(K+1\) first elementary vector.
    +In the error correction term +\[EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}\] +the parameters that partake in the calculation of intercept and trend +are +\[\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}.\] +In particular, these latter are not null only when they are assumed to +be restricted in the model specification.
    +The five specifications proposed by  Pesaran et al. (2001) are

    +
      +
    1. No intercept and no trend: +\[\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf{0}.\] +It follows that +\[\theta_{0}=\theta_{1}=\alpha_{0.y}=\alpha_{1.y}=0.\] +Accordingly, the model is as in (14).

    2. Restricted intercept and no trend: +\[\boldsymbol{\alpha}_{0}^{c}= \widetilde{\mathbf{A}}(1)\boldsymbol{\mu},\enspace \enspace \boldsymbol{\eta}=\mathbf{0},\] +which entails +\[\theta_0 \neq 0 \enspace\enspace\alpha_{0.y}^{EC}=a_{yy}\theta_{0}, \enspace \enspace +\alpha_{0.y}=\theta_{1}=\alpha_{1.y}=0.\] +Therefore, the intercept stems from the EC term of the ARDL +equation. The model is specified as in (15).

    3. Unrestricted intercept and no trend: +\[\boldsymbol{\alpha}_{0}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\mu}, \enspace \enspace \boldsymbol{\eta}=\mathbf{0}.\] +Thus, +\[\alpha_{0.y}\neq 0,\enspace \enspace \theta_{0}=\theta_{1}=\alpha_{1.y}=0.\] +Accordingly, the model is as in (16).

    4. Unrestricted intercept, restricted trend: +\[\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta}\enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},\] +which entails +\[\alpha_{0.y} \neq 0,\enspace \enspace +\theta_{0}=0 \enspace \enspace +\theta_{1}\neq 0\enspace\enspace +\alpha_{1.y}^{EC}=a_{yy}\theta_1\enspace\enspace +\alpha_{1.y}=0.\] +Accordingly, the trend stems from the EC term of the ARDL equation. +The model is as in (17).

    5. Unrestricted intercept, unrestricted trend: +\[\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} \enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}.\] +Accordingly, +\[\alpha_{0.y} \neq 0 \enspace \enspace\alpha_{1.y} \neq 0, \enspace \enspace\theta_{0}=\theta_{1}=0.\] +The model is as in (18).
    +

    8 CRAN packages used

    +

    bootCT, dynamac, magrittr, gtools, pracma, Rcpp, RcppArmadillo, Rmisc, ARDL, aod, vars, urca, aTSA, tseries, reshape2, ggplot2, stringr, tidyverse, dplyr, ggplot

    +

    9 CRAN Task Views implied by cited packages

    +

    ChemPhys, Databases, DifferentialEquations, Econometrics, Environmetrics, Finance, HighPerformanceComputing, MixedModels, ModelDeployment, NumericalMathematics, Phylogenetics, Spatial, TeachingStatistics, TimeSeries

    +

    10 Note

    +

    This article is converted from a Legacy LaTeX article using the +texor package. +The pdf version is the official version. To report a problem with the html, +refer to CONTRIBUTE on the R Journal homepage.

    +
    +
    +K. R. Abbasi, M. Shahbaz, Z. Jiao and M. Tufail. How energy consumption, industrial growth, urbanization, and CO2 emissions affect economic growth in Pakistan? A novel dynamic ARDL simulations approach. Energy, 221: 119793, 2021. DOI 10.1016/j.energy.2021.119793. +
    +
    +M. A. Arranz and A. Escribano. Cointegration testing under structural breaks: A robust extended error correction model. Oxford Bulletin of Economics and Statistics, 62(1): 23–52, 2000. DOI 10.1111/1468-0084.00158. +
    +
    +S. M. Bache and H. Wickham. magrittr: A forward-pipe operator for R. 2022. URL https://CRAN.R-project.org/package=magrittr. R package version 2.0.3. +
    +
    +S. Bertelli, G. Vacca and M. Zoia. Bootstrap cointegration tests in ARDL models. Economic Modelling, 116: 105987, 2022. DOI 10.1016/j.econmod.2022.105987. +
    +
    +B. Bolker, G. R. Warnes and T. Lumley. gtools: Various r programming tools. 2022. URL https://CRAN.R-project.org/package=gtools. R package version 3.9.4. +
    +
    +H. W. Borchers. pracma: Practical numerical math functions. 2022. URL https://CRAN.R-project.org/package=pracma. R package version 2.4.2. +
    +
    +S. Cook. The power of single equation tests for cointegration. Applied Economics Letters, 13(5): 265–267, 2006. DOI 10.1080/13504850500398534. +
    +
    +R. Davidson and J. G. MacKinnon. The case against JIVE. Journal of Applied Econometrics, 21(6): 827–833, 2005. DOI 10.1002/jae.873. +
    +
    +D. Eddelbuettel. Seamless R and C++ integration with Rcpp. New York: Springer, 2013. DOI 10.1007/978-1-4614-6868-4. ISBN 978-1-4614-6867-7. +
    +
    +D. Eddelbuettel, R. Francois, D. Bates, B. Ni and C. Sanderson. RcppArmadillo: Rcpp integration for the Armadillo templated linear algebra library. 2023. URL https://CRAN.R-project.org/package=RcppArmadillo. R package version 0.12.4.0.0. +
    +
    +R. F. Engle and C. W. Granger. Co-integration and error correction: Representation, estimation, and testing. Econometrica: journal of the Econometric Society, 251–276, 1987. DOI 10.2307/1913236. +
    +
    +R. F. Engle and B. S. Yoo. Forecasting and testing in co-integrated systems. Journal of Econometrics, 35(1): 143–159, 1987. DOI 10.1016/0304-4076(87)90085-6. +
    +
    +N. R. Ericsson and J. G. MacKinnon. Distributions of error correction tests for cointegration. The Econometrics Journal, 5(2): 285–318, 2002. DOI 10.1111/1368-423X.00085. +
    +
    +V. J. Gabriel, Z. Psaradakis and M. Sola. A simple method of testing for cointegration subject to multiple regime changes. Economics Letters, 76(2): 213–221, 2002. +
    +
    +M. Haseeb, I. S. Z. Abidin, Q. M. A. Hye and N. H. Hartani. The impact of renewable energy on economic well-being of Malaysia: Fresh evidence from auto regressive distributed lag bound testing approach. International Journal of Energy Economics and Policy, 9(1): 269, 2019. DOI 10.32479/ijeep.7229. +
    +
    +R. M. Hope. Rmisc: Ryan miscellaneous. 2022. URL https://CRAN.R-project.org/package=Rmisc. R package version 1.5.1. +
    +
    +H. I. Hussain, M. A. Salem, A. Z. A. Rashid and F. Kamarudin. Environmental impact of sectoral energy consumption on economic growth in malaysia: Evidence from ARDL bound testing approach. Ekoloji Dergisi, (107): 2019. +
    +
    +S. Johansen. Estimation and hypothesis testing of cointegration vectors in Gaussian vector autoregressive models. Econometrica: journal of the Econometric Society, 1551–1580, 1991. DOI 10.2307/2938278. +
    +
    +S. Jordan and A. Q. Philips. Dynamac: Dynamic simulation and testing for single-equation ARDL models. 2020. URL https://CRAN.R-project.org/package=dynamac. R package version 0.1.11. +
    +
    +A. Kanioura and P. Turner. Critical values for an f-test for cointegration in a multivariate model. Applied Economics, 37(3): 265–270, 2005. DOI 10.1080/00036840412331315051. +
    +
    +J. J. Kremers, N. R. Ericsson and J. J. Dolado. The power of cointegration tests. Oxford bulletin of economics and statistics, 54(3): 325–348, 1992. DOI 10.1111/j.1468-0084.1992.tb00005.x. +
    +
    +S. Kripfganz and D. C. Schneider. Response surface regressions for critical value bounds and approximate p-values in equilibrium correction models 1. Oxford Bulletin of Economics and Statistics, 82(6): 1456–1481, 2020. DOI 10.1111/obes.12377. +
    +
    +M. Lesnoff and R. Lancelot. aod: Analysis of overdispersed data. 2012. URL https://cran.r-project.org/package=aod. R package version 1.3.2. +
    +
    +H. Lütkepohl. New introduction to multiple time series analysis. Springer Science & Business Media, 2005. DOI 10.1007/978-3-540-27752-1. +
    +
    +J. G. MacKinnon. Critical values for cointegration tests. In R. F. Engle and C. W. J. Granger (Eds.), Long-run economic relationships: Readings in cointegration, 1991. Oxford University Press. +
    +
    +G. S. Maddala and I.-M. Kim. Unit roots, cointegration, and structural change. 1998. DOI 10.1017/CBO9780511751974. +
    +
    +R. McNown, C. Y. Sam and S. K. Goh. Bootstrapping the autoregressive distributed lag test for cointegration. Applied Economics, 50(13): 1509–1521, 2018. DOI 10.1080/00036846.2017.1366643. +
    +
    +A. N. Menegaki. The ARDL method in the energy-growth nexus field; best implementation strategies. Economies, 7(4): 105, 2019. DOI 10.3390/economies7040105. +
    +
    +T. C. Mills and E. J. Pentecost. The real exchange rate and the output response in four EU accession countries. Emerging Markets Review, 2(4): 418–430, 2001. DOI 10.1016/S1566-0141(01)00027-9. +
    +
    +P. K. Narayan. The saving and investment nexus for China: Evidence from cointegration tests. Applied economics, 37(17): 1979–1990, 2005. DOI 10.1080/00036840500278103. +
    +
    +P. K. Narayan and R. Smyth. Crime rates, male youth unemployment and real income in Australia: Evidence from Granger causality tests. Applied Economics, 36(18): 2079–2095, 2004. DOI 10.1080/0003684042000261842. +
    +
    +K. Natsiopoulos and N. Tzeremes. ARDL: ARDL, ECM and bounds-test for cointegration. 2021. URL https://CRAN.R-project.org/package=ARDL. R package version 0.1.1. +
    +
    +M. H. Pesaran, Y. Shin and R. J. Smith. Bounds testing approaches to the analysis of level relationships. Journal of applied econometrics, 16(3): 289–326, 2001. DOI 10.1002/jae.616. +
    +
    +B. Pfaff. Analysis of integrated and cointegrated time series with R. Second edition. New York: Springer, 2008a. URL https://www.pfaffikus.de. ISBN 0-387-27960-1. +
    +
    +B. Pfaff. VAR, SVAR and SVEC models: Implementation within R package vars. Journal of Statistical Software, 27(4): 2008b. URL https://www.jstatsoft.org/v27/i04/. +
    +
    +D. Qiu. aTSA: Alternative time series analysis. 2015. URL https://CRAN.R-project.org/package=aTSA. R package version 3.1.2. +
    +
    +A. M. Reda and E. Nourhan. Using the ARDL bound testing approach to study the inflation rate in Egypt. Economic consultant, (3 (31)): 24–41, 2020. DOI 10.46224/ecoc.2020.3.2. +
    +
    +C. Y. Sam, R. McNown and S. K. Goh. An augmented autoregressive distributed lag bounds test for cointegration. Economic Modelling, 80: 130–141, 2019. DOI 10.1016/j.econmod.2018.11.001. +
    +
    +A. Trapletti and K. Hornik. tseries: Time series analysis and computational finance. 2023. URL https://CRAN.R-project.org/package=tseries. R package version 0.10-54. +
    +
    +G. Vacca and S. Bertelli. bootCT: Bootstrapping the ARDL tests for cointegration. 2023. R package version 2.0.0. +
    +
    +H. Wickham. ggplot2: Elegant graphics for data analysis. Springer-Verlag New York, 2016. URL https://ggplot2.tidyverse.org. +
    +
    +H. Wickham. Reshaping data with the reshape package. Journal of Statistical Software, 21(12): 1–20, 2007. URL http://www.jstatsoft.org/v21/i12/. +
    +
    +H. Wickham. stringr: Simple, consistent wrappers for common string operations. 2022. URL https://CRAN.R-project.org/package=stringr. R package version 1.5.0. +
    +
    +H. Wickham, M. Averick, J. Bryan, W. Chang, L. D. McGowan, R. François, G. Grolemund, A. Hayes, L. Henry, J. Hester, et al. Welcome to the tidyverse. Journal of Open Source Software, 4(43): 1686, 2019. DOI 10.21105/joss.01686. +
    +
    +H. Wickham, R. François, L. Henry, K. Müller and D. Vaughan. dplyr: A grammar of data manipulation. 2023. URL https://CRAN.R-project.org/package=dplyr. R package version 1.1.2. +
    +
    +V. Yilanci, S. Bozoklu and M. S. Gorus. Are BRICS countries pollution havens? Evidence from a bootstrap ARDL bounds testing approach with a fourier function. Sustainable Cities and Society, 55: 102035, 2020. DOI 10.1016/j.scs.2020.102035. +
    +
    +
    +
    +
      +
    1. The R packages, either used in the creation of +bootCT or employed +in the analyses presented in this paper, are +magrittr +(Bache and Wickham 2022), gtools +(Bolker et al. 2022), pracma +(Borchers 2022), Rcpp +(Eddelbuettel 2013), +RcppArmadillo +(Eddelbuettel et al. 2023), +Rmisc (Hope 2022), +dynamac +(Jordan and Philips 2020), ARDL +(Natsiopoulos and Tzeremes 2021), aod +(Lesnoff et al. 2012), vars and +urca +(Pfaff 2008b; Pfaff 2008a), +aTSA (Qiu 2015), +tseries +(Trapletti and Hornik 2023), +reshape2, +ggplot2 and +stringr +(Wickham 2007, 2016, 2022), +tidyverse and +dplyr +(Wickham et al. 2019, 2023).↩︎

    2. If the explanatory variables are stationary \(\mathbf{A}_{xx}\) is +non-singular (\(rk(\mathbf{A}_{xx})=K\)), while when they are +integrated but without cointegrating relationship \(\mathbf{A}_{xx}\) +is a null matrix.↩︎

    3. The knowledge of the rank of the cointegrating matrix is necessary +to overcome this impasse.↩︎

    4. The latter is introduced in the ARDL equation by the operation of +conditioning \(y_t\) on the other variables \(\mathbf{x}_t\) of the +model.↩︎

    5. In fact, as +\(\boldsymbol{\omega}'\mathbf{A}_{xx}\mathbf{x}_{t} \approx I(0)\), +the conclusion that \(y_{t}\approx I(0)\) must hold. This in turn +entails that no cointegration occurs between \(y_t\) and +\(\mathbf{x}_{t}\).↩︎

    6. If the explanatory variables are stationary \(\mathbf{A}_{xx}\) is +non-singular (\(rk(\mathbf{A}_{xx})=K\)), while when they are +integrated but without cointegrating relationship \(\mathbf{A}_{xx}\) +is a null matrix.↩︎
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Vacca, et al., "bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-003,
    +  author = {Vacca, Gianmarco and Zoia, Maria and Bertelli, Stefano},
    +  title = {bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-003},
    +  doi = {10.32614/RJ-2024-003},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {39-66}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-003/RJ-2024-003.pdf b/_articles/RJ-2024-003/RJ-2024-003.pdf new file mode 100644 index 0000000000..49dbdb856c Binary files /dev/null and b/_articles/RJ-2024-003/RJ-2024-003.pdf differ diff --git a/_articles/RJ-2024-003/RJournal.sty b/_articles/RJ-2024-003/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-003/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-003/RJwrapper.md b/_articles/RJ-2024-003/RJwrapper.md new file mode 100644 index 0000000000..8ab9ddbb21 --- /dev/null +++ 
b/_articles/RJ-2024-003/RJwrapper.md @@ -0,0 +1,1657 @@ +--- +abstract: | + The Autoregressive Distributed Lag approach to cointegration or bound + testing, proposed by Pesaran in 2001, has become prominent in + empirical research. Although this approach has many advantages over + the classical cointegration tests, it is not exempt from drawbacks, + such as possible inconclusive inference and distortion in size. + Recently, Bertelli and coauthors developed a bootstrap approach to the + bound tests to overcome these drawbacks. This paper introduces the R + package bootCT, which implements this method by deriving the bootstrap + versions of the bound tests and of the asymptotic F-test on the + independent variables proposed by Sam and coauthors in 2019. As a + spinoff, a general method for generating random multivariate time + series following a given VECM/ARDL structure is provided in the + package. Empirical applications showcase the main functionality of the + package. +address: +- | + Gianmarco Vacca\ + Department of Economic Policy. Università Cattolica del Sacro Cuore\ + Largo Gemelli, 1, Milan.\ + Italy\ + (0000-0002-8996-5524)\ + [gianmarco.vacca@unicatt.it](gianmarco.vacca@unicatt.it){.uri} +- | + Maria Zoia\ + Department of Economic Policy. 
Università Cattolica del Sacro Cuore\ + Largo Gemelli, 1, Milan.\ + Italy\ + (0000-0002-8169-781X)\ + [maria.zoia@unicatt.it](maria.zoia@unicatt.it){.uri} +- | + Stefano Bertelli\ + CRO Area, Internal Validation and Controls Department, Operational + Risk and ICAAP Internal Systems, Intesa Sanpaolo, Milan\ + Viale Stelvio, 55/57, Milan.\ + Italy\ + [stefano.bertelli@intesasanpaolo.com](stefano.bertelli@intesasanpaolo.com){.uri} +author: +- by Gianmarco Vacca, Maria Zoia, Stefano Bertelli +bibliography: +- vacca-zoia-bertelli.bib +nocite: "[@RSOFT]" +title: "bootCT: An R Package for Bootstrap Cointegration Tests in ARDL + Models" +--- + +::::::::::::::: article +## Introduction {#sec:intro} + +Cointegration and error correction are fundamental concepts in the +analysis of economic data, insofar as they provide an appropriate +framework for testing economic hypotheses about growth and fluctuation. +Several approaches have been proposed in the literature to determine +whether two or more non-stationary time series are cointegrated, meaning +they share a common long-run relationship.\ +There are two basic types of tests for cointegration: single equation +tests and VAR-based tests. The former check the presence of unit roots +in cointegration residuals [see, e.g., +@engle1987co; @engleyoo87; @Mackinnon91; @gabriel2002; @cook2006power] +or test the significance of the error-correction (EC) term coefficient +[@kremers1992power; @maddala1998; @arranz2000; @ericsson2002]. The +latter, such as the @johansen1991 approach, tackle the problem of +detecting cointegrating relationships in a VAR model. This latter +approach, albeit having the advantage of avoiding the issue of +normalization, as well as allowing the detection of multiple +cointegrating vectors, is far from being perfect. In the VAR system all +variables are treated symmetrically, as opposed to the standard +univariate models that usually have a clear interpretation in terms of +exogenous and endogenous variables. 
Furthermore, in a VAR system all the +variables are estimated at the same time, which is problematic if the +relation between some variables is flawed, that is, affected by some +source of error. In this case a simultaneous estimation process tends to +propagate the error affecting one equation to the others. Furthermore, a +multidimensional VAR model employs plenty of degrees of freedom.\ +The recent cointegration approach, known as Autoregressive Distributed +Lag (ARDL) approach to cointegration or bound testing, proposed by + @pesaran2001 (PSS), falls in the former strand of literature. It has +become prominent in empirical research because it shows several +advantages with respect to traditional methods for testing +cointegration. First, it is applicable also in cases of mixed order +integrated variables, albeit with integration not exceeding the first +order. Thus, it evades the necessity of pre-testing the variables and, +accordingly, avoids some common practices that may prevent finding +cointegrating relationships, such as dropping variables or transforming +them into stationary form  [see @mcnown2018bootstrapping]. Second, +cointegration bound tests are performed in an ARDL model that allows +different lag orders for each variable, thus providing a more flexible +framework than other commonly employed approaches. Finally, unlike other +cointegration techniques, which are sensitive to the sample size, the +ARDL approach provides robust and consistent results for small sample +sizes.\ +Notably, the ARDL bound testing methodology has quickly spread in +economics and econometrics to study the cointegrating relationships +between macroeconomic and financial variables, to evaluate the long-run +impact of energy variables, or to assess recent environmental policies +and their impact on the economy. 
Among the many applications, see for +instance @haseeb2019impact +[@reda2020using; @menegaki2019ardl; @yilanci2020brics; @hussain2019environmental; @abbasi2021energy].\ +The original bound tests proposed by @pesaran2001 are an $F$-test for +the significance of the coefficients of all lagged level variables +entering the error correction term ($F_{ov}$), and a $t$-test for the +coefficient of the lagged dependent variable. When either the dependent +or the independent variables do not appear in the long-run relationship, +a degenerate case arises. The bound $t$-test provides answers on the +occurrence of a degenerate case of second type, while the occurrence of +a degeneracy case of first type can be assessed by testing whether the +dependent variable is of integration order I(1). This type of check +violates the spirit and motivation of the bound tests, which are +supposed to be applicable in situations of unknown order of integration +for the variables.\ +Recently, @mcnown2018bootstrapping pointed out how, due to the low power +problem of unit root tests, investigating the presence of a first type +degeneracy by testing the integration order of the dependent variable +may lead to incorrect conclusions. Therefore, they suggested checking +for its occurrence by testing the significance of the lagged levels of +the independent variables via an extra $F$-test ($F_{ind}$), which was +also worked out in its asymptotic version [SMK; @sam2019augmented].\ +Besides problems in testing the occurrence of degenerate cases, in +general, the main drawback of the bound tests is the occurrence of +potentially inconclusive results, if the test statistic lies between the +bounds of the test distribution under the null. Furthermore, the +asymptotic distributions of the statistics may provide a poor +approximation of the true distributions in small samples. 
Finite sample +critical values, even if only for a subset of all possible model +specifications, have been worked out in the literature [see +@mills2001real; @narayan2004crime; @kanioura2005critical; @narayan2005saving], +while [@kripfganz2020response] provided the quantiles of the asymptotic +distributions of the tests as functions of the sample size, the lag +order and the number of long-run forcing variables. However, this +relevant improvement does not eliminate the uncertainty related to the +inconclusive regions, or the existence of other critical issues related +to the underlying assumptions of the bound test framework, such as the +(weak) exogeneity of the independent variables or the non-stationarity +of the dependent variable.\ +To overcome the mentioned bound test drawbacks, [@bertelli2022bootstrap] +proposed bootstrapping the ARDL cointegration test. Inference can always +be pursued with ARDL bootstrap tests, unlike what happens with both the +PSS tests and the SMK test on the independent variables. Bootstrap ARDL +tests were first put forward by [@mcnown2018bootstrapping] in an +unconditional ARDL model, which omits the instantaneous differences of +the exogenous variables in the ARDL equation, rather than a conditional +one, as originally proposed by [@pesaran2001]. The unconditional model +is often used, for reason of practical convenience, in empirical +research. Simulation results in [@bertelli2022bootstrap] have +highlighted the importance of employing the appropriate specification, +especially under degenerate cases. In fact, it has been pointed out that +a correct detection of these cases requires the comparison of the test +outcomes in both the conditional and unconditional settings. 
Erroneous +conclusions, based exclusively on one model specification, can thus be +avoided.\ +In this paper, bootstrap bound tests, thereby including the bootstrap +versions of the $F_{ov}$, $t$ and $F_{ind}$ bound tests, are carried out +in a conditional ARDL model setting. This approach allows to overcome +the problem of inconclusive regions of the standard bound tests. A +comparison with the outcomes engendered by the unconditional ARDL +bootstrap tests is nevertheless provided for the $F_{ind}$ test, to +avoid erroneous inference in presence of degenerate cases.\ +The paper is organized as follows. Section [2](#sec:cointegration) +introduces the theoretical results of the ARDL cointegration bound +tests. Section [3](#sec:boot) details the steps carried out by the +bootstrap procedure, which allows the construction of the (bootstrap) +distribution - under the null - for the $F_{ov}$, $t$, conditional +$F_{ind}$ and unconditional $F_{ind}$ tests. Section [4](#sec:pkg) +introduces the `R` package +[**bootCT**](https://CRAN.R-project.org/package=bootCT) [@bootCT] and +its functionalities: a method for the generation of random multivariate +time series that follow a user-specified VECM/ARDL structure, with some +examples, and the main function that carries out the aforementioned +bootstrap tests, while also computing the PSS and SMK bound tests. The +trade-off between accuracy and computational time of the bootstrap +procedure is also investigated, under several scenarios in terms of +sample size and number of replications. Notably, a function that +performs the PSS bound tests is already available in the +[**dynamac**](https://CRAN.R-project.org/package=dynamac) package +[@PKGDYNAMAC], while no `R` routine has so far been implemented for the +SMK test, to the best of our knowledge. Section [5](#sec:app) gives some +empirical applications that employ the core function of the package and +its possible outputs. Section [6](#sec:end) concludes. 
Appendix +[7](#sec:appendix) briefly delves into technical details of the +conditional ARDL model and its possible specifications [^1]. + +## Cointegration bound tests in ARDL models {#sec:cointegration} + +The starting point of the approach proposed by  [@pesaran2001] is a +$(K+1)$ VAR($p$) model +$$\label{eq:var} +\mathbf{A}(L)(\mathbf{z}_t-\boldsymbol{\mu}-\boldsymbol{\eta}t)=\boldsymbol{\varepsilon}_t \enspace \enspace \enspace \boldsymbol{\varepsilon}_t\sim N(\mathbf{0}, \boldsymbol{\Sigma}),\qquad\mathbf{A}(L)=\left(\mathbf{I}_{K+1}- \sum_{j=1}^{p}\mathbf{A}_j\mathbf{L}^j\right) +\enspace \enspace \enspace t=1,2,\dots,T. (\#eq:var)$$ +Here, $\mathbf{A}_j$ are square $(K+1)$ matrices, $\mathbf{z}_t$ a +vector of $(K+1)$ variables, $\boldsymbol{\mu}$ and $\boldsymbol{\eta}$ +are $(K+1)$ vectors representing the drift and the trend respectively, +and $\det(\mathbf{A}(z))=0$ for $|z| \geq 1$. If the matrix +$\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}$ is +singular, the components of $\mathbf{z}_t$ turn out to be integrated and +possibly cointegrated.\ +The VECM representation of \@ref(eq:var) is given by (see Appendix +[7.1](#sec:appendixa) for details) +$$\label{eq:vecm} +\Delta\mathbf{z}_t=\boldsymbol{\alpha}_{0}+\boldsymbol{\alpha}_{1}t-\mathbf{A}(1)\mathbf{z}_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta \mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_t. (\#eq:vecm)$$ +Now, to study the adjustment to the equilibrium of a single variable +$y_t$, given the other $\mathbf{x}_t$ variables, the vectors +$\mathbf{z}_t$ and $\boldsymbol{\varepsilon}_t$ are partitioned +$$\label{eq:vecpart} +\mathbf{z}_t=\begin{bmatrix} +\underset{(1,1)}{y_{t}} \\ \underset{(K,1)}{\mathbf{x}_{t}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\varepsilon}_t=\begin{bmatrix} +\underset{(1,1)}{\varepsilon_{yt}} \\ \underset{(K,1)}{\boldsymbol{\varepsilon}_{xt}} +\end{bmatrix}. 
(\#eq:vecpart)$$ +The matrix $\mathbf{A}(1)$, which is assumed to be singular to allow +cointegration, is partitioned conformably to $\mathbf{z}_{t}$ as [^2]\ + +$$\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}.$$ +Under the assumption +$$\label{eq:normerr} +\boldsymbol{\varepsilon}_t \sim N\Bigg(\mathbf{0}, \begin{bmatrix} +\underset{(1,1)}{\sigma_{yy}}& \underset{(1,K)}{\boldsymbol{\sigma}_{yx}'} \\ \underset{(K,1)}{\boldsymbol{\sigma}_{xy}} & \underset{(K,K)}{\boldsymbol{\Sigma}_{xx}} \end{bmatrix}\Bigg), (\#eq:normerr)$$ +the following holds +$$\label{eq:epsilonx} +\varepsilon_{yt}=\boldsymbol{\omega}'\boldsymbol{\varepsilon}_{xt}+\nu_{yt} \sim N(0,\sigma_{y.x}), (\#eq:epsilonx)$$ +where +$\sigma_{y.x}=\sigma_{yy}-\boldsymbol{\omega}'\boldsymbol{\sigma}_{xy}$ +with +$\boldsymbol{\omega}'=\boldsymbol{\sigma}'_{yx}\boldsymbol{\Sigma}^{-1}_{xx}$, +and $\nu_{yt}$ is independent of $\boldsymbol{\varepsilon}_{xt}$.\ +Substituting \@ref(eq:epsilonx) into \@ref(eq:vecm) and assuming that +the $\mathbf{x}_{t}$ variables are exogenous towards the ARDL parameters +(that is, setting $\mathbf{a}_{xy}=\mathbf{0}$ in $\mathbf{A}(1)$) +yields the system (see Appendix [7.1](#sec:appendixa) for details) +$$\label{eq:ardl} + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt} (\#eq:ardl)$$ + +$$\label{eq:marg} +\Delta\mathbf{x}_{t} += \boldsymbol{\alpha}_{0x} +\boldsymbol{\alpha}_{1x}t+ \mathbf{A}_{(x)}\mathbf{z}_{t-1}+ \boldsymbol{\Gamma}_{(x)}(L)\Delta\mathbf{z}_t+ \boldsymbol{\varepsilon}_{xt}, (\#eq:marg)$$ +where +$$\label{eq:ardlgamma} +\boldsymbol\gamma_{y.x,j}'=\boldsymbol\gamma_{y,j}'-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x),j} (\#eq:ardlgamma)$$ + +$$\label{eq:ardldet} 
+\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x}, (\#eq:ardldet)$$ +and where the error correction term, $EC_{t-1}$, expressing the long-run +equilibrium relationship between $y_{t}$ and $\mathbf{x}_{t}$, is given +by +$$\label{eq:ec} +EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}, (\#eq:ec)$$ +with +$$\label{eq:const} +\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}, \enspace\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}'}_{y.x}}{a_{yy}}=-\frac{\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}}{a_{yy}}. (\#eq:const)$$ +Thus, no cointegration occurs when +$\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}$ or $a_{yy}=0$ . These two +circumstances are referred to as degenerate case of second and first +type, respectively. Degenerate cases imply no cointegration between +$y_{t}$ and $\mathbf{x}_{t}$.\ +To test the hypothesis of cointegration between $y_{t}$ and +$\mathbf{x}_{t}$, @pesaran2001 proposed an $F$-test, $F_{ov}$ hereafter, +based on the hypothesis system +$$\begin{aligned} +\label{eq:h0sys} +H_0: a_{yy}=0 \; \cap \;\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}\\ +H_1: a_{yy} \neq 0 \; \cup \;\widetilde{\mathbf{a}}_{y.x}\neq \mathbf{0}. +\end{aligned} (\#eq:h0sys)$$ +Note that $H_{1}$ covers also the degenerate cases +$$\begin{aligned} +\label{eq:h0deg} +H_1^{y.x}: a_{yy}=0 \; , \;\widetilde{\mathbf{a}}_{y.x}\neq\mathbf{0}\\ +H_1^{yy}: a_{yy} \neq 0 \; , \;\widetilde{\mathbf{a}}_{y.x} = \mathbf{0}. +\end{aligned} (\#eq:h0deg)$$ +The exact distribution of the $F$ statistic under the null is unknown, +but it is limited from above and below by two asymptotic distributions: +one corresponding to the case of stationary regressors, and another +corresponding to the case of first-order integrated regressors. 
As a +consequence, the test is called bound test and has an inconclusive area. +[^3]\ + @pesaran2001 worked out two sets of (asymptotic) critical values: one, +$\{\tau_{L,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(0)$ and +another, $\{\tau_{U,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(1)$. +These values vary in accordance with the number of regressors in the +ARDL equation, the sample size and the assumptions made about the +deterministic components (intercept and trend) of the data generating +process.\ +In this regard,  @pesaran2001 introduced five different specifications +for the ARDL model, depending on its deterministic components, which are +(see Appendix [7.2](#sec:appendixb) for details) + +I. *No intercept and no trend* + $$\begin{aligned} + \label{eq:case1} + \Delta y_t=-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, + \end{aligned} (\#eq:case1)$$ + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$,\ + +II. *Restricted intercept and no trend* + $$\begin{aligned} + \label{eq:case2} + \Delta y_{t}= + -a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, + \end{aligned} (\#eq:case2)$$ + where + $EC_{t-1}=y_{t-1}-\theta_{0}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + The intercept extracted from the EC term is + $\alpha_{0.y}^{EC} = a_{yy}\theta_0$. + +III. *Unrestricted intercept and no trend* + $$\begin{aligned} + \label{eq:case3} + \Delta y_{t} + =\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, + \end{aligned} (\#eq:case3)$$ + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + +IV. 
*Unrestricted intercept, restricted trend* + $$\begin{aligned} + \label{eq:case4} + \Delta y_{t}= + \alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, + \end{aligned} (\#eq:case4)$$ + where + $EC_{t-1}=y_{t-1}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + The trend extracted from the EC term is + $\alpha_{1.y}^{EC} = a_{yy}\theta_1$. + +V. *Unrestricted intercept, unrestricted trend* + $$\begin{aligned} + \label{eq:case5} + \Delta y_{t} + =\alpha_{0.y}+\alpha_{1.y}t + -a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, + \end{aligned} (\#eq:case5)$$ + where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. + +The model in \@ref(eq:ardl) proposed by  @pesaran2001 represents the +correct framework in which to carry out bound tests. However, bound test +are often performed in an unconditional ARDL model setting, specified as +$$\label{eq:ardluc} + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{j}\Delta\mathbf{z}_{t-j}+\varepsilon_{yt}, (\#eq:ardluc)$$ +which omits the term $\boldsymbol{\omega}'\Delta\mathbf{x}_{t}$.\ +[@bertelli2022bootstrap] have highlighted that bootstrap tests performed +in these two ARDL specifications can lead to contrasting results. To +explain this divergence, note that the conditional model makes use of +the following vector in the EC term +$$\widetilde{\mathbf{a}}_{y.x}'=\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}$$ +(divided by $a_{yy}$, see \@ref(eq:const)) to carry out bound tests, +while the unconditional one only uses the vector $\mathbf{a}_{yx}'$, +(divided by $a_{yy}$), since it neglects the term +$\boldsymbol{\omega}'\mathbf{A}_{xx}$. [^4] This can lead to contrasting +inference in two instances. 
The first happens when a degeneracy of first +type occurs in the conditional model, that is +$$\label{eq:deg1cond} +\widetilde{\mathbf{a}}_{y.x}'=\mathbf{0}, (\#eq:deg1cond)$$ +because +$$\mathbf{a}_{yx}'=\boldsymbol{\omega}'\mathbf{A}_{xx}.$$ +In this case, the conditional model rejects cointegration, while the +unconditional one concludes the opposite. The other case happens when a +degeneracy of first type occurs in the unconditional model, that is +$$\label{eq:deg1uc} +\mathbf{a}_{yx}'=\mathbf{0}, (\#eq:deg1uc)$$ +but +$$\widetilde{\mathbf{a}}_{y.x}'=\boldsymbol{\omega}'\mathbf{A}_{xx} \neq \mathbf{0}.$$ +In this case, the unconditional model rejects cointegration, while the +conditional one concludes for the existence of cointegrating +relationships, which are however spurious. Only a comparison of the +outcomes of the $F_{ind}$ test performed in both the conditional and +unconditional ARDL equation can help to disentangle this problem. [^5]\ +In the following, bootstrap tests are carried out in the conditional +ARDL model \@ref(eq:ardl). However, when a degeneracy of first type +occurs in the unconditional model, the outcomes of the $F_{ind}$ +bootstrap test performed in both the conditional and unconditional +settings are provided. This, as previously outlined, is performed to +avoid the acceptance of spurious long-run relationships among the +dependent variable and the independent variables. + +## The new bootstrap procedure {#sec:boot} + +The bootstrap procedure here proposed focuses on a ARDL model specified +as in \@ref(eq:case1)-\@ref(eq:case5), depending on the assumptions on +the deterministic components.\ +The bootstrap procedure consists of the following steps: + +1. The ARDL model is estimated via OLS and the related test statistics + $F_{ov}$, $t$ or $F_{ind}$ are computed. + +2. 
In order to construct the distribution of each test statistic under + the corresponding null, the same model is re-estimated imposing the + appropriate restrictions on the coefficients according to the test + under consideration. + +3. Following [@mcnown2018bootstrapping], the ARDL restricted residuals + are then computed. For example, under Case III, the residuals are + $$\label{eq:resfov} + \widehat{\nu}_{yt}^{F_{ov}}=\Delta y_{t}-\widehat{\alpha}_{0.y}-\sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} (\#eq:resfov)$$ + + $$\label{eq:rest} + \widehat{\nu}_{yt}^{t}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{\widetilde{\mathbf{a}}}'_{y.x}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} (\#eq:rest)$$ + + $$\label{eq:resfind} + \widehat{\nu}_{yt}^{F_{ind}}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{a}_{yy}y_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t}. (\#eq:resfind)$$ + Here, the apex $"\widehat{\,\,.\,\,}"$ denotes the estimated + parameters. The other cases can be dealt with in a similar manner. + +4. The VECM model + $$\label{eq:vecmhat} + \Delta\mathbf{z}_{t}=\boldsymbol{\alpha}_{0}-\mathbf{A}\mathbf{z}_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_{t} (\#eq:vecmhat)$$ + is estimated as well (imposing weak exogeneity), and the residuals + $$\label{eq:resvecm} + \widehat{\boldsymbol{\varepsilon}}_{xt}= \Delta\mathbf{x}_{t}-\widehat{\boldsymbol{\alpha}}_{0x}+\widehat{\mathbf{A}}_{xx}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\Gamma}}_{(x)j}\Delta\mathbf{z}_{t-j} (\#eq:resvecm)$$ + are computed. 
This approach guarantees that the residuals + $\widehat{\boldsymbol{\varepsilon}}_{xt}$, associated to the + variables $\mathbf{x}_{t}$ explained by the marginal model + \@ref(eq:marg), are uncorrelated with the ARDL residuals + $\widehat{\nu}_{yt}^{.}$. + +5. A large set of $B$ bootstrap replicates are sampled from the + residuals calculated as in \@ref(eq:resfov),\@ref(eq:rest), + \@ref(eq:resfind) and \@ref(eq:resvecm). In each replication, the + following operations are carried out: + + 1. Each set of $(T-p)$ resampled residuals (with replacement) + $\widehat{\boldsymbol{\nu}}_{zt}^{(b)}=(\widehat{\nu}_{yt}^{(b)},\widehat{\boldsymbol{\varepsilon}}_{xt}^{(b)})$ + is re-centered [see @davidson2005case] + $$\begin{aligned} + \dot{\widehat{\nu}}^{(b)}_{yt}&=\widehat{\nu}^{(b)}_{yt} -\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\nu}^{(b)}_{yt} \label{eq:recentery} \\ + \dot{\widehat{\boldsymbol{\varepsilon}}}^{b}_{x_{i}t}&=\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}-\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}\qquad i=1,\dots,K.\label{eq:recenterx} + \end{aligned} (\#eq:recentery)$$ + + 2. A sequential set of $(T-p)$ bootstrap observations, + $y^{*}_{t}\enspace, \mathbf{x}^{*}_{t}\enspace t=p+1,\dots,T$, + is generated as follows + $$y^{*}_{t}=y^{*}_{t-1}+\Delta y^{*}_{t}, \enspace \enspace \mathbf{x}^{*}_{t}=\mathbf{x}^{*}_{t-1}+\Delta \mathbf{x}^{*}_{t},$$ + where $\Delta \mathbf{x}^{*}_{t}$ are obtained from + \@ref(eq:resvecm) and $\Delta y^{*}_{t}$ from either + \@ref(eq:resfov), \@ref(eq:rest) or \@ref(eq:resfind) after + replacing in each of these equations the original residuals with + the bootstrap ones.\ + The initial conditions, that is the observations before $t=p+1$, + are obtained by drawing randomly $p$ observations in block from + the original data, so as to preserve the data dependence + structure. + + 3. 
An unrestricted ARDL model is estimated via OLS using the + bootstrap observations, and the statistics $F_{ov}^{(b),H_0}$, + $t^{(b),H_0}$ and $F_{ind}^{(b),H_0}$ are computed. + +6. The bootstrap distributions of + $\big\{F_{ov}^{(b),H_0}\big\}_{b=1}^B$, + $\big\{F_{ind}^{(b),H_0}\big\}_{b=1}^B$ and + $\big\{t^{(b),H_0}\big\}_{b=1}^B$ under the null are then employed + to determine the critical values of the tests. By denoting with + $M^*_b$ the ordered bootstrap test statistic, and with $\alpha$ the + nominal significance level, the bootstrap critical values are + determined as follows + $$\label{eq:bootf} + c^*_{\alpha,M}=\min\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{M^*_b >c\}} \leq\alpha\bigg\} + \qquad M\in\{F_{ov},F_{ind}\} (\#eq:bootf)$$ + for the $F$ tests and + $$\label{eq:boott} + c^*_{\alpha,t}=\max\bigg\{c:\sum_{b=1}^{B}\mathbf{1}_{\{t^*_b < c\}} \leq\alpha\bigg\} (\#eq:boott)$$ + for the $t$ test. The coefficients of variation of the bootstrap +critical values decrease as $B$ grows and become reasonably small for $T>80$, at +the 5% significance level. Therefore, a number of +bootstrap replicates of at least $B=1000$ is recommended for larger sample sizes, or at +least $B=2000$ for smaller samples. The analysis has been carried out +using an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of +RAM. + +::: {#tab:exec} + ------------------------------------------------------------------------------------------------------ + $T$ $B$ Exec. 
Time (sec) $cv^{(F_{ov})}(5\%)$ $cv^{(F_{ov})}(2.5\%)$ $cv^{(F_{ov})}(1\%)$ + ----- ------ ------------------ ---------------------- ------------------------ ---------------------- + 50 200 23.38 8.648 10.925 13.392 + + 50 500 48.37 6.312 6.952 8.640 + + 50 1000 96.65 4.806 5.613 6.288 + + 50 2000 231.15 4.255 4.226 4.946 + + 80 200 23.46 7.251 8.936 11.263 + + 80 500 50.19 4.998 6.220 7.946 + + 80 1000 143.00 3.882 4.453 5.305 + + 80 2000 255.64 2.912 3.623 4.518 + + 100 200 37.89 7.707 8.583 10.955 + + 100 500 52.86 4.691 5.304 7.557 + + 100 1000 184.51 3.512 4.567 5.695 + + 100 2000 212.65 3.519 3.674 4.185 + + 200 200 35.46 6.644 7.173 10.365 + + 200 500 76.78 4.734 5.355 6.225 + + 200 1000 148.25 3.124 4.177 5.034 + + 200 2000 484.51 2.811 3.361 3.907 + + 500 200 54.47 6.641 8.694 10.414 + + 500 500 133.17 5.137 5.816 6.408 + + 500 1000 271.87 3.905 4.585 5.283 + + 500 2000 561.71 3.221 3.490 4.145 + ------------------------------------------------------------------------------------------------------ + + : Table 1: Average execution times (in seconds) of the `boot_ardl` + function, for different combinations of sample size $T$ and bootstrap + replicates $B$. Coefficients of variation ($cv$) reported for the + $F_{ov}$ bootstrap critical values at level 5%, 2.5% and 1%. +::: + +## Empirical applications {#sec:app} + +This section provides two illustrative application which highlight the +performance of the bootstrap ARDL tests. + +### An application to the German macroeconomic dataset + +In the first example, the occurrence of a long-run relationship between +consumption \[C\], income \[INC\], and investment \[INV\] of Germany has +been investigated via a set of ARDL models, where each variable takes in +turn the role of dependent one, while the remaining are employed as +independent. The models have been estimated by employing the dataset of +@lutkepohl2005 which includes quarterly data of the series over the +years 1960 to 1982. 
The data have been employed in logarithmic form. +Figure \@ref(fig:figplotemp) displays these series over the sample +period.\ +Before applying the bootstrap procedure, the order of integration of +each series has been analyzed. Table \@ref(tab:adf) shows the results of +ADF test performed on both the series and their first-differences ($k=3$ +maximum lags). The results confirm the applicability of the ARDL +framework as no series is integrated of order higher than one.\ +The following ARDL equations have been estimated: + +1. First ARDL equation (C | INC, INV): + $$\begin{aligned} + \Delta \log \text{C}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{C}_{t-1} - {a}_{y.x_1}\log \text{INC}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{INC}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + + \end{aligned}$$ + +2. Second ARDL equation (INC | C, INV): + $$\begin{aligned} + \Delta \log \text{INC}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INC}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + + \end{aligned}$$ + +3. 
Third ARDL equation (INV | C, INC): + $$\begin{aligned} + \Delta \log \text{INV}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INV}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INC}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INV}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INC}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INC}_{t}+\nu_{t}. + + \end{aligned}$$ + +Table \@ref(tab:est) shows the estimation results for each ARDL and VECM +model. It is worth noting that the instantaneous difference of the +independent variables are highly significant in each conditional ARDL +model. Thus, neglecting these variables in the ARDL equation, as happens +in the unconditional version of the model, may potentially lead to +biased estimates and incorrect inference. For the sake of completeness, +also the results of the marginal VECM estimation are reported for each +model.\ +The code to prepare the data, available in the package as the +`ger_macro` dataset, is: + +``` r + data("ger_macro") + LNDATA = apply(ger_macro[,-1], 2, log) + col_ln = paste0("LN", colnames(ger_macro)[-1]) + LNDATA = as.data.frame(LNDATA) + colnames(LNDATA) = col_ln +``` + +Then, the `boot_ardl` function is called, to perform the bootstrap +tests. In the code chunk below, Model I is considered. 
+ +``` r + set.seed(999) + BCT_res_CONS = boot_ardl(data = LNDATA, + yvar = "LNCONS", + xvar = c("LNINCOME", "LNINVEST"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) +``` + +to which follows the call to the `summary` function + +``` r + summary(BCT_res_CONS, out = "ARDL") + summary(BCT_res_CONS, out = "VECM") + summary(BCT_res_CONS, out = "cointVECM") + summary(BCT_res_CONS, out = "cointARDL") +``` + +The first summary line displays the output in the ARDL column of Table +\@ref(tab:est) and the second column of Table \@ref(tab:cointbig), Model +I. The second line corresponds to the VECM columns of Table +\@ref(tab:est), Model I - only for the independent variables. The +information on the rank of the $\mathbf A_{xx}$ in Table \@ref(tab:est) +is inferred from the third line. Finally, the fourth summary line +corresponds to the test results in Table \@ref(tab:cointbig), Model I. A +textual indication of the presence of spurious cointegration is +displayed at the bottom of the `"cointARDL"` summary, if detected.\ +In this example, the bootstrap and bound testing procedures are in +agreement only for model I, indicating the existence of a cointegrating +relationship. Additionally, no spurious cointegration is detected for +this model. As for models II and III, the null hypothesis is not +rejected by the bootstrap tests, while the PSS and SMG bound tests fail +to give a conclusive answer in the $F_{ind}$ test.\ +The running time of the entire analysis is of roughly 11 minutes, using +an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM. 
+ +:::: center +::: {#tab:adf} + ----------------------------------------------------------------------------------- + level variable first difference + -------------------- ----- ---------------- --------- ------------------ ---------- + Series lag ADF p.value ADF p-value + + $\log\text{C}_t$ 0 -1.690 0.450 -9.750 $< 0.01$ + + 1 -1.860 0.385 -5.190 $< 0.01$ + + 2 -1.420 0.549 -3.130 0.030 + + 3 -1.010 0.691 -2.720 0.080 + + $\log\text{INC}_t$ 0 -2.290 0.217 -11.140 $<0.01$ + + 1 -1.960 0.345 -7.510 $< 0.01$ + + 2 -1.490 0.524 -5.120 $< 0.01$ + + 3 -1.310 0.587 -3.290 0.020 + + $\log\text{INV}_t$ 0 -1.200 0.625 -8.390 $< 0.01$ + + 1 -1.370 0.565 -5.570 $< 0.01$ + + 2 -1.360 0.570 -3.300 0.020 + + 3 -1.220 0.619 -3.100 0.032 + ----------------------------------------------------------------------------------- + + : Table 2: ADF preliminary test (null hypothesis: random walk with + drift). +::: +:::: + +::: center +```{r figplotemp, echo=FALSE , fig.cap="log-consumption/investment/income graphs (level variables and first differences). 
Made with ggplot.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/tsgraph.png")) +``` +::: + +:::: landscape +::: {#tab:est} + ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Model I Model II Model III + ------------------------------ ------------------------ -------------------------- -------------------------- -------------------------- ------------------------- -------------------------- -------------------------- ------------------------- -------------------------- + ARDL VECM ARDL VECM ARDL VECM + + $\Delta\log\text{C}_t$ $\Delta\log\text{INV}_t$ $\Delta\log\text{INC}_t$ $\Delta\log\text{INC}_t$ $\Delta\log\text{C}_t$ $\Delta\log\text{INV}_t$ $\Delta\log\text{INV}_t$ $\Delta\log\text{C}_t$ $\Delta\log\text{INC}_t$ + + $\log\text{C}_{t-1}$ + + (0.055) + + (0.081) + + (0.0126) + + (0.0540) + + (0.339) + + (0.0704) + + (0.0796) + + $\log\text{INC}_{t-1}$ + + (0.055) + + (0.054) + + (0.014) + + (0.079) + + (0.340) + + (0.0681) + + (0.0772) + + $\log\text{INV}_{t-1}$ + + (0.011) + + (0.063) + + (0.017) + + (0.0135) + + (0.0142) + + (0.0607) + + (0.060) + + $\Delta\log\text{C}_{t-1}$ + + (0.079) + + (0.442) + + (0.113) + + (0.1086) + + (0.442) + + (0.441) + + (0.1142) + + $\Delta\log\text{C}_{t-2}$ + + (0.431) + + (0.4345) + + $\Delta\log\text{INC}_{t-1}$ + + (0.1095) + + $\Delta\log\text{INC}_{t-2}$ + + (0.0958) + + (0.0912) + + $\Delta\log\text{INV}_{t-1}$ + + (0.111) + + (0.029) + + (0.1097) + + (0.1075) + + (0.0282) + + $\Delta\log\text{INV}_{t-2}$ + + (0.027) + + (0.0245) + + (0.0223) + + (0.0266) + + $\Delta\log\text{C}_t$ + + (0.1093) + + (0.5425) + + $\Delta\log\text{INC}_t$ + + (0.074) + + (0.4726) + + $\Delta\log\text{INV}_t$ + + (0.019) + + (0.025) + + 
const. + + (0.013) + + (0.066) + + (0.017) + + (0.018) + + (0.0155) + + (0.0666) + + (0.072) + + (0.0157) + + (0.0177) + + J-test $rk(\mathbf{A_{xx}})=2$ $rk(\mathbf{A_{xx}})=2$ $rk(\mathbf{A_{xx}})=2$ + ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 3: Conditional ARDL and VECM results for the + consumption/income/investment dataset, along with rank of the + $\mathbf A_{xx}$ matrix via the Johansen (J) test.\ + Significance codes: (\*\*\*) 1%; (\*\*) 5%; (.) 10%. +::: +:::: + +::: {#tab:cointbig} + ------------------------------------------------------------------------------------------------------------------- + PSS / SMG Threshold Outcome + ------- --------- ----------- ----------------------- --------------------- --------- ----------- --------- ------- + Model Lags Test Boot. Critical Values I(0) 5% I(1) 5% Statistic Boot Bound + + I (1,0,0) $F_{ov}$ 3.79 3.79 4.85 10.75 Y Y + + $t$ -2.88 -2.86 -3.53 -5.608 + + $F_{ind}$ 4.92 3.01 5.42 15.636 + + II (1,1,0) $F_{ov}$ 5.79 3.79 4.85 2.867 N U + + $t$ -3.69 -2.86 -3.53 -2.315 + + $F_{ind}$ 7.38 3.01 5.42 3.308 + + III (1,1,0) $F_{ov}$ 5.50 3.79 4.85 3.013 N U + + $t$ -3.32 -2.86 -3.53 -2.020 + + $F_{ind}$ 6.63 3.01 5.42 4.189 + ------------------------------------------------------------------------------------------------------------------- + + : Table 4: Cointegration analysis for the three ARDL equations in the + German macroeconomic data. 
The optimal number of ARDL lags in the + short-run - in the form $(y,x_1,x_2)$, matching the model definition - + bootstrap critical values, bound test thresholds and test statistics + for each test are shown (case III).\ + The outcome columns draw conclusions on each type of model (bootstrap + or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of + type 1, D2 = degenerate of type 2, U = inconclusive inference. +::: + +### An application on Italian Macroeconomic Data + +Following @bertelli2022bootstrap, the relationship between foreign +direct investment \[FDI\], exports \[EXP\], and gross domestic product +\[GDP\] in Italy is investigated. The data of these three yearly +variables have been retrieved from the World Bank Database and cover the +period from 1970 to 2020. In the analysis, the log of the variables has +been used and \[EXP\] and \[FDI\] have been adjusted using the GDP +deflator. Figure \@ref(fig:figplotemp2) displays these series over the +sample period. + +::: center +```{r figplotemp2, echo=FALSE , fig.cap="log-GDP/export/investment graphs (level variables and first differences). Made with ggplot.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("figures/tsgraph2.png")) +``` +::: + +Table \@ref(tab:gdp1) shows the outcomes of the ADF test performed on +each variable, which ensures that the integration order is not higher +than one for all variables. Table \@ref(tab:cointbig2) shows the results +of bound and bootstrap tests performed in ARDL model by taking each +variable, in turn, as the dependent one. The following ARDL equations +have been estimated: + +1. 
First ARDL equation (GDP | EXP, FDI):
+    $$\begin{aligned}
+        \Delta \log \text{GDP}_{t}&=\alpha_{0.y} -
+        a_{yy} \log \text{GDP}_{t-1} - {a}_{y.x_1}\log \text{EXP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber
+        &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{GDP}_{t-j} +
+        \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{EXP}_{t-j} +
+        \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber
+        &\omega_1 \Delta\log \text{EXP}_{t}+
+        \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}.
+    
+    \end{aligned}$$
+    For this model, a degenerate case of the first type can be
+    observed, while the simpler bound testing procedure does not signal
+    cointegration.
+
+2.  Second ARDL equation (EXP | GDP, FDI):
+    $$\begin{aligned}
+        \Delta \log \text{EXP}_{t}&=\alpha_{0.y} -
+        a_{yy} \log \text{EXP}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber
+        &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{EXP}_{t-j} +
+        \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} +
+        \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber
+        &\omega_1 \Delta\log \text{GDP}_{t}+
+        \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}.
+    
+    \end{aligned}$$
+    For this model, the ARDL bootstrap test indicates absence of
+    cointegration, while the bound testing approach is inconclusive for
+    the $F_{ind}$ test.
+
+3.  Third ARDL equation (FDI | GDP, EXP):
+    $$\begin{aligned}
+        \Delta \log \text{FDI}_{t}&=\alpha_{0.y} -
+        a_{yy} \log \text{FDI}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{EXP}_{t-1} +\\\nonumber
+        &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{FDI}_{t-j} +
+        \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} +
+        \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{EXP}_{t-j} +\\\nonumber
+        &\omega_1 \Delta\log \text{GDP}_{t}+
+        \omega_2 \Delta\log \text{EXP}_{t}+\nu_{t}.
+    
+    \end{aligned}$$
+    For this model, the long-run cointegrating relationship is confirmed
+    using both bootstrap and bound testing. 
No spurious cointegration is
+    detected.
+
+The code to load the data and perform the analysis (e.g. for Model I)
+is:
+
+``` r
+  data("ita_macro")
+  BCT_res_GDP = boot_ardl(data = ita_macro,
+                          yvar = "LGDP",
+                          xvar = c("LEXP", "LFI"),
+                          maxlag = 5,
+                          a.ardl = 0.1,
+                          a.vecm = 0.1,
+                          nboot = 2000,
+                          case = 3,
+                          a.boot.H0 = c(0.05),
+                          print = T)
+```
+
+For the sake of simplicity, the conditional ARDL and VECM marginal
+model outputs included in each cointegrating analysis are omitted. The
+summary for the cointegration tests for Model I is called via
+
+``` r
+  summary(BCT_res_GDP, out = "ARDL") # extract lags
+  summary(BCT_res_GDP, out ="cointARDL") # ARDL cointegration
+```
+
+This empirical application further highlights the importance of dealing
+with inconclusive inference via the bootstrap procedure, while naturally
+including the effect of conditioning in the ARDL model, as highlighted
+in @bertelli2022bootstrap.
+
+::: {#tab:gdp1}
+  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+                                 No Drift, No Trend                                             Drift, No Trend                                 Drift and Trend                               
+  --------------------------- -------------------- ---------------------------- --------- --------- ----------------- --------- --------- --------- ----------------- --------- --------- ---------
+  Variable                         Lag = 0              Lag = 1                   Lag = 2   Lag = 3   Lag = 0           Lag = 1   Lag = 2   Lag = 3   Lag = 0           Lag = 1   Lag = 2   Lag = 3
+
+  $\log \text{GDP}_t$              0.99                 0.974                     0.941     0.796     $<0.01$           $<0.01$   $<0.01$   0.084     0.99              0.99      0.99      0.99
+
+  $\log \text{FDI}_t$              0.572                0.599                     0.675     0.725     $<0.01$           0.0759    0.3199    0.5174    $<0.01$           0.013     0.151     0.46
+
+  $\log \text{EXP}_t$              0.787                0.71                      0.698     0.684     0.479             0.288     0.467     0.433     0.629             0.35      0.463     0.379
+
+  $\Delta\log \text{GDP}_t$        $<0.01$              $<0.01$                   0.0429    0.0402    $<0.01$           0.0861    0.3989    0.4267    $<0.01$           $<0.01$   0.0166    0.017
+
+  $\Delta\log \text{FDI}_t$        $<0.01$              $<0.01$                   $<0.01$   $<0.01$   $<0.01$ 
$<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ + + $\Delta\log \text{EXP}_t$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ $<0.01$ 0.0336 0.0315 + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 5: ADF preliminary test for the second example. +::: + +::: {#tab:cointbig2} + ------------------------------------------------------------------------------------------------------------------- + PSS / SMG Threshold Outcome + ------- --------- ----------- ----------------------- --------------------- --------- ----------- --------- ------- + Model Lags Test Boot. Critical Values I(0) 5% I(1) 5% Statistic Boot Bound + + I (1,1,0) $F_{ov}$ 3.730 4.070 5.190 9.758 D1 N + + $t$ -2.020 -2.860 -3.530 -2.338 + + $F_{ind}$ 3.710 3.220 5.620 2.273 + + II (1,0,0) $F_{ov}$ 5.400 4.070 5.190 2.649 N U + + $t$ -3.380 -2.860 -3.530 -1.889 + + $F_{ind}$ 5.630 3.220 5.620 3.481 + + III (1,0,0) $F_{ov}$ 5.360 4.070 5.190 6.716 Y Y + + $t$ -3.550 -2.860 -3.530 -4.202 + + $F_{ind}$ 6.500 3.220 5.620 7.017 + ------------------------------------------------------------------------------------------------------------------- + + : Table 6: Cointegration analysis for the three ARDL equations in the + Italian macroeconomic data. The optimal number of ARDL lags in the + short-run - in the form $(y,x_1,x_2)$, matching the model definition - + bootstrap critical values, bound test thresholds and test statistics + for each test are shown (case III).\ + The outcome columns draw conclusions on each type of model (bootstrap + or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of + type 1, D2 = degenerate of type 2, U = inconclusive inference. 
+::: + +## Conclusion {#sec:end} + +The [**bootCT**](https://CRAN.R-project.org/package=bootCT) package +allows the user to perform bootstrap cointegration tests in ARDL models +by overcoming the problem of inconclusive inference which is a +well-known drawback of standard bound tests. The package makes use of +different functions. The function `boot_ardl` performs the bootstrap +tests, and it acts as a wrapper of both the bootstrap and the standard +bound tests, including also the Johansen test on the independent +variables of the model. Finally, it also performs the bound $F$-test on +the lagged independent variables, so far not available in other extant +`R` packages. The function `sim_vecm_ardl`, which allows the simulation +of multivariate time series data following a user-defined DGP, enriches +the available procedures for multivariate data generation, while the +function `lag_mts` provides a supporting tool in building datasets of +lagged variables for any practical purpose. Finally, the use of Rcpp +functions gives a technical advantage in terms of computational speed, +performing the bootstrap analysis within an acceptable time frame. + +## Appendix {#sec:appendix} + +### Section A - the methodological framework of (conditional) VECM and ARDL models {#sec:appendixa} + +Expanding the matrix polynomial $\mathbf{A}(z)$ about $z=1$, yields +$$\label{eq:polyamat} +\mathbf{A}(z)=\mathbf{A}(1)z+(1-z)\boldsymbol{\Gamma}(z), (\#eq:polyamat)$$ +where +$$\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}$$ + +$$\label{eq:polygamma} +\boldsymbol{\Gamma}(z)=\mathbf{I}_{K+1}-\sum_{i=1}^{p-1}\boldsymbol{\Gamma}_{i}z^i, \enspace \enspace \boldsymbol{\Gamma}_{i}=-\sum_{j=i+1}^{p}\mathbf{A}_j. 
(\#eq:polygamma)$$
+The VECM model \@ref(eq:vecm) follows accordingly, and
+$$\label{eq:vecmint}
+\boldsymbol{\alpha}_0=\mathbf{A}(1)\boldsymbol{\mu}+(\boldsymbol{\Gamma}(1)-\mathbf{A}(1))\boldsymbol{\eta}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\mathbf{A}(1)\boldsymbol{\eta}. (\#eq:vecmint)$$
+Assume that $\mathbf{A}(1)$ is singular and that the variables
+$\mathbf{x}_{t}$ are cointegrated. This entails the following
+$$\begin{aligned}
+\label{eq:factt}
+ \mathbf{A}(1)=&\begin{bmatrix}
+\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}}
+\end{bmatrix}=\underset{(K+1,r+1)}{\mathbf{B}}\underset{(r+1,K+1)}{\mathbf{C}'}=\begin{bmatrix}b_{yy} & \mathbf{b}_{yx}'\\ \mathbf{b}_{xy} & \mathbf{B}_{xx} \end{bmatrix}\begin{bmatrix}c_{yy} & \mathbf{c}_{yx}'\\ \mathbf{c}_{xy} & \mathbf{C}_{xx}'\end{bmatrix}= \nonumber\\
+=&\begin{bmatrix}b_{yy}c_{yy}+\mathbf{b}_{yx}'\mathbf{c}_{xy} & b_{yy}\mathbf{c}_{yx}'+\mathbf{b}_{yx}'\mathbf{C}_{xx}'\\
+\mathbf{b}_{xy}c_{yy}+\mathbf{B}_{xx}\mathbf{c}_{xy} & \mathbf{b}_{xy}\mathbf{c}_{yx}'+ \mathbf{A}_{xx} \end{bmatrix}, \enspace \enspace \enspace rk(\mathbf{A}(1))=rk(\mathbf{B})=rk(\mathbf{C}),
+\end{aligned} (\#eq:factt)$$
+where $\mathbf{B}$ and $\mathbf{C}$ are full column rank matrices
+arising from the rank-factorization of
+$\mathbf{A}(1)=\mathbf{B}\mathbf{C}'$ with $\mathbf{C}$ the matrix of the
+long-run relationships of the process and $\mathbf{B}_{xx}$,
+$\mathbf{C}_{xx}$ arising from the rank factorization of
+$\mathbf{A}_{xx}=\mathbf{B}_{xx}\mathbf{C}_{xx}'$, with
+$rk(\mathbf{A}_{xx})=rk(\mathbf{B}_{xx})=rk(\mathbf{C}_{xx})=r$ [^6].\
+By partitioning the vectors $\boldsymbol{\alpha}_{0}$,
+$\boldsymbol{\alpha}_{1}$, the matrix $\mathbf{A}(1)$ and the polynomial
+matrix $\boldsymbol{\Gamma}(L)$ conformably to $\mathbf{z}_{t}$, as
+follows
+$$\label{eq:alphapart}
+\boldsymbol{\alpha}_0=\begin{bmatrix}
+\underset{(1,1)}{\alpha_{0y}} \\ 
\underset{(K,1)}{\boldsymbol{\alpha}_{0x}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\begin{bmatrix} +\underset{(1,1)}{\alpha_{1y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{1x} } +\end{bmatrix} (\#eq:alphapart)$$ + +$$\label{eq:coeffpart} +\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,K+1)}{\mathbf{a}'_{(y)}} \\ \underset{(K,K+1)}{\mathbf{A}_{(x)}} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}'_{yx}} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx} } +\end{bmatrix}, +\enspace \enspace \enspace +\boldsymbol{\Gamma}(L)=\begin{bmatrix} +\underset{(1,K+1)}{\boldsymbol{\gamma}'_{y}(L)} \\ \underset{(K,K+1)}{\boldsymbol{\Gamma}_{(x)}(L)} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{\gamma_{yy}(L)} & \underset{(1,K)}{\boldsymbol{\gamma}'_{yx}(L)} \\ \underset{(K,1)}{\boldsymbol{\gamma}_{xy}(L)} & \underset{(K,K)}{\boldsymbol{\Gamma}_{xx}(L) } +\end{bmatrix} (\#eq:coeffpart)$$ +, and substituting \@ref(eq:epsilonx) into \@ref(eq:vecm) yields +$$\label{eq:condsys} +\Delta\mathbf{z}_t=\begin{bmatrix} +\Delta y_{t} \\ \Delta\mathbf{x}_{t} +\end{bmatrix}=\begin{bmatrix} +\alpha_{0.y} \\ \boldsymbol{\alpha}_{0x} +\end{bmatrix} + \begin{bmatrix} +\alpha_{1.y} \\ \boldsymbol{\alpha}_{1x} +\end{bmatrix}t- \begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}\begin{bmatrix} +y_{t-1} \\ \mathbf{x}_{t-1} +\end{bmatrix} + \begin{bmatrix} +\boldsymbol{\gamma}'_{y.x}(L) \\ \boldsymbol{\Gamma}_{(x)}(L) +\end{bmatrix}\Delta\mathbf{z}_t+\begin{bmatrix} +\boldsymbol{\omega}'\Delta\mathbf{x}_{t} \\ \mathbf{0} +\end{bmatrix}+\begin{bmatrix} +{\nu}_{yt} \\ \boldsymbol{\varepsilon}_{xt} +\end{bmatrix} (\#eq:condsys)$$ +, where +$$\label{eq:condintt} +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x} (\#eq:condintt)$$ + +$$\label{eq:condAmat} 
+\mathbf{a}'_{(y).x}=\mathbf{a}'_{(y)}-\boldsymbol{\omega}'\mathbf{A}_{(x)}, \enspace \enspace \enspace \boldsymbol{\gamma}'_{y.x}(L)=\boldsymbol{\gamma}_{y}'(L)-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x)}(L). (\#eq:condAmat)$$
+According to \@ref(eq:condsys), the long-run relationships of the VECM
+are now included in the matrix
+$$\label{eq:condAmat2}
+\begin{bmatrix}
+\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)}
+\end{bmatrix}=\begin{bmatrix}
+a_{yy}-\boldsymbol{\omega}'\mathbf{a}_{xy} & \mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{a}_{xy}&\mathbf{A}_{xx}
+\end{bmatrix}. (\#eq:condAmat2)$$
+To rule out the presence of long-run relationships between $y_{t}$ and
+$\mathbf{x}_{t}$ in the marginal model, the $\mathbf{x}_{t}$ variables
+are assumed to be exogenous with respect to the ARDL parameters, that is
+$\mathbf{a}_{xy}$ is assumed to be a null vector. Accordingly, the
+long-run matrix in \@ref(eq:condAmat2) becomes
+$$\label{eq:cond}
+\widetilde{\mathbf{A}}=\begin{bmatrix}a_{yy} & \mathbf{a}'_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{0} & \mathbf{A}_{xx}
+\end{bmatrix}=\begin{bmatrix}
+a_{yy} & \widetilde{\mathbf{a}}_{y.x}' \\ \mathbf{0}&\mathbf{A}_{xx}\end{bmatrix} =\begin{bmatrix}
+b_{yy}c_{yy} & b_{yy}\mathbf c_{yx}'+(\mathbf{b}_{yx}'-\boldsymbol{\omega}'\mathbf{B}_{xx})\mathbf{C}_{xx}' \\ \mathbf{0}& \mathbf{B}_{xx}\mathbf{C}_{xx}'\end{bmatrix}. 
(\#eq:cond)$$ +After these algebraic transformations, the ARDL equation for +$\Delta y_{t}$ can be rewritten as in \@ref(eq:ardl).\ +In light of the factorization \@ref(eq:factt) of the matrix +$\mathbf{A}(1)$, the long-run equilibrium vector $\boldsymbol{\theta}$ +can be expressed as +$$\label{eq:thetat} +\boldsymbol{\theta}'= +-\frac{1}{a_{yy}}\underset{(1,r+1)}{\left[b_{yy}\enspace\enspace(\mathbf{b}_{yx}-\boldsymbol{\omega}'\mathbf{B}_{xx})\right]} +\underset{(r+1,K)}{\begin{bmatrix} \mathbf{c}'_{yx}\\ \mathbf{C}'_{xx} \end{bmatrix}}, (\#eq:thetat)$$ +where +$\widetilde{\mathbf{a}}_{y.x}=\mathbf{a}_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx}$.\ +Bearing in mind that $\mathbf{C}'_{xx}$ is the cointegrating matrix for +the variables $\mathbf{x}_t$, the equation \@ref(eq:thetat) leads to the +following conclusion +$$\label{eq:rank} +rk\begin{bmatrix}\mathbf{c}'_{yx}\\ \mathbf{C}'_{xx}\end{bmatrix}=\begin{cases} +r \to \enspace y_{t} \sim I(0) \\ +r+1 \to \enspace y_{t} \sim I(1) +\end{cases}, (\#eq:rank)$$ +where $r=rk(\mathbf{A}_{xx})$ and $0 \leq r\leq K$.\ + +### Section B - Intercept and trend specifications {#sec:appendixb} + + @pesaran2001 introduced five different specifications for the ARDL +model, which depend on the deterministic components that can be absent +or restricted to the values they assume in the parent VAR model. 
In this +connection, note that, in light of \@ref(eq:vecmint), the drift and the +trend coefficient in the conditional VECM \@ref(eq:condsys) are defined +as +$$\boldsymbol{\alpha_{0}}^{c}=\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} , \enspace \enspace +\boldsymbol{\alpha_{1}}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},$$ +where $\widetilde{\mathbf{A}}(1)$ is as in \@ref(eq:cond) and +$\widetilde{\boldsymbol{\Gamma}}(1)=\begin{bmatrix} \boldsymbol{\gamma}_{y.x}'(1) \\ \boldsymbol{\Gamma}_{(x)}(1) \end{bmatrix}$.\ +Accordingly, after partitioning the mean and the drift vectors as +$$\underset{(1,K+1)}{\boldsymbol{\mu}'}=[\underset{(1,1)}{\mu_{y}},\underset{(1,K)}{\boldsymbol{\mu}_x'}], \enspace \underset{(1,K+1)}{\boldsymbol{\eta}'}=[\underset{(1,1)}{\eta_{y}},\underset{(1,K)}{\boldsymbol{\eta}_{x}'}],$$ +the intercept and the coefficient of the trend of the ARDL equation +\@ref(eq:ardl) are defined as +$$\alpha_{0.y}^{EC} += \mathbf{e}_{1}'\boldsymbol{\alpha_{0}}^{c} +=a_{yy}\mu_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\mu}_{x}+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}=a_{yy}(\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x})+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}, \enspace +\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}}'_{y.x}}{a_{yy}}$$ + +$$\enspace \enspace \alpha_{1.y}^{EC}=\mathbf{e}_{1}'\boldsymbol{\alpha_{1}}^{c}= +a_{yy}\eta_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\eta}_{x}=a_{yy}(\eta_{y}-\boldsymbol{\theta'}\boldsymbol{\eta}_{x}),$$ +where $\mathbf{e}_{1}$ is the $K+1$ first elementary vector.\ +In the error correction term +$$EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}$$ +the parameters that partake in the calculation of intercept and trend +are +$$\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}.$$ +In particular, these latter are not null 
only when they are assumed to +be restricted in the model specification.\ +The five specifications proposed by  @pesaran2001 are + +1. *No intercept and no trend*: + $$\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf{0}.$$ + It follows that + $$\theta_{0}=\theta_{1}=\alpha_{0.y}=\alpha_{1.y}=0.$$ + Accordingly, the model is as in \@ref(eq:case1). + +2. *Restricted intercept and no trend*: + $$\boldsymbol{\alpha}_{0}^{c}= \widetilde{\mathbf{A}}(1)\boldsymbol{\mu},\enspace \enspace \boldsymbol{\eta}=\mathbf{0},$$ + which entails + $$\theta_0 \neq 0 \enspace\enspace\alpha_{0.y}^{EC}=a_{yy}\theta_{0}, \enspace \enspace + \alpha_{0.y}=\theta_{1}=\alpha_{1.y}=0.$$ + Therefore, the intercept stems from the EC term of the ARDL + equation. The model is specified as in \@ref(eq:case2) + +3. *Unrestricted intercept and no trend*: + $$\boldsymbol{\alpha}_{0}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\mu}, \enspace \enspace \boldsymbol{\eta}=\mathbf{0}.$$ + Thus, + $$\alpha_{0.y}\neq 0,\enspace \enspace \theta_{0}=\theta_{1}=\alpha_{1.y}=0.$$ + Accordingly, the model is as in \@ref(eq:case3). + +4. *Unrestricted intercept, restricted trend*: + $$\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta}\enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta},$$ + which entails + $$\alpha_{0.y} \neq 0,\enspace \enspace + \theta_{0}=0 \enspace \enspace + \theta_{1}\neq 0\enspace\enspace + \alpha_{1.y}^{EC}=a_{yy}\theta_1\enspace\enspace + \alpha_{1.y}=0.$$ + Accordingly, the trend stems from the EC term of the ARDL equation. + The model is as in \@ref(eq:case4). + +5. 
*Unrestricted intercept, unrestricted trend*: + $$\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} \enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}.$$ + Accordingly, + $$\alpha_{0.y} \neq 0 \enspace \enspace\alpha_{1.y} \neq 0, \enspace \enspace\theta_{0}=\theta_{1}=0.$$ + The model is as in \@ref(eq:case5). +::::::::::::::: + +[^1]: The `R` packages, either used in the creation of + [**bootCT**](https://CRAN.R-project.org/package=bootCT) or employed + in the analyses presented in this paper, are + [**magrittr**](https://CRAN.R-project.org/package=magrittr) + [@magrittr], [**gtools**](https://CRAN.R-project.org/package=gtools) + [@gtools], [**pracma**](https://CRAN.R-project.org/package=pracma) + [@pracma], [**Rcpp**](https://CRAN.R-project.org/package=Rcpp) + [@RCPP], + [**RcppArmadillo**](https://CRAN.R-project.org/package=RcppArmadillo) + [@RcppArmadillo2023], + [**Rmisc**](https://CRAN.R-project.org/package=Rmisc) [@Rmisc], + [**dynamac**](https://CRAN.R-project.org/package=dynamac) + [@PKGDYNAMAC], [**ARDL**](https://CRAN.R-project.org/package=ARDL) + [@PKGARDL], [**aod**](https://CRAN.R-project.org/package=aod) + [@aod], [**vars**](https://CRAN.R-project.org/package=vars) and + [**urca**](https://CRAN.R-project.org/package=urca) + [@PKGVARS; @urca], + [**aTSA**](https://CRAN.R-project.org/package=aTSA) [@PKGATSA], + [**tseries**](https://CRAN.R-project.org/package=tseries) + [@tseries], + [**reshape2**](https://CRAN.R-project.org/package=reshape2), + [**ggplot2**](https://CRAN.R-project.org/package=ggplot2) and + [**stringr**](https://CRAN.R-project.org/package=stringr) + [@reshape2; @ggplot; @stringr], + [**tidyverse**](https://CRAN.R-project.org/package=tidyverse) and + [**dplyr**](https://CRAN.R-project.org/package=dplyr) + [@tidyverse; @dplyr]. 
+ +[^2]: If the explanatory variables are stationary $\mathbf{A}_{xx}$ is + non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are + integrated but without cointegrating relationship $\mathbf{A}_{xx}$ + is a null matrix. + +[^3]: The knowledge of the rank of the cointegrating matrix is necessary + to overcome this impasse. + +[^4]: The latter is introduced in the ARDL equation by the operation of + conditioning $y_t$ on the other variables $\mathbf{x}_t$ of the + model + +[^5]: In fact, as + $\boldsymbol{\omega}'\mathbf{A}_{xx}\mathbf{x}_{t} \approx I(0)$, + the conclusion that $y_{t}\approx I(0)$ must hold. This in turn + entails that no cointegration occurs between $y_t$ and + $\mathbf{x}_{t}$. + +[^6]: If the explanatory variables are stationary $\mathbf{A}_{xx}$ is + non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are + integrated but without cointegrating relationship $\mathbf{A}_{xx}$ + is a null matrix diff --git a/_articles/RJ-2024-003/RJwrapper.tex b/_articles/RJ-2024-003/RJwrapper.tex new file mode 100644 index 0000000000..e58380a3c5 --- /dev/null +++ b/_articles/RJ-2024-003/RJwrapper.tex @@ -0,0 +1,47 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} +\usepackage{lscape} +\usepackage{bm} +\usepackage{mathrsfs} +\usepackage{graphicx} +\usepackage{tablefootnote} +\usepackage{footmisc} +\usepackage{pdflscape} +\usepackage{enumerate} +\usepackage[shortlabels]{enumitem} +\usepackage{afterpage} +\usepackage{makecell} +\usepackage{capt-of}% or use the larger `caption` package +\usepackage{subfig} +\usepackage{listings} +\usepackage{adjustbox} +\usepackage{multirow} +\usepackage{booktabs} + +\usepackage{tikz} +\usetikzlibrary{matrix,calc,shapes,arrows} +\usetikzlibrary{shapes.multipart} + + +%% load any required packages FOLLOWING this line + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed 
research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{39} + +%% replace RJtemplate with your article +\begin{article} + \input{vacca-zoia-bertelli} +\end{article} + +\end{document} diff --git a/_articles/RJ-2024-003/Rlogo-5.png b/_articles/RJ-2024-003/Rlogo-5.png new file mode 100644 index 0000000000..077505788a Binary files /dev/null and b/_articles/RJ-2024-003/Rlogo-5.png differ diff --git a/_articles/RJ-2024-003/figures/Rlogo-5.png b/_articles/RJ-2024-003/figures/Rlogo-5.png new file mode 100644 index 0000000000..077505788a Binary files /dev/null and b/_articles/RJ-2024-003/figures/Rlogo-5.png differ diff --git a/_articles/RJ-2024-003/figures/sim_ts.pdf b/_articles/RJ-2024-003/figures/sim_ts.pdf new file mode 100644 index 0000000000..848c4c25a8 Binary files /dev/null and b/_articles/RJ-2024-003/figures/sim_ts.pdf differ diff --git a/_articles/RJ-2024-003/figures/sim_ts.png b/_articles/RJ-2024-003/figures/sim_ts.png new file mode 100644 index 0000000000..363dc63710 Binary files /dev/null and b/_articles/RJ-2024-003/figures/sim_ts.png differ diff --git a/_articles/RJ-2024-003/figures/sim_ts_D2.pdf b/_articles/RJ-2024-003/figures/sim_ts_D2.pdf new file mode 100644 index 0000000000..1610338fbb Binary files /dev/null and b/_articles/RJ-2024-003/figures/sim_ts_D2.pdf differ diff --git a/_articles/RJ-2024-003/figures/sim_ts_D2.png b/_articles/RJ-2024-003/figures/sim_ts_D2.png new file mode 100644 index 0000000000..875cd37b70 Binary files /dev/null and b/_articles/RJ-2024-003/figures/sim_ts_D2.png differ diff --git a/_articles/RJ-2024-003/figures/tsgraph.pdf b/_articles/RJ-2024-003/figures/tsgraph.pdf new file mode 100644 index 0000000000..e7d0f13a12 Binary files /dev/null and b/_articles/RJ-2024-003/figures/tsgraph.pdf differ diff --git a/_articles/RJ-2024-003/figures/tsgraph.png b/_articles/RJ-2024-003/figures/tsgraph.png new file mode 100644 index 0000000000..1f20cee35f Binary files /dev/null and 
b/_articles/RJ-2024-003/figures/tsgraph.png differ diff --git a/_articles/RJ-2024-003/figures/tsgraph2.pdf b/_articles/RJ-2024-003/figures/tsgraph2.pdf new file mode 100644 index 0000000000..929d2c108d Binary files /dev/null and b/_articles/RJ-2024-003/figures/tsgraph2.pdf differ diff --git a/_articles/RJ-2024-003/figures/tsgraph2.png b/_articles/RJ-2024-003/figures/tsgraph2.png new file mode 100644 index 0000000000..42ba058f7f Binary files /dev/null and b/_articles/RJ-2024-003/figures/tsgraph2.png differ diff --git a/_articles/RJ-2024-003/sim_ts.pdf b/_articles/RJ-2024-003/sim_ts.pdf new file mode 100644 index 0000000000..848c4c25a8 Binary files /dev/null and b/_articles/RJ-2024-003/sim_ts.pdf differ diff --git a/_articles/RJ-2024-003/sim_ts_D2.pdf b/_articles/RJ-2024-003/sim_ts_D2.pdf new file mode 100644 index 0000000000..1610338fbb Binary files /dev/null and b/_articles/RJ-2024-003/sim_ts_D2.pdf differ diff --git a/_articles/RJ-2024-003/tikz/figflowchart.svg b/_articles/RJ-2024-003/tikz/figflowchart.svg new file mode 100644 index 0000000000..dcc2454d50 --- /dev/null +++ b/_articles/RJ-2024-003/tikz/figflowchart.svg @@ -0,0 +1,9351 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-003/tikz/figflowchart.tex b/_articles/RJ-2024-003/tikz/figflowchart.tex new file mode 100644 index 0000000000..333b7e5599 --- /dev/null +++ b/_articles/RJ-2024-003/tikz/figflowchart.tex @@ -0,0 +1,191 @@ +\documentclass{standalone} +\usepackage{xcolor} +\usepackage{verbatim} +\usepackage[T1]{fontenc} +\usepackage{graphics} +\usepackage{hyperref} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\R}{R} +\newcommand{\pkg}[1]{#1} +\newcommand{\CRANpkg}[1]{\pkg{#1}}% +\newcommand{\BIOpkg}[1]{\pkg{#1}} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} +\usepackage{lscape} +\usepackage{bm} +\usepackage{mathrsfs} +\usepackage{graphicx} +\usepackage{tablefootnote} +\usepackage{footmisc} +\usepackage{pdflscape} +\usepackage{enumerate} +\usepackage[shortlabels]{enumitem} +\usepackage{afterpage} +\usepackage{makecell} +\usepackage{capt-of}% or use the larger `caption` package +\usepackage{subfig} +\usepackage{listings} +\usepackage{adjustbox} +\usepackage{multirow} +\usepackage{booktabs} +\usepackage{tikz} +\usetikzlibrary{matrix,calc,shapes,arrows} +\usetikzlibrary{shapes.multipart} +\tikzset{ + treenode/.style = {shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 4cm, + inner sep=2ex, + anchor=center}, + 
input/.style = {align=center, text width=4.5cm}, + decision/.style = {treenode, diamond, inner sep=3pt}, + action/.style = {treenode, circle, inner sep=1pt, + text width = 2.5cm}, + root/.style = {treenode}, + env/.style = {treenode}, + ginish/.style = {root}, + dummy/.style = {circle,draw}, + ar/.style={->,>=latex} +} +\tikzset{connect/.style={rounded corners=#1, + to path= ($(\tikztostart)!-#1!(\tikztotarget)!-#1!-90:(\tikztotarget)$) -- ($(\tikztotarget)!-#1!(\tikztostart)!-#1!90:(\tikztostart)$) -- + ($(\tikztotarget)!-#1!(\tikztostart)!#1!90:(\tikztostart)$) -- ($(\tikztostart)!-#1!(\tikztotarget)!-#1!90:(\tikztotarget)$) -- cycle (\tikztotarget) +}} +\tikzset{connect/.default=4mm} +\begin{document} +\nopagecolor +\centering + +\resizebox{\textwidth}{!}{% +\begin{tikzpicture}[-latex][scale=0.3] + \matrix (chart) + [ + matrix of nodes, + column sep = 2.5em, + row sep = 1ex, + row 2/.style = {nodes={decision}}, + row 5/.style = {nodes={env}} + ] + { %first row + |[input]| {\small VAR/VECM input\\ $\boldsymbol \mu$, $\boldsymbol \eta$, $\boldsymbol\alpha_0$, $\boldsymbol\alpha_1$, \texttt{case}} + & + |[input]| {\small VECM/ARDL input\\ $\mathbf A_{xx},\mathbf a_{yx},a_{yy},\boldsymbol{\Gamma}_j$}& + \\ + %second row + |[treenode]|{\scriptsize VECM + Intercept and trend\\ + \phantom{x}\\ + CASE I:\\ $\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf 0\rightarrow + \boldsymbol\alpha_0 = \boldsymbol\alpha_1 = \mathbf 0$ \\ + \phantom{x}\\ + CASE II:\\ + $\boldsymbol{\mu}$ input, $\boldsymbol\eta=\mathbf 0$, $\boldsymbol\alpha_{0} = \mathbf A(1) \boldsymbol{\mu}$, $\boldsymbol\alpha_1 = \mathbf 0$\\ + \phantom{x}\\ + CASE III:\\ + $\boldsymbol\eta=\mathbf 0 $ \\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\alpha}_1 = \mathbf 0$\\ + \phantom{x}\\ + CASE IV:\\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\eta}$ input, $\boldsymbol{\alpha}_1 =\mathbf A(1)\boldsymbol{\eta}$\\ + \phantom{x}\\ + CASE V:\\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\alpha}_1$ input + 
\normalsize}& + |[treenode]| {\small Long-run VECM matrix + $\mathbf A = + \begin{bmatrix} {a_{yy}} & {\mathbf{a}_{yx}'} \\ + {\mathbf 0} & {\mathbf{A}_{xx}} + \end{bmatrix}$.\\ + \phantom{x}\\ + Short-run VECM matrices + $\boldsymbol\Gamma_j$ \\ + $\boldsymbol\Gamma(1) = \mathbf{I_K}-\sum_{j=1}^p\boldsymbol\Gamma_j$}& + |[treenode]|{\scriptsize ARDL + Intercept and trend\\ + \phantom{x}\\ + CASE I:\\ $\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf 0\rightarrow$\\ + $\theta_0=\alpha_{0.y} = \theta_1=\alpha_{1.y} = 0$\\ + \phantom{x}\\ + CASE II:\\ + $\theta_0\neq 0 \enspace \alpha_{0.y} = 0$ (Intercept in $EC$)\\ + $\boldsymbol\eta=\mathbf 0 \rightarrow \theta_1 =\alpha_{1.y}= 0$\\ + \phantom{x}\\ + CASE III:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\boldsymbol\eta=\mathbf 0 \rightarrow \theta_1 =\alpha_{1.y}= 0$\\ + \phantom{x}\\ + CASE IV:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\theta_1\neq 0 \enspace \alpha_{1.y} = 0$ (Trend in $EC$)\\ + \phantom{x}\\ + CASE V:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\alpha_{1.y}=\alpha_{1y}-\boldsymbol\omega'\boldsymbol\alpha_{1x}$ $(\theta_1 = 0)$ + \normalsize} + \\ + %third row + |[action]| {\small $\boldsymbol{\Sigma}$ input.\\ Error generation\\ + $\mathbf u_t'\sim N_{K+1}(\mathbf 0,\boldsymbol\Sigma)$ + \normalsize}& + |[action]| {\small Conditioning\\ + $\boldsymbol\omega'= + \boldsymbol\sigma_{yx}'\boldsymbol\Sigma_{xx}^{-1}$ + \normalsize}& + |[treenode]|{\small $\mathbf {\tilde{a}}_{y.x}'=\mathbf a_{yx}'-\boldsymbol\omega'\mathbf A_{xx}$\\ + $\widetilde{\mathbf A} = + \begin{bmatrix} + {a_{yy}} & \mathbf{\tilde{a}}'_{y.x}\\ + {\mathbf 0} & {\mathbf{A}_{xx}} + \end{bmatrix}$\\ + $\boldsymbol\gamma_{y.x,j}= + \boldsymbol\gamma_{yx}-\boldsymbol\omega'\boldsymbol\Gamma_{(x),j}$\\ + $\widetilde{\boldsymbol\Gamma}_j = + \begin{bmatrix} + \boldsymbol{\gamma}_{y.x,j}\\ + 
\boldsymbol\Gamma_{(x),j} + \end{bmatrix}$\\ + $\nu_{yt}=\varepsilon_{yt}-\boldsymbol\omega'\boldsymbol\varepsilon_{xt}$ + \normalsize}\\ + |[input]| {\small Other input:\\ + \texttt{nobs}, \texttt{burn.in}} + & + |[action]| {\small $\Delta \mathbf x_t$ via \eqref{eq:marg}\\ + $\mathbf x_t = \Delta \mathbf x_t + \mathbf x_{t-1} $\\ + $\Delta y_t$ via \eqref{eq:ardl}\\ + $y_t = \Delta y_t + y_{t-1} $\\ + \normalsize}&\\ + }; + \draw[thick] + (chart-1-1) -> (chart-2-1); + \draw[thick] + (chart-1-2) -> (chart-2-2); + \draw[thick] + (chart-2-2) -> (chart-2-1); + \draw[thick] + (chart-2-1) -> (chart-3-2); + \draw[thick] + (chart-2-2) -> (chart-3-2); + \draw[thick] + (chart-3-1) -> (chart-3-2); + \draw[thick] + (chart-3-2) -> (chart-2-3); + \draw[thick] + (chart-3-2) -> (chart-3-3); + \begin{scope}[transform canvas={yshift=1.7em}] + \draw[red,dashed,ultra thick] (chart-2-3) to[connect=29.5mm,rounded corners=2mm] (chart-3-3); + \end{scope} + \begin{scope}[transform canvas={yshift=1em}] + \draw[blue,dashed,ultra thick] (chart-2-1) to[connect=27mm,rounded corners=2mm] (chart-3-1); + \end{scope} + \draw[blue,ultra thick,shorten < = 1.85cm] (chart-3-1) edge node[left=0.75cm,pos=0.67]{\small Unconditional parameters for $\Delta\mathbf x_t$\normalsize} (chart-4-2); + \draw[red,ultra thick,shorten < = 0.75cm] (chart-3-3) edge node[right=1cm,pos=0.5]{\small Conditional parameters for $\Delta y_t$\normalsize} (chart-4-2); + \draw[thick] + (chart-3-2) -> (chart-4-2); + \draw[thick] (chart-4-2) edge [in=-20, out=20,looseness=3,right=0.2cm] node[right=0.2cm]{\small Until \texttt{nobs+burn.in}. 
+ Discard \texttt{burn.in}\normalsize} (chart-4-2); + \draw[thick] + (chart-4-1) -> (chart-4-2); + \end{tikzpicture} +} +\end{document} diff --git a/_articles/RJ-2024-003/tikz/figflowchart_ardl.svg b/_articles/RJ-2024-003/tikz/figflowchart_ardl.svg new file mode 100644 index 0000000000..31e322a26a --- /dev/null +++ b/_articles/RJ-2024-003/tikz/figflowchart_ardl.svg @@ -0,0 +1,14168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-003/tikz/figflowchart_ardl.tex b/_articles/RJ-2024-003/tikz/figflowchart_ardl.tex new file mode 100644 index 0000000000..82a60cc18a --- /dev/null +++ b/_articles/RJ-2024-003/tikz/figflowchart_ardl.tex @@ -0,0 +1,251 @@ +\documentclass{standalone} +\usepackage{xcolor} +\usepackage{verbatim} +\usepackage[T1]{fontenc} +\usepackage{graphics} +\usepackage{hyperref} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\R}{R} +\newcommand{\pkg}[1]{#1} +\newcommand{\CRANpkg}[1]{\pkg{#1}}% +\newcommand{\BIOpkg}[1]{\pkg{#1}} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} +\usepackage{lscape} +\usepackage{bm} +\usepackage{mathrsfs} +\usepackage{graphicx} +\usepackage{tablefootnote} +\usepackage{footmisc} +\usepackage{pdflscape} +\usepackage{enumerate} +\usepackage[shortlabels]{enumitem} +\usepackage{afterpage} +\usepackage{makecell} +\usepackage{capt-of}% or use the larger `caption` package +\usepackage{subfig} +\usepackage{listings} +\usepackage{adjustbox} +\usepackage{multirow} +\usepackage{booktabs} +\usepackage{tikz} +\usetikzlibrary{matrix,calc,shapes,arrows} +\usetikzlibrary{shapes.multipart} +\tikzset{ + treenode/.style = {shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 4.1cm, + inner sep=2ex, + anchor=center}, + treenodel/.style = 
{shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 2.2cm, + inner sep=2ex, + anchor=center}, + input/.style = {align=center, text width=4.5cm}, + decision/.style = {treenode, diamond, inner sep=2pt, + text width=2cm}, + decisionx/.style = {treenode, diamond, inner sep=2pt, + text width=1.5cm}, + decisiond/.style = {treenode, diamond, dashed, inner sep=3pt, + text width=2cm}, + treeuc/.style = {treenode,draw=blue,thick}, + treec/.style = {treenode,draw=red,thick}, + treeg/.style = {treenode,draw=green!60,thick,fill=green!5}, + action/.style = {treenode, circle, inner sep=1pt, + text width = 3cm}, + root/.style = {treenode}, + env/.style = {treenode}, + ginish/.style = {root}, + dummy/.style = {circle,draw}, + ar/.style={->,>=latex} +} +\tikzset{connect/.style={rounded corners=#1, + to path= ($(\tikztostart)!-#1!(\tikztotarget)!-#1!-90:(\tikztotarget)$) -- ($(\tikztotarget)!-#1!(\tikztostart)!-#1!90:(\tikztostart)$) -- + ($(\tikztotarget)!-#1!(\tikztostart)!#1!90:(\tikztostart)$) -- ($(\tikztostart)!-#1!(\tikztotarget)!-#1!90:(\tikztotarget)$) -- cycle (\tikztotarget) +}} +\tikzset{connect/.default=4mm} +\begin{document} +\nopagecolor +\centering +\hspace*{-2cm} +\resizebox{1000em}{!}{% +\begin{tikzpicture}[-latex][scale=0.3] + \matrix (chart)[ampersand replacement=\#] + [ + matrix of nodes, + column sep = 1.7em, + row sep = 1.5ex, + row 2/.style = {nodes={decision}}, + row 5/.style = {nodes={env}} + ] + { %first row + \# + |[input]| {\small \texttt{case}, \texttt{fix.vecm},\\ + \texttt{info.vecm}, \texttt{maxlag}}\# + |[input]| {\small \texttt{data},\\ + \texttt{xvar}, \texttt{yvar}}\# + |[input]|{\small{\texttt{case}, \texttt{fix.ardl},\\ + \texttt{info.ardl}, \texttt{maxlag}}} + \# + \\ + %second row + |[decision]|{\small + VECM\\Estimate} + \normalsize\# + |[treenode]| {\small + \centering + \begin{tabular}{c|c} + \multicolumn{2}{c}{VECM estimation (either)}\\ + Fixed order & \texttt{VARselect()}\\\hline + \texttt{fix.vecm} 
& \texttt{info.vecm} \\ + &\texttt{maxlag}\\ + \end{tabular}} + \normalsize\# + |[decisiond]|{\small + Compute $F_{ind}$ of \\UC ARDL + \normalsize} + \# + |[treenode]| {\small + \begin{tabular}{c|c} + \multicolumn{2}{c}{ARDL estimation (either)}\\ + Fixed order & \texttt{auto\_ardl()}\\\hline + \texttt{fix.ardl} & \texttt{info.ardl}\\ + &\texttt{maxlag}\\ + \end{tabular}} + \# + |[decision]|{\small + C ARDL\\Estimate} + \normalsize + \\ + %third row + |[decision]|{\small + Johansen test\\ + results on $\mathbf x_t$} + \normalsize\# + |[treeuc]| {\scriptsize + Estimation of the parameters:\\ + $\mathbf A$, $\boldsymbol\Gamma_j$ $(j=1,\dots,p)$\\ + $\boldsymbol\alpha_0$, $\boldsymbol\alpha_1$ based on \texttt{case}. \\ $\widehat{\boldsymbol\varepsilon}_{xt}$ obtained via \eqref{eq:resvecm}.\\ + Significant estimates of $\boldsymbol\Gamma_j$ filtered via \texttt{a.vecm}} + \normalsize\# + |[treenode]|{\scriptsize + Combine to get \\ + $\widetilde{\mathbf A} = + \begin{bmatrix} + \color{red}{a_{yy}} & \color{red} \widetilde{\mathbf{a}}'_{y.x}\\ + {\mathbf 0} & \color{blue}{\mathbf{A}}_{xx} + \end{bmatrix}$\\ + $\widetilde{\boldsymbol\Gamma}_j =\begin{bmatrix}\color{red}\boldsymbol{\gamma}_{y.x,j}\\ + \color{blue}\boldsymbol\Gamma_{(x),j} + \end{bmatrix}$\\ + \phantom{\tiny x}\\ + $\boldsymbol{\omega}$ (only in the C ARDL)\\ + $(\boldsymbol\alpha_{0}^{c})' = [\color{red}{\alpha_{0.y}}\;\color{blue}{\boldsymbol\alpha_{0x}'}]$, $(\boldsymbol\alpha_{1}^{c})' = [\color{red}{\alpha_{1.y}}\;\color{blue}{\boldsymbol\alpha_{1x}'}]$ + \normalsize} + \# + |[treec]| {\scriptsize + Estimation of:\\ + $ a_{yy}, \mathbf{a}_{y.x}$, $\boldsymbol\gamma_{y.x,j}$ $(j=1,\dots,p)$\\ + $\boldsymbol\omega$ (only in the C ARDL )\\ + $\alpha_{0.y}$,$\alpha_{1.y}$ based on \texttt{case}.\\ + Significant estimates of $\boldsymbol\gamma_{y.x,j}$ filtered via \texttt{a.ardl}} + \normalsize + \# + |[decisionx]|{\scriptsize + PSS/SMG results + in the C ARDL. 
+ Compute\\ $F_{ov}$, $t$, $F_{ind}$} + \normalsize + \\ + \# + |[treenode]|{\scriptsize + Null elements of $\widetilde{\mathbf A}$ based on $H_0$.\\ + Nullity of $\boldsymbol\alpha_0^c$ and $\boldsymbol\alpha_1^c$ based on \texttt{case}.\\ + Combine the residuals \\ + $\widehat{\mathbf u}_t = [\color{red}\widehat{\nu}_{yt}^{*}\,\color{blue}\widehat{\boldsymbol\varepsilon}_{xt}]$} + \# + |[treec]| + { $F_{ov}$ test \\ + \small$H_0: a_{yy}=0,\; \widetilde{\mathbf{a}}_{y.x}=\mathbf 0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{F_{ov}}$ via \eqref{eq:resfov} + \normalsize} + \# + |[treec]|{\small $t$-test \\$H_0: a_{yy}=0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{t}$ via \eqref{eq:rest} + \normalsize} + \# + |[treec]|{\small $F_{ind}$ test \\$H_0: \widetilde{\mathbf{a}}_{y.x}=\mathbf 0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{F_{ind}}$ via \eqref{eq:resfind} + \normalsize} + \\ + |[treenodel]| {\small Sample and \\center + from $\widehat{\mathbf U}$.\\ + Get ${\mathbf U^{(b)}}$. + \normalsize} + \# + |[treenode]|{\small $\Delta y_t^{(b)}$, $\Delta\boldsymbol x_t^{(b)}$ via (\ref{eq:resfov}-\ref{eq:rest}-\ref{eq:resfind}-\ref{eq:resvecm})\\ + $\mathbf{x}_t^{(b)}=\Delta\boldsymbol x_t^{(b)} + \mathbf{x}_{t-1}^{(b)}$. 
+ \\ + ${y}_t^{(b)}=\Delta y_t^{(b)} +y_{t-1}^{(b)}$} + \# + |[treenode]|{\small ARDL estimation under $H_0$.\\ + Get $F_{ov}^{(b),H_0}$, $t^{(b),H_0}$,\\ $F_{ind}^{(b),H_0}$ (C) and $F_{ind}^{(b),H_0}$ (UC)} + \# + |[decisionx]|{\small $c_{\alpha,T}^*$ at level \texttt{a.boot.H0}.} + \# + |[treeg]|{\small Decide comparing \\ $F_{ov}$, $t$, $F_{ind}$ each to its $c_{\alpha,T}^{*}$.\\ + \textbf{IF} $F_{ind}>c_{\alpha,F_{ind}}^{*}$ (C)\\ + \textbf{AND} $F_{ind} (chart-2-2); + \draw[thick] + (chart-1-3) -> (chart-2-4); + \draw[thick] + (chart-1-2) -> (chart-2-2); + \draw[thick] + (chart-1-4) -> (chart-2-4); + \draw[thick] + (chart-2-2) -> (chart-2-1); + \draw[thick] + (chart-2-4) -> (chart-2-5); + \draw[thick] + (chart-2-2.south west) -> (chart-3-1); + \draw[thick] + (chart-2-2) -> (chart-3-2); + \draw[thick] + (chart-2-4.south east) to node[right=0.4cm,pos=-0.1,rotate=-39]{\small Based on \texttt{case}} (chart-3-5); + \draw[ar,thick] + ([xshift=-1 cm]chart-2-4.south) to node[left=0.1cm] {\small UC} ([xshift=-1 cm] chart-3-4.north); + \draw[ar,thick] ([xshift=1 cm]chart-2-4.south) to node[right=0.1cm] {\small C} ([xshift=1cm] chart-3-4.north); + \draw[thick] (chart-2-4) edge [in=70, out=40,looseness=2,left=2cm] node[pos=0.3,right=0.1cm]{\small UC and C model\normalsize} (chart-2-4); + \draw[thick] + (chart-2-4) -> (chart-2-3); + \draw[blue,ultra thick] (chart-3-2) -> (chart-3-3); + \draw[red,ultra thick] (chart-3-4) -> (chart-3-3); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-3.north east); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-4); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-5.north west); + \draw[blue,ultra thick] (chart-3-2) -> (chart-4-2); + \draw[thick] (chart-3-3) -> (chart-4-2); + \draw[red,ultra thick, shorten < = 0.15cm] (chart-4-3) -> (chart-4-2); + + \begin{scope}[transform canvas={yshift=0em,xshift=3.4em}] + \draw[red,dashed,ultra thick] (chart-4-3.west) to[connect=13mm,rounded corners=1mm] (chart-4-5); + \end{scope} +\draw[thick] 
(chart-4-2.west) -> (chart-5-1.north); +\draw[thick] (chart-5-1) -> (chart-5-2); +\draw[thick] (chart-5-2) -> (chart-5-3); +\draw[thick] (chart-5-3) -> (chart-5-4); +\draw[ar,thick] (chart-5-3.south west) to [bend left=30] node[right=0.4cm,pos=0.2]{\small $b=1,\dots,B$}(chart-5-1.south east); + \end{tikzpicture} + } +\end{document} diff --git a/_articles/RJ-2024-003/vacca-zoia-bertelli.bib b/_articles/RJ-2024-003/vacca-zoia-bertelli.bib new file mode 100644 index 0000000000..0ba77eceec --- /dev/null +++ b/_articles/RJ-2024-003/vacca-zoia-bertelli.bib @@ -0,0 +1,761 @@ +@book{rao1997cointegration, + title={Cointegration for the applied economist}, + author={Rao, B Bhaskara}, + year={1997}, + publisher={Allied Publishers} +} +@article{haseeb2019impact, + title={The impact of renewable energy on economic well-being of Malaysia: Fresh evidence from auto regressive distributed lag bound testing approach}, + author={Haseeb, Muhammad and Abidin, Irwan Shah Zainal and Hye, Qazi Muhammad Adnan and Hartani, Nira Hariyatie}, + journal={International Journal of Energy Economics and Policy}, + volume={9}, + number={1}, + pages={269}, + year={2019}, +doi={10.32479/ijeep.7229}, + publisher={EconJournals} +} +@article{reda2020using, + title={Using the ARDL bound testing approach to study the inflation rate in Egypt}, + author={Reda, Abonazel Mohamed and Nourhan, Elnabawy}, + journal={Economic consultant}, + number={3 (31)}, + pages={24--41}, + year={2020}, +doi={10.46224/ecoc.2020.3.2}, + publisher={Общество с ограниченной ответственностью {\guillemotleft}Научно-образовательная инициатива{\guillemotright}} +} + +@article{hussain2019environmental, + title={Environmental Impact of Sectoral Energy Consumption on Economic Growth in Malaysia: Evidence from ARDL Bound Testing Approach.}, + author={Hussain, Hafezali Iqbal and Salem, Milad Abdelnabi and Rashid, Aimi Zulhazmi Abdul and Kamarudin, Fakarudin}, + journal={Ekoloji Dergisi}, + number={107}, + year={2019} +} + 
+@article{magaji2020impact, + title={The impact of price shocks on exchange rate and economic growth in Nigeria: an ARDL bound test cointegration approach}, + author={Magaji, Mustapha and Singla, Sonia}, + journal={Journal of Economics and Environment}, + volume={1}, + number={2}, + pages={24--39}, + year={2020} +} +@article{amaira2019analysis, + title={Analysis of the relationship between governance and economic growth: new evidence from tunisia an ARDL bounds testing approach}, + author={Amaira, Bouzid}, + journal={Magallat al-Tanmiyat wa-al-Siyasat al-Iqtisadiyyat}, + volume={21}, + number={1}, + pages={21--37}, + year={2019}, +doi={10.34066/0271-021-001-004}, + publisher={Arab Planning Institute} +} +@article{menegaki2019ardl, + title={The ARDL method in the energy-growth nexus field; best implementation strategies}, + author={Menegaki, Angeliki N}, + journal={Economies}, + volume={7}, + number={4}, + pages={105}, + year={2019}, +doi={10.3390/economies7040105}, + publisher={MDPI} +} +@article{yilanci2020brics, + title={Are BRICS countries pollution havens? 
Evidence from a bootstrap ARDL bounds testing approach with a Fourier function}, + author={Yilanci, Veli and Bozoklu, Seref and Gorus, Muhammed Sehid}, + journal={Sustainable Cities and Society}, + volume={55}, + pages={102035}, + year={2020}, +doi={10.1016/j.scs.2020.102035}, + publisher={Elsevier} +} + +@article{sam2019augmented, + title={An augmented autoregressive distributed lag bounds test for cointegration}, + author={Sam, Chung Yan and McNown, Robert and Goh, Soo Khoon}, + journal={Economic Modelling}, + volume={80}, + pages={130--141}, + year={2019}, +doi={10.1016/j.econmod.2018.11.001}, + publisher={Elsevier} +} +@article{khan2019effect, + title={Effect of energy consumption and economic growth on carbon dioxide emissions in Pakistan with dynamic ARDL simulations approach}, + author={Khan, Muhammad Kamran and Teng, Jian-Zhou and Khan, Muhammad Imran}, + journal={Environmental Science and Pollution Research}, + volume={26}, + pages={23480--23490}, + year={2019}, +doi={10.1007/s11356-019-05640-x}, + publisher={Springer} +} +@article{abbasi2021energy, + title={How energy consumption, industrial growth, urbanization, and CO2 emissions affect economic growth in Pakistan? A novel dynamic ARDL simulations approach}, + author={Abbasi, Kashif Raza and Shahbaz, Muhammad and Jiao, Zhilun and Tufail, Muhammad}, + journal={Energy}, + volume={221}, + pages={119793}, + year={2021}, +doi={10.1016/j.energy.2021.119793}, + publisher={Elsevier} +} +@article{nawaz2019natural, + title={Natural resources as blessings and finance-growth nexus: A bootstrap ARDL approach in an emerging economy}, + author={Nawaz, Kishwar and Lahiani, Amine and Roubaud, David}, + journal={Resources Policy}, + volume={60}, + pages={277--287}, + year={2019}, +doi={10.1016/j.resourpol.2019.01.007}, + publisher={Elsevier} +} + + + +@article{davidson1978econometric, + author = {Davidson, James E. H. and Hendry, David F. 
and Srba, Frank and Yeo, Stephen}, + title = "{Econometric Modelling of the Aggregate Time-Series Relationship Between Consumers' Expenditure and Income in the United Kingdom}", + journal = {The Economic Journal}, + volume = {88}, + number = {352}, + pages = {661-692}, + year = {1978}, + month = {12}, + doi = {10.2307/2231972} +} + + +@article{bertelli2022bootstrap, + title={Bootstrap cointegration tests in ARDL models}, + author={Bertelli, Stefano and Vacca, Gianmarco and Zoia, Maria}, + journal={Economic Modelling}, + volume={116}, + pages={105987}, + year={2022}, +doi={10.1016/j.econmod.2022.105987}, + publisher={Elsevier} +} + +@book{banerjee1993co, + title={Co-integration, error correction, and the econometric analysis of non-stationary data}, + author={Banerjee, Anindya and Dolado, Juan J and Galbraith, John W and Hendry, David}, + year={1993}, +doi={10.1093/0198288107.001.0001}, + publisher={Oxford university press} +} + +@article{banerjee1998error, + title={Error-correction mechanism tests for cointegration in a single-equation framework}, + author={Banerjee, Anindya and Dolado, Juan and Mestre, Ricardo}, + journal={Journal of time series analysis}, + volume={19}, + number={3}, + pages={267--283}, + year={1998}, +doi={ 10.1111/1467-9892.00091}, + publisher={Wiley Online Library} +} + +@article{narayan2005saving, + title={The saving and investment nexus for China: evidence from cointegration tests}, + author={Narayan, Paresh Kumar}, + journal={Applied economics}, + volume={37}, + number={17}, + pages={1979--1990}, + year={2005}, +doi={10.1080/00036840500278103}, + publisher={Taylor \& Francis} +} + + +@book{lutkepohl2005, + title={New introduction to multiple time series analysis}, + author={L{\"u}tkepohl, Helmut}, + year={2005}, +doi={10.1007/978-3-540-27752-1}, + publisher={Springer Science \& Business Media} +} + +@article{beran1988prepivoting, + title={Prepivoting test statistics: a bootstrap view of asymptotic refinements}, + author={Beran, Rudolf}, + 
journal={Journal of the American Statistical Association}, + volume={83}, + number={403}, + pages={687--697}, + year={1988}, +doi={10.1080/01621459.1988.10478649}, + publisher={Taylor \& Francis} +} + +@article{chang2003sieve, + title={A sieve bootstrap for the test of a unit root}, + author={Chang, Yoosoon and Park, Joon Y}, + journal={Journal of Time Series Analysis}, + volume={24}, + number={4}, + pages={379--400}, + year={2003}, +doi={10.1111/1467-9892.00312}, + publisher={Wiley Online Library} +} + +@book{patterson2006palgrave, + title={Palgrave handbook of econometrics}, + author={Patterson, Kerry and Mills, Terence C}, + year={2006}, + publisher={Palgrave Macmillan} +} + +@article{granger1981some, + title={Some properties of time series data and their use in econometric model specification}, + author={Granger, Clive WJ}, + journal={Journal of econometrics}, + volume={16}, + number={1}, + pages={121--130}, + year={1981}, +doi={10.1016/0304-4076(81)90079-8}, + publisher={North-Holland} +} + +@inproceedings{ko2011bootstrap, + title={A Bootstrap Granger Causality Test from Exchange Rates to Fundamentals}, + author={Ko, Hsiu-Hsin}, + booktitle={International Conference on Economics and Finance Research}, + year={2011} +} + +@article{chang2006bootstrapping, + title={Bootstrapping cointegrating regressions}, + author={Chang, Yoosoon and Park, Joon Y and Song, Kevin}, + journal={Journal of Econometrics}, + volume={133}, + number={2}, + pages={703--739}, + year={2006}, +doi={10.1016/S0304-4076(97)00043-2}, + publisher={Elsevier} +} + +@article{matsaglia1974equalities, + title={Equalities and inequalities for ranks of matrices}, + author={Marsaglia, George and Styan, George P. H.}, + journal={Linear and Multilinear Algebra}, + volume={2}, + number={3}, + pages={269--292}, + year={1974}, +doi={10.1080/03081087408817070}, + publisher={Taylor \& Francis} +} + +@article{miller1988saving, + title={Are saving and investment co-integrated?}, + author={Miller, Stephen M}, + 
journal={Economics Letters}, + volume={27}, + number={1}, + pages={31--34}, + year={1988}, +doi={10.1016/0165-1765(88)90215-7}, + publisher={Elsevier} +} + +@article{morley2006causality, + title={Causality between economic growth and immigration: An ARDL bounds testing approach}, + author={Morley, Bruce}, + journal={Economics Letters}, + volume={90}, + number={1}, + pages={72--76}, + year={2006}, +doi={10.1016/j.econlet.2005.07.008}, + publisher={Elsevier} +} + +@article{mcnown2018bootstrapping, + title={Bootstrapping the autoregressive distributed lag test for cointegration}, + author={McNown, Robert and Sam, Chung Yan and Goh, Soo Khoon}, + journal={Applied Economics}, + volume={50}, + number={13}, + pages={1509--1521}, + year={2018}, +doi={10.1080/00036846.2017.1366643}, + publisher={Taylor \& Francis} +} + +@article{nkoro2016autoregressive, + title={Autoregressive Distributed Lag (ARDL) cointegration technique: application and interpretation}, + author={Nkoro, Emeka and Uko, Aham Kelvin and others}, + journal={Journal of Statistical and Econometric Methods}, + volume={5}, + number={4}, + pages={63--91}, + year={2016} +} + +@article{davidson2005case, + title={The case against JIVE}, + author={Davidson, Russell and MacKinnon, James G}, + journal={Journal of Applied Econometrics}, + volume={21}, + number={6}, + pages={827--833}, + year={2005}, +doi={ 10.1002/jae.873}, + publisher={Wiley Online Library} +} + +@article{McNown2017, +title = {Re-examining foreign direct investment, exports, and economic growth in asian economies using a bootstrap ARDL test for cointegration}, +journal = {Journal of Asian Economics}, +volume = {51}, +pages = {12-22}, +year = {2017}, +issn = {1049-0078}, +doi={10.1016/j.asieco.2017.06.001}, +author = {Goh, Soo Khoon and Sam, Chung Yan and McNown, Robert} +} + +@article{kanioura2005critical, + title={Critical values for an F-test for cointegration in a multivariate model}, + author={Kanioura, Athina and Turner, Paul}, + journal={Applied 
Economics}, + volume={37}, + number={3}, + pages={265--270}, + year={2005}, +doi={10.1080/00036840412331315051}, + publisher={Taylor \& Francis} +} + +@article{kripfganz2020response, + title={Response Surface Regressions for Critical Value Bounds and Approximate p-values in Equilibrium Correction Models}, + author={Kripfganz, Sebastian and Schneider, Daniel C}, + journal={Oxford Bulletin of Economics and Statistics}, + volume={82}, + number={6}, + pages={1456--1481}, + year={2020}, +doi={10.1111/obes.12377}, + publisher={Wiley Online Library} +} + +@article{narayan2004crime, + title={Crime rates, male youth unemployment and real income in Australia: evidence from Granger causality tests}, + author={Narayan, Paresh Kumar and Smyth, Russell}, + journal={Applied Economics}, + volume={36}, + number={18}, + pages={2079--2095}, + year={2004}, +doi={10.1080/0003684042000261842}, + publisher={Taylor \& Francis} +} + +@article{mills2001real, + title={The real exchange rate and the output response in four EU accession countries}, + author={Mills, Terence C and Pentecost, Eric J}, + journal={Emerging Markets Review}, + volume={2}, + number={4}, + pages={418--430}, + year={2001}, +doi={10.1016/S1566-0141(01)00027-9}, + publisher={Elsevier} +} + + + +@article{pesaran2001, + title={Bounds testing approaches to the analysis of level relationships}, + author={Pesaran, M Hashem and Shin, Yongcheol and Smith, Richard J}, + journal={Journal of applied econometrics}, + volume={16}, + number={3}, + pages={289--326}, + year={2001}, +doi={10.1002/jae.616}, + publisher={Wiley Online Library} +} + +@article{nar2015, + title={The financial econometrics of price discovery and predictability}, + author={Narayan, Seema and Smyth, Russell}, + journal={International Review of Financial Analysis}, + volume={42}, + pages={380--393}, + year={2015}, +doi={10.1016/j.irfa.2015.09.003}, + publisher={Elsevier} +} + +@article{li2000bootstrapping, + title={On bootstrapping regressions with unit root 
processes}, + author={Li, Hongyi and Xiao, Zhijie}, + journal={Statistics \& probability letters}, + volume={48}, + number={3}, + pages={261--267}, + year={2000}, +doi={10.1016/S0167-7152(00)00005-5}, + publisher={Elsevier} +} + +@article{kremers1992power, + title={The power of cointegration tests}, + author={Kremers, Jeroen JM and Ericsson, Neil R and Dolado, Juan J}, + journal={Oxford bulletin of economics and statistics}, + volume={54}, + number={3}, + pages={325--348}, + year={1992}, + doi={10.1111/j.1468-0084.1992.tb00005.x}, + publisher={Wiley Online Library} +} + +@article{johansen1990maximum, + title={Maximum likelihood estimation and inference on cointegration—with applications to the demand for money}, + author={Johansen, S{\o}ren and Juselius, Katarina}, + journal={Oxford Bulletin of Economics and statistics}, + volume={52}, + number={2}, + pages={169--210}, +doi={ 10.1111/j.1468-0084.1990.mp52002003.x}, + year={1990} +} +@article{johansen1992cointegration, + title={Cointegration in partial systems and the efficiency of single-equation analysis}, + author={Johansen, S{\o}ren}, + journal={Journal of econometrics}, + volume={52}, + number={3}, + pages={389--402}, + year={1992}, +doi={10.1016/0304-4076(92)90019-N}, + publisher={Elsevier} +} + +@article{harris1998small, + title={Small sample testing for cointegration using the bootstrap approach}, + author={Harris, Richard ID and Judge, Guy}, + journal={Economics Letters}, + volume={58}, + number={1}, + pages={31--37}, + year={1998}, +doi={10.1016/S0165-1765(97)00275-9}, + publisher={Elsevier} +} + +@article{ferretti1996unit, + title={Unit root bootstrap tests for AR (1) models}, + author={Ferretti, Nelida and Romo, Juan}, + journal={Biometrika}, + volume={83}, + number={4}, + pages={849--860}, + year={1996}, +doi={10.1093/biomet/83.4.849}, + publisher={Oxford University Press} +} + +@article{engle1987co, + title={Co-integration and error correction: representation, estimation, and testing}, + author={Engle, 
Robert F and Granger, Clive WJ}, + journal={Econometrica: journal of the Econometric Society}, + pages={251--276}, + year={1987}, +doi={10.2307/1913236}, + publisher={JSTOR} +} + +@article{engleyoo87, +title = {Forecasting and testing in co-integrated systems}, +journal = {Journal of Econometrics}, +volume = {35}, +number = {1}, +pages = {143-159}, +year = {1987}, +doi = {10.1016/0304-4076(87)90085-6}, +author = {Robert F. Engle and Byung Sam Yoo} +} + +@INPROCEEDINGS{Mackinnon91, + author = {James G. MacKinnon}, + title = {Critical values for cointegration tests}, + booktitle = {Long-Run Economic Relationships: Readings in Cointegration}, + editor = {Robert F. Engle and Clive W. J. Granger}, + year = {1991}, + publisher = {Oxford University Press} +} + +@article{gabriel2002, + title={A simple method of testing for cointegration subject to multiple regime changes}, + author={Gabriel, Vasco J and Psaradakis, Zacharias and Sola, Martin}, + journal={Economics Letters}, + volume={76}, + number={2}, + pages={213--221}, + year={2002}, + publisher={Elsevier} +} + +@article{cook2006power, + title={The power of single equation tests for cointegration}, + author={Cook, Steven}, + journal={Applied Economics Letters}, + volume={13}, + number={5}, + pages={265--267}, + year={2006}, +doi={10.1080/13504850500398534}, + publisher={Taylor \& Francis} +} + + +@book{maddala1998, + title={Unit roots, cointegration, and structural change}, + author={Maddala, Gangadharrao S and Kim, In-Moo}, + year={1998}, +doi={10.1017/CBO9780511751974}, + publisher={Cambridge University Press} +} + +@article{arranz2000, +author = {Arranz, Miguel A. 
and Escribano, Alvaro}, +title = {Cointegration Testing Under Structural Breaks: A Robust Extended Error Correction Model}, +journal = {Oxford Bulletin of Economics and Statistics}, +volume = {62}, +number = {1}, +pages = {23-52}, +doi = {10.1111/1468-0084.00158}, +year = {2000} +} +@article{ericsson2002, + title={Distributions of error correction tests for cointegration}, + author={Ericsson, Neil R and MacKinnon, James G}, + journal={The Econometrics Journal}, + volume={5}, + number={2}, + pages={285--318}, + year={2002}, +doi={10.1111/1368-423X.00085}, + publisher={Oxford University Press Oxford, UK} +} +@article{johansen1991, + title={Estimation and hypothesis testing of cointegration vectors in Gaussian vector autoregressive models}, + author={Johansen, S{\o}ren}, + journal={Econometrica: journal of the Econometric Society}, + pages={1551--1580}, + year={1991}, +doi={10.2307/2938278}, + publisher={JSTOR} +} + +@Manual{RSOFT, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2022}, + url = {https://www.R-project.org/}, + } + +@Book{RCPP, + title = {Seamless {R} and {C++} Integration with {Rcpp}}, + author = {Dirk Eddelbuettel}, + publisher = {Springer}, + address = {New York}, + year = {2013}, + note = {ISBN 978-1-4614-6867-7}, + doi = {10.1007/978-1-4614-6868-4}, + } + + @Manual{PKGARDL, + title = {{ARDL}: ARDL, ECM and Bounds-Test for Cointegration}, + author = {Kleanthis Natsiopoulos and Nickolaos Tzeremes}, + year = {2021}, + note = {R package version 0.1.1}, + url = {https://CRAN.R-project.org/package=ARDL}, + } + + @Article{PKGVARS, + title = {VAR, SVAR and SVEC Models: Implementation Within {R} Package {vars}}, + author = {Bernhard Pfaff}, + journal = {Journal of Statistical Software}, + year = {2008}, + volume = {27}, + number = {4}, + url = {https://www.jstatsoft.org/v27/i04/}, +} + @Manual{PKGDYNAMAC, + 
title = {dynamac: Dynamic Simulation and Testing for Single-Equation ARDL Models}, + author = {Soren Jordan and Andrew Q. Philips}, + year = {2020}, + note = {R package version 0.1.11}, + url = {https://CRAN.R-project.org/package=dynamac}, + } + +@Manual{PKGATSA, +title = {{aTSA}: Alternative Time Series Analysis}, + author = {Debin Qiu}, + year = {2015}, + note = {R package version 3.1.2}, + url = {https://CRAN.R-project.org/package=aTSA}, +} + +@Manual{bootCT, +title = {{bootCT}: Bootstrapping the ARDL Tests for Cointegration}, + author = {Gianmarco Vacca and Stefano Bertelli}, + year = {2023}, + note = {R package version 2.0.0}, +} + +@Book{ggplot, + author = {Hadley Wickham}, + title = {ggplot2: Elegant Graphics for Data Analysis}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-3-319-24277-4}, + url = {https://ggplot2.tidyverse.org}, + } + +@Article{reshape2, + title = {Reshaping Data with the {reshape} Package}, + author = {Hadley Wickham}, + journal = {Journal of Statistical Software}, + year = {2007}, + volume = {21}, + number = {12}, + pages = {1--20}, + url = {http://www.jstatsoft.org/v21/i12/}, +} +@Manual{Rmisc, +title = {{Rmisc}: Ryan Miscellaneous}, + author = {Ryan M. Hope}, + year = {2022}, + note = {R package version 1.5.1}, + url = {https://CRAN.R-project.org/package=Rmisc}, +} +@Manual{tseries, +title = {{tseries}: Time Series Analysis and Computational Finance}, + author = {Adrian Trapletti and Kurt Hornik}, + year = {2023}, + note = {R package version 0.10-54}, + url = {https://CRAN.R-project.org/package=tseries}, +} +@Book{urca, + title = {Analysis of Integrated and Cointegrated Time Series with R}, + author = {B. 
Pfaff}, + publisher = {Springer}, + edition = {Second}, + address = {New York}, + year = {2008}, + note = {ISBN 0-387-27960-1}, + url = {https://www.pfaffikus.de}, +} +@Article{tidyverse, + title = {Welcome to the {tidyverse}}, + author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, + year = {2019}, + journal = {Journal of Open Source Software}, + volume = {4}, + number = {43}, + pages = {1686}, + doi = {10.21105/joss.01686}, +} + +@Manual{pracma, +title = {{pracma}: Practical Numerical Math Functions}, + author = {Hans W. Borchers}, + year = {2022}, + note = {R package version 2.4.2}, + url = {https://CRAN.R-project.org/package=pracma}, +} + +@Manual{aod, +title = {{aod}: Analysis of Overdispersed Data}, + author = {{Lesnoff} and {M.} and {Lancelot} and {R.}}, + year = {2012}, + note = {R package version 1.3.2}, + url = {https://cran.r-project.org/package=aod}, +} + +@Manual{gtools, +title = {{gtools}: Various R Programming Tools}, + author = {Ben Bolker and Gregory R. 
Warnes and Thomas Lumley}, + year = {2022}, + note = {R package version 3.9.4}, + url = {https://CRAN.R-project.org/package=gtools}, +} +@Manual{magrittr, +title = {{magrittr}: A Forward-Pipe Operator for R}, + author = {Stefan Milton Bache and Hadley Wickham}, + year = {2022}, + note = {R package version 2.0.3}, + url = {https://CRAN.R-project.org/package=magrittr}, +} +@Manual{stringr, +title = {{stringr}: Simple, Consistent Wrappers for Common String Operations}, + author = {Hadley Wickham}, + year = {2022}, + note = {R package version 1.5.0}, + url = {https://CRAN.R-project.org/package=stringr}, +} +@Manual{dplyr, +title = {{dplyr}: A Grammar of Data Manipulation}, + author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan}, + year = {2023}, + note = {R package version 1.1.2}, + url = {https://CRAN.R-project.org/package=dplyr}, +} +@Manual{usethis, +title = {{usethis}: Automate Package and Project Setup}, + author = {Hadley Wickham and Jennifer Bryan and Malcolm Barrett and Andy Teucher}, + year = {2023}, + note = {R package version 2.2.2}, + url = {https://CRAN.R-project.org/package=usethis}, +} + +@Manual{RcppArmadillo2023, +title = {{RcppArmadillo}: `{Rcpp}' Integration for the `{Armadillo}' Templated Linear Algebra +Library}, + author = {Dirk Eddelbuettel and Romain Francois and Doug Bates and Binxiang Ni and Conrad Sanderson}, + year = {2023}, + note = {R package version 0.12.4.0.0}, + url = {https://CRAN.R-project.org/package=RcppArmadillo}, +} + +@Article{RcppArmadillo2014, +title = {{RcppArmadillo}: Accelerating R with high-performance C++ linear algebra}, + author = {Dirk Eddelbuettel and Conrad Sanderson}, + journal = {Computational Statistics and Data Analysis}, + year = {2014}, + volume = {71}, + month = {March}, + pages = {1054--1063}, + doi = {10.1016/j.csda.2013.02.005}, +} + diff --git a/_articles/RJ-2024-003/vacca-zoia-bertelli.tex b/_articles/RJ-2024-003/vacca-zoia-bertelli.tex new file mode 100644 
index 0000000000..3c4cbf1461 --- /dev/null +++ b/_articles/RJ-2024-003/vacca-zoia-bertelli.tex @@ -0,0 +1,1410 @@ + + +% !TeX root = RJwrapper.tex +\title{bootCT: An R Package for Bootstrap Cointegration Tests in ARDL Models} +\author{by Gianmarco Vacca, Maria Zoia, Stefano Bertelli} + +\maketitle +\nocite{RSOFT} +\abstract{ +The Autoregressive Distributed Lag approach to cointegration or bound testing, proposed by Pesaran in 2001, has become prominent in empirical research. Although this approach has many advantages over the classical cointegration tests, it is not exempt from drawbacks, such as possible inconclusive inference and distortion in size. +Recently, Bertelli and coauthors developed a bootstrap approach to the bound tests to overcome these drawbacks. This paper introduces the R package bootCT, which implements this method by deriving the bootstrap versions of the bound tests and of the asymptotic F-test on the independent variables proposed by Sam and coauthors in 2019. +As a spinoff, a general method for generating random multivariate time series following a given VECM/ARDL structure is provided in the package. +Empirical applications showcase the main functionality of the package. +} + +\section{Introduction}\label{sec:intro} + Cointegration and error correction are fundamental concepts in the analysis of economic data, insofar as they provide an appropriate framework for testing economic hypotheses about growth and fluctuation. + Several approaches have been proposed in the literature to determine whether two or more non-stationary time series are cointegrated, meaning they share a common long-run relationship.\\ +There are two basic types of tests for cointegration: single equation tests and VAR-based tests. 
The former check the presence of unit roots in cointegration residuals \citep[see, e.g.,][]{engle1987co,engleyoo87,Mackinnon91,gabriel2002,cook2006power} or test the significance of the error-correction (EC) term coefficient \citep{kremers1992power,maddala1998,arranz2000,ericsson2002}. The latter, such as the \citet{johansen1991} approach, tackle the problem of detecting cointegrating relationships in a VAR model. +This latter approach, albeit having the advantage of avoiding the issue of normalization, as well as allowing the detection of multiple cointegrating vectors, is far from being perfect. In the VAR system all variables are treated symmetrically, as opposed to the standard univariate models that usually have a clear interpretation in terms of exogenous and endogenous variables. Furthermore, in a VAR system all the variables are estimated at the same time, which is problematic if the relation between some variables is flawed, that is, affected by some source of error. In this case a simultaneous estimation process tends to propagate the error affecting one equation to the others. +Furthermore, a multidimensional VAR model employs plenty of degrees of freedom.\\ +The recent cointegration approach, known as the Autoregressive Distributed Lag (ARDL) approach to cointegration or bound testing, proposed by ~\citet{pesaran2001} (PSS), falls in the former strand of literature. It has become prominent in empirical research because it shows several advantages with respect to traditional methods for testing cointegration. First, it is applicable also in cases of mixed order integrated variables, albeit with integration not exceeding the first order. +Thus, it evades the necessity of pre-testing the variables and, accordingly, avoids some common practices that may prevent finding cointegrating relationships, such as dropping variables or transforming them into stationary form ~\citep[see][]{mcnown2018bootstrapping}.
+Second, cointegration bound tests are performed in an ARDL model that allows different lag orders for each variable, thus providing a more flexible framework than other commonly employed approaches. +Finally, unlike other cointegration techniques, which are sensitive to the sample size, the ARDL approach provides robust and consistent results for small sample sizes.\\ +Notably, the ARDL bound testing methodology has quickly spread in economics and econometrics to study the cointegrating relationships between macroeconomic and financial variables, to evaluate the long-run impact of energy variables, or to assess recent environmental policies and their impact on the economy. Among the many applications, see for instance \citet{haseeb2019impact,reda2020using, menegaki2019ardl,yilanci2020brics,hussain2019environmental,abbasi2021energy}.\\ +The original bound tests proposed by \citet{pesaran2001} are an $F$-test for the significance of the coefficients of all lagged level variables entering the error correction term ($F_{ov}$), and a $t$-test for the coefficient of the lagged dependent variable. When either the dependent or the independent variables do not appear in the long-run relationship, a degenerate case arises. The bound $t$-test provides answers on the occurrence of a degenerate case of second type, while the occurrence of a degeneracy case of first type can be assessed by testing whether the dependent variable is of integration order I(1). +This type of check violates the spirit and motivation of the bound tests, which are supposed to be applicable in situations of unknown order of integration for the variables.\\ +Recently, \citet{mcnown2018bootstrapping} pointed out how, due to the low power problem of unit root tests, investigating the presence of a first type degeneracy by testing the integration order of the dependent variable may lead to incorrect conclusions. 
Therefore, they suggested checking for its occurrence by testing the significance of the lagged levels of the independent variables via an extra $F$-test ($F_{ind}$), which was also worked out in its asymptotic version \citep[SMK;][]{sam2019augmented}.\\ +Besides problems in testing the occurrence of degenerate cases, in general, the main drawback of the bound tests is the occurrence of potentially inconclusive results, if the test statistic lies between the bounds of the test distribution under the null. Furthermore, the asymptotic distributions of the statistics may provide a poor approximation of the true distributions in small samples. Finite sample critical values, even if only for a subset of all possible model specifications, have been worked out in the literature \citep[see][]{mills2001real,narayan2004crime,kanioura2005critical,narayan2005saving}, while \cite{kripfganz2020response} provided the quantiles of the asymptotic distributions of the tests as functions of the sample size, the lag order and the number of long-run forcing variables. However, this relevant improvement does not eliminate the uncertainty related to the inconclusive regions, or the existence of other critical issues related to the underlying assumptions of the bound test framework, such as the (weak) exogeneity of the independent variables or the non-stationarity of the dependent variable.\\ +To overcome the mentioned bound test drawbacks, \cite{bertelli2022bootstrap} proposed bootstrapping the ARDL cointegration test. Inference can always be pursued with ARDL bootstrap tests, unlike what happens with both the PSS tests and the SMK test on the independent variables. +Bootstrap ARDL tests were first put forward by \cite{mcnown2018bootstrapping} in an unconditional ARDL model, which omits the instantaneous differences of the exogenous variables in the ARDL equation, rather than a conditional one, as originally proposed by \cite{pesaran2001}. 
+The unconditional model is often used, for reasons of practical convenience, in empirical research. Simulation results in \cite{bertelli2022bootstrap} have highlighted the importance of employing the appropriate specification, especially under degenerate cases. In fact, it has been pointed out that a correct detection of these cases requires the comparison of the test outcomes in both the conditional and unconditional settings. Erroneous conclusions, based exclusively on one model specification, can thus be avoided.\\ +In this paper, bootstrap bound tests, including the bootstrap versions of the $F_{ov}$, $t$ and $F_{ind}$ bound tests, are carried out in a conditional ARDL model setting. This approach makes it possible to overcome the problem of inconclusive regions of the standard bound tests. A comparison with the outcomes engendered by the unconditional ARDL bootstrap tests is nevertheless provided for the $F_{ind}$ test, to avoid erroneous inference in the presence of degenerate cases.\\ +The paper is organized as follows. Section \ref{sec:cointegration} introduces the theoretical results of the ARDL cointegration bound tests. Section \ref{sec:boot} details the steps carried out by the bootstrap procedure, which allows the construction of the (bootstrap) distribution -- under the null -- for the $F_{ov}$, $t$, conditional $F_{ind}$ and unconditional $F_{ind}$ tests. Section \ref{sec:pkg} introduces the \code{R} package \CRANpkg{bootCT} \citep{bootCT} and its functionalities: a method for the generation of random multivariate time series that follow a user-specified VECM/ARDL structure, with some examples, and the main function that carries out the aforementioned bootstrap tests, while also computing the PSS and SMK bound tests. +The trade-off between accuracy and computational time of the bootstrap procedure is also investigated, under several scenarios in terms of sample size and number of replications.
Notably, a function that performs the PSS bound tests is already available in the \CRANpkg{dynamac} package \citep{PKGDYNAMAC}, while no \code{R} routine has so far been implemented for the SMK test, to the best of our knowledge. +Section \ref{sec:app} gives some empirical applications that employ the core function of the package and its possible outputs. Section \ref{sec:end} concludes. Appendix \ref{sec:appendix} briefly delves into technical details of the conditional ARDL model and its possible specifications +\footnote{The \code{R} packages, either used in the creation of \CRANpkg{bootCT} or employed in the analyses presented in this paper, are \CRANpkg{magrittr} \citep{magrittr}, \CRANpkg{gtools} \citep{gtools}, \CRANpkg{pracma} \citep{pracma}, \CRANpkg{Rcpp} \citep{RCPP}, \CRANpkg{RcppArmadillo} \citep{RcppArmadillo2023}, \CRANpkg{Rmisc} \citep{Rmisc}, \CRANpkg{dynamac} \citep{PKGDYNAMAC}, \CRANpkg{ARDL} \citep{PKGARDL}, \CRANpkg{aod} \citep{aod}, \CRANpkg{vars} and \CRANpkg{urca} \citep{PKGVARS, urca}, \CRANpkg{aTSA} \citep{PKGATSA}, \CRANpkg{tseries} \citep{tseries}, \CRANpkg{reshape2}, \CRANpkg{ggplot2} and \CRANpkg{stringr} \citep{reshape2,ggplot,stringr}, \CRANpkg{tidyverse} and \CRANpkg{dplyr} \citep{tidyverse,dplyr}.}. + +\section{Cointegration bound tests in ARDL models}\label{sec:cointegration} +The starting point of the approach proposed by ~\cite{pesaran2001} is a $(K+1)$ VAR($p$) model +\begin{equation}\label{eq:var} +\mathbf{A}(L)(\mathbf{z}_t-\boldsymbol{\mu}-\boldsymbol{\eta}t)=\boldsymbol{\varepsilon}_t \enspace \enspace \enspace \boldsymbol{\varepsilon}_t\sim N(\mathbf{0}, \boldsymbol{\Sigma}),\qquad\mathbf{A}(L)=\left(\mathbf{I}_{K+1}- \sum_{j=1}^{p}\mathbf{A}_j\mathbf{L}^j\right) +\enspace \enspace \enspace t=1,2,\dots,T. 
+\end{equation} +Here, $\mathbf{A}_j$ are square $(K+1)$ matrices, $\mathbf{z}_t$ a vector of $(K+1)$ variables, +$\boldsymbol{\mu}$ and $\boldsymbol{\eta}$ are $(K+1)$ vectors representing the drift and the trend respectively, and $\det(\mathbf{A}(z))=0$ for $|z| \geq 1$. If the matrix $\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j}$ is singular, the components of $\mathbf{z}_t$ turn out to be integrated and possibly cointegrated.\\ +The VECM representation of \eqref{eq:var} is given by (see Appendix \ref{sec:appendixa} for details) +\begin{equation}\label{eq:vecm} +\Delta\mathbf{z}_t=\boldsymbol{\alpha}_{0}+\boldsymbol{\alpha}_{1}t-\mathbf{A}(1)\mathbf{z}_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta \mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_t. +\end{equation} +Now, to study the adjustment to the equilibrium of a single variable $y_t$, given the other $\mathbf{x}_t$ variables, the vectors $\mathbf{z}_t$ and $\boldsymbol{\varepsilon}_t$ are partitioned +\begin{equation}\label{eq:vecpart} +\mathbf{z}_t=\begin{bmatrix} +\underset{(1,1)}{y_{t}} \\ \underset{(K,1)}{\mathbf{x}_{t}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\varepsilon}_t=\begin{bmatrix} +\underset{(1,1)}{\varepsilon_{yt}} \\ \underset{(K,1)}{\boldsymbol{\varepsilon}_{xt}} +\end{bmatrix}. +\end{equation} +%Furthermore, let us assume that + The matrix $\mathbf{A}(1)$, which is assumed to be singular to allow cointegration, is partitioned conformably to $\mathbf{z}_{t}$ as \footnote{ If the explanatory variables are stationary $\mathbf{A}_{xx}$ is non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are integrated but without cointegrating relationship $\mathbf{A}_{xx}$ is a null matrix.} \\ +\begin{equation} + \mathbf{A}(1)=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}. 
+\end{equation} +Under the assumption +\begin{equation}\label{eq:normerr} +\boldsymbol{\varepsilon}_t \sim N\Bigg(\mathbf{0}, \begin{bmatrix} +\underset{(1,1)}{\sigma_{yy}}& \underset{(1,K)}{\boldsymbol{\sigma}_{yx}'} \\ \underset{(K,1)}{\boldsymbol{\sigma}_{xy}} & \underset{(K,K)}{\boldsymbol{\Sigma}_{xx}} \end{bmatrix}\Bigg), +\end{equation} +the following holds +\begin{equation}\label{eq:epsilonx} +\varepsilon_{yt}=\boldsymbol{\omega}'\boldsymbol{\varepsilon}_{xt}+\nu_{yt} \sim N(0,\sigma_{y.x}), +\end{equation} +where $\sigma_{y.x}=\sigma_{yy}-\boldsymbol{\omega}'\boldsymbol{\sigma}_{xy}$ with $\boldsymbol{\omega}'=\boldsymbol{\sigma}'_{yx}\boldsymbol{\Sigma}^{-1}_{xx}$, and $\nu_{yt}$ is independent of $\boldsymbol{\varepsilon}_{xt}$.\\ +Substituting \eqref{eq:epsilonx} into \eqref{eq:vecm} and assuming that the $\mathbf{x}_{t}$ variables are exogenous towards the ARDL parameters (that is, setting $\mathbf{a}_{xy}=\mathbf{0}$ in $\mathbf{A}(1)$) yields the system (see Appendix \ref{sec:appendixa} for details) +\begin{equation}\label{eq:ardl} + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt} +\end{equation} +\begin{equation}\label{eq:marg} +\Delta\mathbf{x}_{t} += \boldsymbol{\alpha}_{0x} +\boldsymbol{\alpha}_{1x}t+ \mathbf{A}_{(x)}\mathbf{z}_{t-1}+ \boldsymbol{\Gamma}_{(x)}(L)\Delta\mathbf{z}_t+ \boldsymbol{\varepsilon}_{xt}, +\end{equation} +where +\begin{equation}\label{eq:ardlgamma} +\boldsymbol\gamma_{y.x,j}'=\boldsymbol\gamma_{y,j}'-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x),j} +\end{equation} +\begin{equation}\label{eq:ardldet} +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x}, +\end{equation} +and where the error correction term, $EC_{t-1}$, expressing the long-run equilibrium relationship between $y_{t}$ and 
$\mathbf{x}_{t}$, is given by +\begin{equation}\label{eq:ec} +EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}, +\end{equation} +with +\begin{equation}\label{eq:const} +\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}, \enspace\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}'}_{y.x}}{a_{yy}}=-\frac{\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx}}{a_{yy}}. +\end{equation} +Thus, no cointegration occurs when $\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}$ or $a_{yy}=0$ . +These two circumstances are referred to as degenerate case of second and first type, respectively. Degenerate cases imply no cointegration between $y_{t}$ and $\mathbf{x}_{t}$.\\ +To test the hypothesis of cointegration between $y_{t}$ and $\mathbf{x}_{t}$, \citet{pesaran2001} proposed an $F$-test, $F_{ov}$ hereafter, based on the hypothesis system +\begin{align}\label{eq:h0sys} +H_0: a_{yy}=0 \; \cap \;\widetilde{\mathbf{a}}_{y.x}=\mathbf{0}\\ +H_1: a_{yy} \neq 0 \; \cup \;\widetilde{\mathbf{a}}_{y.x}\neq \mathbf{0}. +\end{align} +Note that $H_{1}$ covers also the degenerate cases +\begin{align}\label{eq:h0deg} +H_1^{y.x}: a_{yy}=0 \; , \;\widetilde{\mathbf{a}}_{y.x}\neq\mathbf{0}\\ +H_1^{yy}: a_{yy} \neq 0 \; , \;\widetilde{\mathbf{a}}_{y.x} = \mathbf{0}. +\end{align} +The exact distribution of the $F$ statistic under the null is unknown, but it is limited from above and below by two asymptotic distributions: one corresponding to the case of stationary regressors, and another corresponding to the case of first-order integrated regressors. As a consequence, the test is called bound test and has an inconclusive area. 
+\footnote{The knowledge of the rank of the cointegrating matrix is necessary to overcome this impasse.}\\ +~\citet{pesaran2001} worked out two sets of (asymptotic) critical values: one, $\{\tau_{L,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(0)$ and another, $\{\tau_{U,F}\}$, for the case when $\mathbf{x}_{t}\sim{I}(1)$. These values vary in accordance with the number of regressors in the ARDL equation, the sample size and the assumptions made about the deterministic components (intercept and trend) of the data generating process. \\ +In this regard, ~\citet{pesaran2001} introduced five different specifications for the ARDL model, depending on its deterministic components, which are (see Appendix \ref{sec:appendixb} for details) +%\ref{sec:appendixb}) +\begin{enumerate}[I.] +\item \textit{No intercept and no trend} +\begin{align}\label{eq:case1} +\Delta y_t=-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, +\end{align} +where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$, \\ +\item \textit{Restricted intercept and no trend} +\begin{align}\label{eq:case2} +\Delta y_{t}= +-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, +\end{align} +where $EC_{t-1}=y_{t-1}-\theta_{0}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. The intercept extracted from the EC term is $\alpha_{0.y}^{EC} = a_{yy}\theta_0$. +\item \textit{Unrestricted intercept and no trend} +\begin{align}\label{eq:case3} +\Delta y_{t} +=\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, +\end{align} +where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. 
+\item \textit{Unrestricted intercept, restricted trend} +\begin{align}\label{eq:case4} +\Delta y_{t}= +\alpha_{0.y}-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, +\end{align} +where $EC_{t-1}=y_{t-1}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. The trend extracted from the EC term is $\alpha_{1.y}^{EC} = a_{yy}\theta_1$. +\item \textit{Unrestricted intercept, unrestricted trend} +\begin{align}\label{eq:case5} +\Delta y_{t} +=\alpha_{0.y}+\alpha_{1.y}t +-a_{yy}EC_{t-1}+\sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{y.x,j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\omega}'\Delta\mathbf{x}_{t}+\nu_{yt}, +\end{align} +where $EC_{t-1}=y_{t-1}-\boldsymbol{\theta}'\mathbf{x}_{t-1}$. +\end{enumerate} +The model in \eqref{eq:ardl} proposed by \citet{pesaran2001} represents the correct framework in which to carry out bound tests. However, bound tests are often performed in an unconditional ARDL model setting, specified as +\begin{equation}\label{eq:ardluc} + \Delta y_{t}=\alpha_{0.y}+\alpha_{1.y}t -a_{yy}EC_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\gamma}'_{j}\Delta\mathbf{z}_{t-j}+\varepsilon_{yt}, +\end{equation} +which omits the term $\boldsymbol{\omega}'\Delta\mathbf{x}_{t}$.\\ +\cite{bertelli2022bootstrap} have highlighted that bootstrap tests performed in these two ARDL specifications can lead to contrasting results. To explain this divergence, note that the conditional model makes use of the following vector in the EC term +\begin{equation} +\widetilde{\mathbf{a}}_{y.x}'=\mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx} +\end{equation} +(divided by $a_{yy}$, see \eqref{eq:const}) to carry out bound tests, while the unconditional one only uses the vector $\mathbf{a}_{yx}'$ (divided by $a_{yy}$), since it neglects the term $\boldsymbol{\omega}'\mathbf{A}_{xx}$.
\footnote{The latter is introduced in the ARDL equation by the operation of conditioning $y_t$ on the other variables $\mathbf{x}_t$ of the model.} This can lead to contrasting inference in two instances. The first happens when a degeneracy of first type occurs in the conditional model, that is +\begin{equation}\label{eq:deg1cond} +\widetilde{\mathbf{a}}_{y.x}'=\mathbf{0}, +\end{equation} +because + \begin{equation} + \mathbf{a}_{yx}'=\boldsymbol{\omega}'\mathbf{A}_{xx}. + \end{equation} +In this case, the conditional model rejects cointegration, while the unconditional one concludes the opposite. +The other case happens when a degeneracy of first type occurs in the unconditional model, that is +\begin{equation}\label{eq:deg1uc} +\mathbf{a}_{yx}'=\mathbf{0}, +\end{equation} +but +\begin{equation} +\widetilde{\mathbf{a}}_{y.x}'=\boldsymbol{\omega}'\mathbf{A}_{xx} \neq \mathbf{0}. + \end{equation} +In this case, the unconditional model rejects cointegration, while the conditional one concludes that cointegrating relationships exist, which are, however, spurious. +Only a comparison of the outcomes of the $F_{ind}$ test performed in both the conditional and unconditional ARDL equation can help to disentangle this problem. +\footnote{In fact, as $\boldsymbol{\omega}'\mathbf{A}_{xx}\mathbf{x}_{t} \approx I(0)$, the conclusion that $y_{t}\approx I(0)$ must hold. This in turn entails that no cointegration occurs between $y_t$ and $\mathbf{x}_{t}$.} \\ +In the following, bootstrap tests are carried out in the conditional ARDL model \eqref{eq:ardl}. However, when a degeneracy of first type occurs in the unconditional model, the outcomes of the $F_{ind}$ bootstrap test performed in both the conditional and unconditional settings are provided. This, as previously outlined, is performed to avoid the acceptance of spurious long-run relationships between the dependent variable and the independent variables.
+ +\section{The new bootstrap procedure}\label{sec:boot} +The bootstrap procedure proposed here focuses on an ARDL model specified as in \eqref{eq:case1}-\eqref{eq:case5}, depending on the assumptions on the deterministic components.\\ +The bootstrap procedure consists of the following steps: +\begin{enumerate} +\item The ARDL model is estimated via OLS and the related test statistics $F_{ov}$, $t$ or $F_{ind}$ are computed. +\item In order to construct the distribution of each test statistic under the corresponding null, the same model is re-estimated imposing the appropriate restrictions on the coefficients according to the test under consideration. +\item Following \cite{mcnown2018bootstrapping}, the ARDL restricted residuals are then computed. For example, under Case III, the residuals are +\begin{equation}\label{eq:resfov} +\widehat{\nu}_{yt}^{F_{ov}}=\Delta y_{t}-\widehat{\alpha}_{0.y}-\sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} +\end{equation} +\begin{equation}\label{eq:rest} +\widehat{\nu}_{yt}^{t}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{\widetilde{\mathbf{a}}}'_{y.x}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t} +\end{equation} +\begin{equation}\label{eq:resfind} +\widehat{\nu}_{yt}^{F_{ind}}=\Delta y_{t}-\widehat{\alpha}_{0.y}+\widehat{a}_{yy}y_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\gamma}}_{y.x,j}'\Delta\mathbf{z}_{t-j}-\widehat{\boldsymbol{\omega}}'\Delta\mathbf{x}_{t}. +\end{equation} +Here, the symbol $\widehat{\,\cdot\,}$ denotes the estimated parameters. The other cases can be dealt with in a similar manner.
+\item The VECM model +\begin{equation}\label{eq:vecmhat} + \Delta\mathbf{z}_{t}=\boldsymbol{\alpha}_{0}-\mathbf{A}\mathbf{z}_{t-1}+ \sum_{j=1}^{p-1}\boldsymbol{\Gamma}_{j}\Delta\mathbf{z}_{t-j}+\boldsymbol{\varepsilon}_{t} +\end{equation} +is estimated as well (imposing weak exogeneity), and the residuals +\begin{equation}\label{eq:resvecm} +\widehat{\boldsymbol{\varepsilon}}_{xt}= \Delta\mathbf{x}_{t}-\widehat{\boldsymbol{\alpha}}_{0x}+\widehat{\mathbf{A}}_{xx}\mathbf{x}_{t-1}- \sum_{j=1}^{p-1}\widehat{\boldsymbol{\Gamma}}_{(x)j}\Delta\mathbf{z}_{t-j} +\end{equation} + are computed. This approach guarantees that the residuals $\widehat{\boldsymbol{\varepsilon}}_{xt}$, associated to the variables $\mathbf{x}_{t}$ explained by the marginal model \eqref{eq:marg}, are uncorrelated with the ARDL residuals $\widehat{\nu}_{yt}^{.}$. + \item A large set of $B$ bootstrap replicates are sampled from the residuals calculated as in \eqref{eq:resfov},\eqref{eq:rest}, \eqref{eq:resfind} and \eqref{eq:resvecm}. 
In each replication, the following operations are carried out: +\begin{enumerate} + \item Each set of $(T-p)$ resampled residuals (with replacement) $\widehat{\boldsymbol{\nu}}_{zt}^{(b)}=(\widehat{\nu}_{yt}^{(b)},\widehat{\boldsymbol{\varepsilon}}_{xt}^{(b)})$ is re-centered \citep[see][]{davidson2005case} +\begin{align} +\dot{\widehat{\nu}}^{(b)}_{yt}&=\widehat{\nu}^{(b)}_{yt} -\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\nu}^{(b)}_{yt} \label{eq:recentery} \\ +\dot{\widehat{\boldsymbol{\varepsilon}}}^{(b)}_{x_{i}t}&=\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}-\frac{1}{T-p}\sum_{t=p+1}^{T}\widehat{\boldsymbol{\varepsilon}}^{(b)}_{x_{i}t}\qquad i=1,\dots,K.\label{eq:recenterx} +\end{align} +\item A sequential set of $(T-p)$ bootstrap observations, $y^{*}_{t},\enspace \mathbf{x}^{*}_{t}$, $t=p+1,\dots,T$, is generated as follows + \begin{equation} + y^{*}_{t}=y^{*}_{t-1}+\Delta y^{*}_{t}, \enspace \enspace \mathbf{x}^{*}_{t}=\mathbf{x}^{*}_{t-1}+\Delta \mathbf{x}^{*}_{t}, +\end{equation} +where $\Delta \mathbf{x}^{*}_{t}$ are obtained from \eqref{eq:resvecm} and $\Delta y^{*}_{t}$ from either \eqref{eq:resfov}, \eqref{eq:rest} or \eqref{eq:resfind} after replacing in each of these equations the original residuals with the bootstrap ones. \\ +The initial conditions, that is the observations before $t=p+1$, are obtained by drawing randomly $p$ observations in a block from the original data, so as to preserve the data dependence structure. +\item An unrestricted ARDL model is estimated via OLS using the bootstrap observations, and the statistics $F_{ov}^{(b),H_0}$, $t^{(b),H_0}$, $F_{ind}^{(b),H_0}$ are computed. +\end{enumerate} + +\item The bootstrap distributions of $\big\{F_{ov}^{(b),H_0}\big\}_{b=1}^B$, $\big\{F_{ind}^{(b),H_0}\big\}_{b=1}^B$ and $\big\{t^{(b),H_0}\big\}_{b=1}^B$ under the null are then employed to determine the critical values of the tests.
By denoting with $M^*_b$ the ordered bootstrap test statistic, and with $\alpha$ the nominal significance level, the bootstrap critical values are determined as follows + \begin{equation}\label{eq:bootf} +c^*_{\alpha,M}=\min\bigg\{c:\frac{1}{B}\sum_{b=1}^{B}\mathbf{1}_{\{M^*_b >c\}} \leq\alpha\bigg\} +\qquad M\in\{F_{ov},F_{ind}\}\end{equation} +for the $F$ tests and +\begin{equation}\label{eq:boott} +c^*_{\alpha,t}=\max\bigg\{c:\frac{1}{B}\sum_{b=1}^{B}\mathbf{1}_{\{t^*_b <c\}} \leq\alpha\bigg\} +\end{equation} +for the $t$ test. +% NOTE(review): a span of the original source appears to be missing here (the end of Section \ref{sec:boot}, the opening of Section \ref{sec:pkg}, and the prose describing the sim_vecm_ardl function). The section/subsection headings and the TikZ node styles below are reconstructed so that cross-references resolve and the first flowchart compiles; restore the missing text from the original manuscript. +\section{The \CRANpkg{bootCT} package}\label{sec:pkg} +\subsection{Simulating VECM/ARDL processes: the \code{sim\_vecm\_ardl} function} +\newpage +\begin{landscape} +\tikzset{ + treenode/.style = {shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 4.1cm, + inner sep=2ex, + anchor=center}, + input/.style = {align=center, text width=4.5cm}, + decision/.style = {treenode, diamond, inner sep=2pt, + text width=2cm}, + action/.style = {treenode, circle, inner sep=1pt, + text width = 3cm}, + root/.style = {treenode}, + env/.style = {treenode}, + dummy/.style = {circle,draw}, + ar/.style={->,>=latex} +} +\tikzset{connect/.style={rounded corners=#1, + to path= ($(\tikztostart)!-#1!(\tikztotarget)!-#1!-90:(\tikztotarget)$) -- ($(\tikztotarget)!-#1!(\tikztostart)!-#1!90:(\tikztostart)$) -- + ($(\tikztotarget)!-#1!(\tikztostart)!#1!90:(\tikztostart)$) -- ($(\tikztostart)!-#1!(\tikztotarget)!-#1!90:(\tikztotarget)$) -- cycle (\tikztotarget) +}} +\tikzset{connect/.default=4mm} +\begin{figure}[ht!] +\centering +\begin{tikzpicture}[-latex][scale=0.3] + \matrix (chart) + [ + matrix of nodes, + column sep = 2.5em, + row sep = 1ex, + row 2/.style = {nodes={decision}}, + row 5/.style = {nodes={env}} + ] + { %first row + |[input]| {\small VAR/VECM input\\ $\boldsymbol \mu$, $\boldsymbol \eta$, $\boldsymbol\alpha_0$, $\boldsymbol\alpha_1$, \texttt{case}} + & + |[input]| {\small VECM/ARDL input\\ $\mathbf A_{xx},\mathbf a_{yx},a_{yy},\boldsymbol{\Gamma}_j$}& + \\ + %second row + |[treenode]|{\scriptsize VECM + Intercept and trend\\ + \phantom{x}\\ + CASE I:\\ $\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf 0\rightarrow + \boldsymbol\alpha_0 = \boldsymbol\alpha_1 = \mathbf 0$ \\ + \phantom{x}\\ + CASE II:\\ + $\boldsymbol{\mu}$ input, $\boldsymbol\eta=\mathbf 0$, $\boldsymbol\alpha_{0} = \mathbf A(1) \boldsymbol{\mu}$, $\boldsymbol\alpha_1 = \mathbf 0$\\ + \phantom{x}\\ + CASE III:\\ + $\boldsymbol\eta=\mathbf 0 $ \\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\alpha}_1 = \mathbf 0$\\ + \phantom{x}\\ + CASE IV:\\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\eta}$ input, $\boldsymbol{\alpha}_1 =\mathbf
A(1)\boldsymbol{\eta}$\\ + \phantom{x}\\ + CASE V:\\ + $\boldsymbol{\alpha}_{0}$ input, $\boldsymbol{\alpha}_1$ input + \normalsize}& + |[treenode]| {\small Long-run VECM matrix + $\mathbf A = + \begin{bmatrix} {a_{yy}} & {\mathbf{a}_{yx}'} \\ + {\mathbf 0} & {\mathbf{A}_{xx}} + \end{bmatrix}$.\\ + \phantom{x}\\ + Short-run VECM matrices + $\boldsymbol\Gamma_j$ \\ + $\boldsymbol\Gamma(1) = \mathbf{I_K}-\sum_{j=1}^p\boldsymbol\Gamma_j$}& + |[treenode]|{\scriptsize ARDL + Intercept and trend\\ + \phantom{x}\\ + CASE I:\\ $\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf 0\rightarrow$\\ + $\theta_0=\alpha_{0.y} = \theta_1=\alpha_{1.y} = 0$\\ + \phantom{x}\\ + CASE II:\\ + $\theta_0\neq 0 \enspace \alpha_{0.y} = 0$ (Intercept in $EC$)\\ + $\boldsymbol\eta=\mathbf 0 \rightarrow \theta_1 =\alpha_{1.y}= 0$\\ + \phantom{x}\\ + CASE III:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\boldsymbol\eta=\mathbf 0 \rightarrow \theta_1 =\alpha_{1.y}= 0$\\ + \phantom{x}\\ + CASE IV:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\theta_1\neq 0 \enspace \alpha_{1.y} = 0$ (Trend in $EC$)\\ + \phantom{x}\\ + CASE V:\\ + $\alpha_{0.y}=\alpha_{0y}-\boldsymbol\omega'\boldsymbol\alpha_{0x}$ $(\theta_0 = 0)$\\ + $\alpha_{1.y}=\alpha_{1y}-\boldsymbol\omega'\boldsymbol\alpha_{1x}$ $(\theta_1 = 0)$ + \normalsize} + \\ + %third row + |[action]| {\small $\boldsymbol{\Sigma}$ input.\\ Error generation\\ + $\mathbf u_t'\sim N_{K+1}(\mathbf 0,\boldsymbol\Sigma)$ + \normalsize}& + |[action]| {\small Conditioning\\ + $\boldsymbol\omega'= + \boldsymbol\sigma_{yx}'\boldsymbol\Sigma_{xx}^{-1}$ + \normalsize}& + |[treenode]|{\small $\mathbf {\tilde{a}}_{y.x}'=\mathbf a_{yx}'-\boldsymbol\omega'\mathbf A_{xx}$\\ + $\widetilde{\mathbf A} = + \begin{bmatrix} + {a_{yy}} & \mathbf{\tilde{a}}'_{y.x}\\ + {\mathbf 0} & {\mathbf{A}_{xx}} + \end{bmatrix}$\\ + $\boldsymbol\gamma_{y.x,j}= + 
\boldsymbol\gamma_{yx}-\boldsymbol\omega'\boldsymbol\Gamma_{(x),j}$\\ + $\widetilde{\boldsymbol\Gamma}_j = + \begin{bmatrix} + \boldsymbol{\gamma}_{y.x,j}\\ + \boldsymbol\Gamma_{(x),j} + \end{bmatrix}$\\ + $\nu_{yt}=\varepsilon_{yt}-\boldsymbol\omega'\boldsymbol\varepsilon_{xt}$ + \normalsize}\\ + |[input]| {\small Other input:\\ + \texttt{nobs}, \texttt{burn.in}} + & + |[action]| {\small $\Delta \mathbf x_t$ via \eqref{eq:marg}\\ + $\mathbf x_t = \Delta \mathbf x_t + \mathbf x_{t-1} $\\ + $\Delta y_t$ via \eqref{eq:ardl}\\ + $y_t = \Delta y_t + y_{t-1} $\\ + \normalsize}&\\ + }; + \draw[thick] + (chart-1-1) -> (chart-2-1); + \draw[thick] + (chart-1-2) -> (chart-2-2); + \draw[thick] + (chart-2-2) -> (chart-2-1); + \draw[thick] + (chart-2-1) -> (chart-3-2); + \draw[thick] + (chart-2-2) -> (chart-3-2); + \draw[thick] + (chart-3-1) -> (chart-3-2); + \draw[thick] + (chart-3-2) -> (chart-2-3); + \draw[thick] + (chart-3-2) -> (chart-3-3); + \begin{scope}[transform canvas={yshift=1.7em}] + \draw[red,dashed,ultra thick] (chart-2-3) to[connect=29.5mm,rounded corners=2mm] (chart-3-3); + \end{scope} + \begin{scope}[transform canvas={yshift=1em}] + \draw[blue,dashed,ultra thick] (chart-2-1) to[connect=27mm,rounded corners=2mm] (chart-3-1); + \end{scope} + \draw[blue,ultra thick,shorten < = 1.85cm] (chart-3-1) edge node[left=0.75cm,pos=0.67]{\small Unconditional parameters for $\Delta\mathbf x_t$\normalsize} (chart-4-2); + \draw[red,ultra thick,shorten < = 0.75cm] (chart-3-3) edge node[right=1cm,pos=0.5]{\small Conditional parameters for $\Delta y_t$\normalsize} (chart-4-2); + \draw[thick] + (chart-3-2) -> (chart-4-2); + \draw[thick] (chart-4-2) edge [in=-20, out=20,looseness=3,right=0.2cm] node[right=0.2cm]{\small Until \texttt{nobs+burn.in}. + Discard \texttt{burn.in}\normalsize} (chart-4-2); + \draw[thick] + (chart-4-1) -> (chart-4-2); + \end{tikzpicture} + \caption{Flowchart of the \texttt{sim\_vecm\_ardl} function inner steps. 
When applying \eqref{eq:ardl} and \eqref{eq:marg}, $y_{t_j}=0$, $\Delta y_{t_j}=0$, $\mathbf x_{t_j}=\mathbf 0$, $\Delta \mathbf x_{t_j}= \mathbf 0$ for any $t_j < 1$. Boxes denote parameter definitions and transformations. Circles denote crucial actions, Empty nodes denote function inputs.}\label{fig:flowchart} + \end{figure} +\end{landscape} +\subsection{Bootstrapping the ARDL bound tests: the \code{boot\_ardl} function} +This function develops the bootstrap procedure detailed previously. As an option in the initial estimation phase, it offers the possibility of automatically choosing the best order for the lagged differences of all the variables in the ARDL and VECM models. This is done by using several criteria. In particular, AIC, BIC, AICc, $R^2$ and $R^2_{adj}$ are used as lag selection criteria for the ARDL model, while the overall minimum between AIC, HQIC, SC and FPE is used for the lag selection for the VECM.\\ +In particular, the \code{auto\_ardl} function in the package \CRANpkg{ARDL} \citep{PKGARDL} selects the best ARDL order in terms of the short-run parameter vectors $\boldsymbol\gamma_{y.x,j}$, while the \code{VARselect} function in the package \CRANpkg{vars} \citep{PKGVARS} selects the best VECM order in terms of the short-run parameter matrices $\boldsymbol\Gamma_{(x),j}$. Furthermore, the user can input a significance threshold for the retention of single parameters in the $\boldsymbol\Gamma_j$ and in the $\boldsymbol\gamma_{y.x,j}$ vectors.\\ +The function \code{boot\_ardl} takes the following arguments: +\begin{itemize} +\item \code{data}: input dataset. Must contain a dependent variable and a set of independent variables; +\item \code{yvar}: name of the dependent variable enclosed in quotation marks. If unspecified, the first variable in the dataset is used; +\item \code{xvar}: vector of names of the independent variables, each enclosed in quotation marks. 
If unspecified, all variables in the dataset except the first are used; +\item \code{fix.ardl}: vector $(j_1,\dots,j_K)$, containing the maximum orders of the lagged differences (i.e., $\Delta y_{t-j_1}, \Delta x_{1,t-j_2},\dots,$ $\Delta x_{1,t-j_K}$) for the short term part of the ARDL equation, chosen in advance; +\item \code{info.ardl}: (alternatively to \code{fix.ardl}) the information criterion used to choose the best lag order for the short term part of the ARDL equation. It must be one among \code{AIC} (default), \code{AICc}, \code{BIC}, \code{R2}, \code{adjR2}; +\item \code{fix.vecm}: scalar $m$ containing the maximum order of the lagged differences (i.e., $\Delta\mathbf z_{t-m}$) for the short term part of the VECM equation, chosen in advance; +\item \code{info.vecm}: (alternatively to \code{fix.vecm}) the information criterion used to choose the best lag order for the short term part of the VECM equation. Must be one among \code{AIC} (default), \code{HQIC}, \code{SC}, \code{FPE}; +\item \code{maxlag}: (in conjunction with \code{info.ardl} / \code{info.vecm}) maximum number of lags for the \code{auto\_ardl} function in the package \CRANpkg{ARDL}, and for the \code{VARselect} function in the package \CRANpkg{vars}; +\item \code{a.ardl}: significance threshold for the short-term ARDL coefficients ($\boldsymbol\gamma_{y.x,j}$) in the ARDL model estimation; +\item \code{a.vecm}: significance threshold for the short-term VECM coefficients (in $\boldsymbol\Gamma_j$) in the VECM model estimation; +\item \code{nboot}: number of bootstrap replications; +\item \code{case}: type of the specification for the conditional ARDL in terms of deterministic components (intercept and trend) among the five proposed by \cite{pesaran2001}, given in \eqref{eq:case1}-\eqref{eq:case5}; +\item \code{a.boot.H0}: probability level(s) $\alpha$ at which the critical quantiles of the bootstrap distribution(s) $c^{*}_{\alpha,F_{ov}}$, $c^{*}_{\alpha,t}$ and $c^{*}_{\alpha,F_{ind}}$ must
be calculated; +\item \code{print}: if set to \code{TRUE}, shows the progress bar. +\end{itemize} +\code{boot\_ardl} makes use of the \code{lag\_mts} function which produces lagged versions of a given matrix of time series, each column with a separate order. \code{lag\_mts} takes as parameters the data included in a matrix \code{X} and the lag orders in a vector \code{k}, with the addition of a boolean parameter \code{last.only}, which allows to specify whether only the $k$-th order lags have to be retained, or all the lag orders from the first to the $k$-th.\\ +\code{boot\_ardl} also acts as a wrapper for the most common methodologies detecting cointegration, offering a comprehensive view on the testing procedures involved in the analysis. +The resulting object, of class \code{bootCT}, contains all the information about +\begin{itemize} +\item The conditional ARDL model estimates, and the unconditional VECM model estimates; + \item the bootstrap tests performed in the conditional ARDL model; + \item the Pesaran, Shin and Smith bound testing procedure ($F_{ov}$ and $t$-test, when applicable); + \item the Sam, McNown and Goh bound testing procedure for $F_{ind}$, when applicable; + \item the Johansen rank and trace cointegration tests on the independent variables. + \end{itemize} + Internally, the bootstrap data generation under the null is executed via a \code{Rcpp} function, employing the \CRANpkg{Rcpp} and \CRANpkg{RcppArmadillo} packages \citep{RCPP}, so as to greatly speed up computational times. + As explained in the previous section, cointegration tests in the unconditional ARDL model are performed in order to uncover the presence of spurious cointegrating relationships.\\ +To this end, the function provides + \begin{itemize} + \item the bootstrap critical values of the $F_{ov}$, $t$ and $F_{ind}$ tests in the conditional model, at level \code{a.boot.H0}, along with the same statistics computed in the conditional model. 
+ \item a flag, called \code{fakecoint}, that indicates divergence between the outcomes of the $F_{ind}$ test performed in both the conditional and unconditional model. In this circumstance, as explained before, there is no cointegration \citep[see][]{bertelli2022bootstrap}. +\end{itemize} +A \code{summary} method has been implemented to present the results in a visually clear manner. It accepts the additional argument "\code{out}" that lets the user choose which output(s) to visualize: \code{ARDL} prints the conditional ARDL model summary, \code{VECM} prints the VECM model summary, \code{cointARDL} prints the summary of the bound tests and the bootstrap tests, \code{cointVECM} prints the summary of the Johansen test on the independent variables.\\ +A detailed flowchart showing the function's workflow is displayed in Figure \ref{fig:flowchart_ardl}. There, the expressions "C ARDL" and "UC ARDL" stand for conditional and unconditional ARDL model, respectively.\\ +\newpage +\begin{landscape} +\tikzset{ + treenode/.style = {shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 4.1cm, + inner sep=2ex, + anchor=center}, + treenodel/.style = {shape=rectangle, rounded corners, + draw,align=center, + top color=white, + text width = 2.2cm, + inner sep=2ex, + anchor=center}, + input/.style = {align=center, text width=4.5cm}, + decision/.style = {treenode, diamond, inner sep=2pt, + text width=2cm}, + decisionx/.style = {treenode, diamond, inner sep=2pt, + text width=1.5cm}, + decisiond/.style = {treenode, diamond, dashed, inner sep=3pt, + text width=2cm}, + treeuc/.style = {treenode,draw=blue,thick}, + treec/.style = {treenode,draw=red,thick}, + treeg/.style = {treenode,draw=green!60,thick,fill=green!5}, + action/.style = {treenode, circle, inner sep=1pt, + text width = 3cm}, + root/.style = {treenode}, + env/.style = {treenode}, + ginish/.style = {root}, + dummy/.style = {circle,draw}, + ar/.style={->,>=latex} +} 
+\tikzset{connect/.style={rounded corners=#1, + to path= ($(\tikztostart)!-#1!(\tikztotarget)!-#1!-90:(\tikztotarget)$) -- ($(\tikztotarget)!-#1!(\tikztostart)!-#1!90:(\tikztostart)$) -- + ($(\tikztotarget)!-#1!(\tikztostart)!#1!90:(\tikztostart)$) -- ($(\tikztostart)!-#1!(\tikztotarget)!-#1!90:(\tikztotarget)$) -- cycle (\tikztotarget) +}} +\tikzset{connect/.default=4mm} + +\begin{figure}[ht!] +\centering +\hspace*{-2cm} +\begin{tikzpicture}[-latex][scale=0.3] + \matrix (chart)[ampersand replacement=\#] + [ + matrix of nodes, + column sep = 1.7em, + row sep = 1.5ex, + row 2/.style = {nodes={decision}}, + row 5/.style = {nodes={env}} + ] + { %first row + \# + |[input]| {\small \texttt{case}, \texttt{fix.vecm},\\ + \texttt{info.vecm}, \texttt{maxlag}}\# + |[input]| {\small \texttt{data},\\ + \texttt{xvar}, \texttt{yvar}}\# + |[input]|{\small{\texttt{case}, \texttt{fix.ardl},\\ + \texttt{info.ardl}, \texttt{maxlag}}} + \# + \\ + %second row + |[decision]|{\small + VECM\\Estimate} + \normalsize\# + |[treenode]| {\small + \centering + \begin{tabular}{c|c} + \multicolumn{2}{c}{VECM estimation (either)}\\ + Fixed order & \texttt{VARselect()}\\\hline + \texttt{fix.vecm} & \texttt{info.vecm} \\ + &\texttt{maxlag}\\ + \end{tabular}} + \normalsize\# + |[decisiond]|{\small + Compute $F_{ind}$ of \\UC ARDL + \normalsize} + \# + |[treenode]| {\small + \begin{tabular}{c|c} + \multicolumn{2}{c}{ARDL estimation (either)}\\ + Fixed order & \texttt{auto\_ardl()}\\\hline + \texttt{fix.ardl} & \texttt{info.ardl}\\ + &\texttt{maxlag}\\ + \end{tabular}} + \# + |[decision]|{\small + C ARDL\\Estimate} + \normalsize + \\ + %third row + |[decision]|{\small + Johansen test\\ + results on $\mathbf x_t$} + \normalsize\# + |[treeuc]| {\scriptsize + Estimation of the parameters:\\ + $\mathbf A$, $\boldsymbol\Gamma_j$ $(j=1,\dots,p)$\\ + $\boldsymbol\alpha_0$, $\boldsymbol\alpha_1$ based on \texttt{case}. 
\\ $\widehat{\boldsymbol\varepsilon}_{xt}$ obtained via \eqref{eq:resvecm}.\\ + Significant estimates of $\boldsymbol\Gamma_j$ filtered via \texttt{a.vecm}} + \normalsize\# + |[treenode]|{\scriptsize + Combine to get \\ + $\widetilde{\mathbf A} = + \begin{bmatrix} + \color{red}{a_{yy}} & \color{red} \widetilde{\mathbf{a}}'_{y.x}\\ + {\mathbf 0} & \color{blue}{\mathbf{A}}_{xx} + \end{bmatrix}$\\ + $\widetilde{\boldsymbol\Gamma}_j =\begin{bmatrix}\color{red}\boldsymbol{\gamma}_{y.x,j}\\ + \color{blue}\boldsymbol\Gamma_{(x),j} + \end{bmatrix}$\\ + \phantom{\tiny x}\\ + $\boldsymbol{\omega}$ (only in the C ARDL)\\ + $(\boldsymbol\alpha_{0}^{c})' = [\color{red}{\alpha_{0.y}}\;\color{blue}{\boldsymbol\alpha_{0x}'}]$, $(\boldsymbol\alpha_{1}^{c})' = [\color{red}{\alpha_{1.y}}\;\color{blue}{\boldsymbol\alpha_{1x}'}]$ + \normalsize} + \# + |[treec]| {\scriptsize + Estimation of:\\ + $ a_{yy}, \mathbf{a}_{y.x}$, $\boldsymbol\gamma_{y.x,j}$ $(j=1,\dots,p)$\\ + $\boldsymbol\omega$ (only in the C ARDL )\\ + $\alpha_{0.y}$,$\alpha_{1.y}$ based on \texttt{case}.\\ + Significant estimates of $\boldsymbol\gamma_{y.x,j}$ filtered via \texttt{a.ardl}} + \normalsize + \# + |[decisionx]|{\scriptsize + PSS/SMG results + in the C ARDL. 
+ Compute\\ $F_{ov}$, $t$, $F_{ind}$} + \normalsize + \\ + \# + |[treenode]|{\scriptsize + Null elements of $\widetilde{\mathbf A}$ based on $H_0$.\\ + Nullity of $\boldsymbol\alpha_0^c$ and $\boldsymbol\alpha_1^c$ based on \texttt{case}.\\ + Combine the residuals \\ + $\widehat{\mathbf u}_t = [\color{red}\widehat{\nu}_{yt}^{*}\,\color{blue}\widehat{\boldsymbol\varepsilon}_{xt}]$} + \# + |[treec]| + { $F_{ov}$ test \\ + \small$H_0: a_{yy}=0,\; \widetilde{\mathbf{a}}_{y.x}=\mathbf 0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{F_{ov}}$ via \eqref{eq:resfov} + \normalsize} + \# + |[treec]|{\small $t$-test \\$H_0: a_{yy}=0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{t}$ via \eqref{eq:rest} + \normalsize} + \# + |[treec]|{\small $F_{ind}$ test \\$H_0: \widetilde{\mathbf{a}}_{y.x}=\mathbf 0$:\\ + Re-estimate ARDL, obtain\\ + $\widehat{\nu}_{yt}^{F_{ind}}$ via \eqref{eq:resfind} + \normalsize} + \\ + |[treenodel]| {\small Sample and \\center + from $\widehat{\mathbf U}$.\\ + Get ${\mathbf U^{(b)}}$. + \normalsize} + \# + |[treenode]|{\small $\Delta y_t^{(b)}$, $\Delta\boldsymbol x_t^{(b)}$ via (\ref{eq:resfov}-\ref{eq:rest}-\ref{eq:resfind}-\ref{eq:resvecm})\\ + $\mathbf{x}_t^{(b)}=\Delta\boldsymbol x_t^{(b)} + \mathbf{x}_{t-1}^{(b)}$. 
+ \\ + ${y}_t^{(b)}=\Delta y_t^{(b)} +y_{t-1}^{(b)}$} + \# + |[treenode]|{\small ARDL estimation under $H_0$.\\ + Get $F_{ov}^{(b),H_0}$, $t^{(b),H_0}$,\\ $F_{ind}^{(b),H_0}$ (C) and $F_{ind}^{(b),H_0}$ (UC)} + \# + |[decisionx]|{\small $c_{\alpha,T}^*$ at level \texttt{a.boot.H0}.} + \# + |[treeg]|{\small Decide comparing \\ $F_{ov}$, $t$, $F_{ind}$ each to its $c_{\alpha,T}^{*}$.\\ + \textbf{IF} $F_{ind}>c_{\alpha,F_{ind}}^{*}$ (C)\\ + \textbf{AND} $F_{ind} (chart-2-2); + \draw[thick] + (chart-1-3) -> (chart-2-4); + \draw[thick] + (chart-1-2) -> (chart-2-2); + \draw[thick] + (chart-1-4) -> (chart-2-4); + \draw[thick] + (chart-2-2) -> (chart-2-1); + \draw[thick] + (chart-2-4) -> (chart-2-5); + \draw[thick] + (chart-2-2.south west) -> (chart-3-1); + \draw[thick] + (chart-2-2) -> (chart-3-2); + \draw[thick] + (chart-2-4.south east) to node[right=0.4cm,pos=-0.1,rotate=-39]{\small Based on \texttt{case}} (chart-3-5); + \draw[ar,thick] + ([xshift=-1 cm]chart-2-4.south) to node[left=0.1cm] {\small UC} ([xshift=-1 cm] chart-3-4.north); + \draw[ar,thick] ([xshift=1 cm]chart-2-4.south) to node[right=0.1cm] {\small C} ([xshift=1cm] chart-3-4.north); + \draw[thick] (chart-2-4) edge [in=70, out=40,looseness=2,left=2cm] node[pos=0.3,right=0.1cm]{\small UC and C model\normalsize} (chart-2-4); + \draw[thick] + (chart-2-4) -> (chart-2-3); + \draw[blue,ultra thick] (chart-3-2) -> (chart-3-3); + \draw[red,ultra thick] (chart-3-4) -> (chart-3-3); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-3.north east); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-4); + \draw[red,ultra thick] (chart-3-4) -> (chart-4-5.north west); + \draw[blue,ultra thick] (chart-3-2) -> (chart-4-2); + \draw[thick] (chart-3-3) -> (chart-4-2); + \draw[red,ultra thick, shorten < = 0.15cm] (chart-4-3) -> (chart-4-2); + + \begin{scope}[transform canvas={yshift=0em,xshift=3.4em}] + \draw[red,dashed,ultra thick] (chart-4-3.west) to[connect=13mm,rounded corners=1mm] (chart-4-5); + \end{scope} +\draw[thick] 
(chart-4-2.west) -> (chart-5-1.north); +\draw[thick] (chart-5-1) -> (chart-5-2); +\draw[thick] (chart-5-2) -> (chart-5-3); +\draw[thick] (chart-5-3) -> (chart-5-4); +\draw[ar,thick] (chart-5-3.south west) to [bend left=30] node[right=0.4cm,pos=0.2]{\small $b=1,\dots,B$}(chart-5-1.south east); + \end{tikzpicture} + \caption{Flowchart of the \texttt{boot\_ardl} function inner steps. Boxes denote parameter definitions and transformations. Diamonds denote function outputs. Dashed diamonds denote intermediate output (not shown after function call). Empty nodes denote function inputs. The first $p+1$ rows of $\mathbf z_t^{(b)}$ are set equal to the first $p+1$ rows of the original data. The best lag order for each difference variable in the ARDL model is determined via \texttt{auto\_ardl()}. It is reported as a unique value $p$ in $\boldsymbol{\gamma}_{y.x,j}$ for brevity in the flowchart.}\label{fig:flowchart_ardl} + \end{figure} +\end{landscape} +\subsection{Execution time and technical remarks} +In order to investigate the sensitivity of the procedure to different sample sizes and number of bootstrap replicates, an experiment has been run using a three-dimensional time series of length $T=\{50,80,100,200,500\}$, generating 100 datasets for each sample size with the \code{sim\_vecm\_ardl} function (Case II, with cointegrated variables, and 2 lags in the short-run section of the model).\\ Then, the \code{boot\_ardl} function has been called + +\begin{example} +boot_ardl(data = df_sim, + nboot = bootr, + case = 2, + fix.ardl = rep(2, 3), + fix.vecm = 2) +\end{example} + +\noindent In the code above, \code{bootr} has been set equal to $B=\{200,500,1000,2000\}$, the number of lags has been assumed known (\code{fix.ardl} and \code{fix.vecm}), while default values have been used for every other argument (such as \code{a.ardl}, \code{a.vecm} and \code{a.boot.H0}).\\ +Table \ref{tab:exec} shows the average running time per replication together with the coefficient of variation 
(\%) of the bootstrap critical values of the $F_{ov}$ test, for each value of $T$ and $B$, across 100 replications for each scenario.\\ +Naturally, the running time increases as both sample size and bootstrap replicates increase. However, it can be noticed how the coefficients of variation tend to stabilize for $B \geq 1000$, especially for $T>80$, at the 5\% significance level. Therefore, it is recommended a number of bootstrap replicates of at least $B=1000$ for higher sample size, or at least $B=2000$ for smaller samples. The analysis has been carried out using an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM. + +\begin{table}[htbp] + \centering + \begin{tabular}{cccccc} + \multicolumn{1}{c}{$T$} & \multicolumn{1}{l}{$B$} & \multicolumn{1}{c}{Exec. Time (sec)} & \multicolumn{1}{c}{$cv^{(F_{ov})}(5\%)$} & \multicolumn{1}{c}{$cv^{(F_{ov})}(2.5\%)$} & \multicolumn{1}{c}{$cv^{(F_{ov})}(1\%)$} \\ + \midrule + 50 & 200 & 23.38 & 8.648 & 10.925 & 13.392 \\ + 50 & 500 & 48.37 & 6.312 & 6.952 & 8.640 \\ + 50 & 1000 & 96.65 & 4.806 & 5.613 & 6.288 \\ + 50 & 2000 & 231.15 & 4.255 & 4.226 & 4.946 \\ + \midrule + 80 & 200 & 23.46 & 7.251 & 8.936 & 11.263 \\ + 80 & 500 & 50.19 & 4.998 & 6.220 & 7.946 \\ + 80 & 1000 & 143.00 & 3.882 & 4.453 & 5.305 \\ + 80 & 2000 & 255.64 & 2.912 & 3.623 & 4.518 \\ + \midrule + 100 & 200 & 37.89 & 7.707 & 8.583 & 10.955 \\ + 100 & 500 & 52.86 & 4.691 & 5.304 & 7.557 \\ + 100 & 1000 & 184.51 & 3.512 & 4.567 & 5.695 \\ + 100 & 2000 & 212.65 & 3.519 & 3.674 & 4.185 \\ + \midrule + 200 & 200 & 35.46 & 6.644 & 7.173 & 10.365 \\ + 200 & 500 & 76.78 & 4.734 & 5.355 & 6.225 \\ + 200 & 1000 & 148.25 & 3.124 & 4.177 & 5.034 \\ + 200 & 2000 & 484.51 & 2.811 & 3.361 & 3.907 \\ + \midrule + 500 & 200 & 54.47 & 6.641 & 8.694 & 10.414 \\ + 500 & 500 & 133.17 & 5.137 & 5.816 & 6.408 \\ + 500 & 1000 & 271.87 & 3.905 & 4.585 & 5.283 \\ + 500 & 2000 & 561.71 & 3.221 & 3.490 & 4.145 \\ + \bottomrule + \end{tabular}% + \caption{Average execution 
times (in seconds) of the \code{boot\_ardl} function, for different combinations of sample size $T$ and bootstrap replicates $B$. Coefficients of variation ($cv$) reported for the $F_{ov}$ bootstrap critical values at level 5\%, 2.5\% and 1\%.} + \label{tab:exec}% +\end{table}% + +\section{Empirical applications}\label{sec:app} +This section provides two illustrative applications which highlight the performance of the bootstrap ARDL tests. +\subsection{An application to the German macroeconomic dataset} +In the first example, the occurrence of a long-run relationship between consumption [C], income [INC], and investment [INV] of Germany has been investigated via a set of ARDL models, where each variable takes in turn the role of the dependent one, while the remaining are employed as independent. + The models have been estimated by employing the dataset of \citet{lutkepohl2005} which includes quarterly data of the series over the years 1960 to 1982. + The data have been employed in logarithmic form. Figure \ref{fig:plotemp} displays these series over the sample period.\\ + Before applying the bootstrap procedure, the order of integration of each series has been analyzed. Table \ref{tab:adf} shows the results of the ADF test performed on both the series and their first-differences ($k=3$ maximum lags). The results confirm the applicability of the ARDL framework as no series is integrated of order higher than one.\\ + The following ARDL equations have been estimated: +\begin{enumerate}[I] + \item First ARDL equation (C | INC, INV): + \begin{align} + \Delta \log \text{C}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{C}_{t-1} - {a}_{y.x_1}\log \text{INC}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{INC}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. 
+ \end{align} + + \item Second ARDL equation (INC | C, INV): + \begin{align} + \Delta \log \text{INC}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INC}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INV}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INC}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INV}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INV}_{t}+\nu_{t}. + \end{align} + + \item Third ARDL equation (INV | C, INC): + \begin{align} + \Delta \log \text{INV}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{INV}_{t-1} - {a}_{y.x_1}\log \text{C}_{t-1} - {a}_{y.x_2}\log \text{INC}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{INV}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{C}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{INC}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{C}_{t}+ + \omega_2 \Delta\log \text{INC}_{t}+\nu_{t}. + \end{align} + +\end{enumerate} + +\noindent Table \ref{tab:est} shows the estimation results for each ARDL and VECM model. It is worth noting that the instantaneous difference of the independent variables are highly significant in each conditional ARDL model. +Thus, neglecting these variables in the ARDL equation, as happens in the unconditional version of the model, may potentially lead to biased estimates and incorrect inference. +For the sake of completeness, also the results of the marginal VECM estimation are reported for each model.\\ +The code to prepare the data, available in the package as the \code{ger\_macro} dataset, is: +\begin{example} + data("ger_macro") + LNDATA = apply(ger_macro[,-1], 2, log) + col_ln = paste0("LN", colnames(ger_macro)[-1]) + LNDATA = as.data.frame(LNDATA) + colnames(LNDATA) = col_ln +\end{example} + +\noindent Then, the \code{boot\_ardl} function is called, to perform the bootstrap tests. 
In the code chunk below, Model I is considered. + +\begin{example} + set.seed(999) + BCT_res_CONS = boot_ardl(data = LNDATA, + yvar = "LNCONS", + xvar = c("LNINCOME", "LNINVEST"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) +\end{example} +followed by the call to the \code{summary} function +\begin{example} + summary(BCT_res_CONS, out = "ARDL") + summary(BCT_res_CONS, out = "VECM") + summary(BCT_res_CONS, out = "cointVECM") + summary(BCT_res_CONS, out = "cointARDL") +\end{example} +The first summary line displays the output in the ARDL column of Table \ref{tab:est} and the second column of Table \ref{tab:cointbig}, Model I. The second line corresponds to the VECM columns of Table \ref{tab:est}, Model I - only for the independent variables. The information on the rank of the $\mathbf A_{xx}$ in Table \ref{tab:est} is inferred from the third line. Finally, the fourth summary line corresponds to the test results in Table \ref{tab:cointbig}, Model I. +A textual indication of the presence of spurious cointegration is displayed at the bottom of the \code{"cointARDL"} summary, if detected.\\ +In this example, the bootstrap and bound testing procedures are in agreement only for model I, indicating the existence of a cointegrating relationship. Additionally, no spurious cointegration is detected for this model. As for models II and III, the null hypothesis is not rejected by the bootstrap tests, while the PSS and SMG bound tests fail to give a conclusive answer in the $F_{ind}$ test.\\ +The running time of the entire analysis is roughly 11 minutes, using an Intel(R) Core(TM) i7-1165G7 CPU @ 2.80GHz processor, 16GB of RAM. 
+ +\begin{center} +\begin{table}[htbp] +\centering + \resizebox{0.65\textwidth}{!}{ + \begin{tabular}{crrrrr} + & & \multicolumn{2}{c}{level variable} & \multicolumn{2}{c}{first difference} \\ +\cmidrule{3-6} Series & \multicolumn{1}{c}{lag} & \multicolumn{1}{c}{ADF} & \multicolumn{1}{c}{p.value} & \multicolumn{1}{c}{ADF} & \multicolumn{1}{c}{p-value} \\ + \midrule + \multirow{4}{*}{$\log\text{C}_t$} + & 0 & -1.690 & 0.450 & -9.750 & $< 0.01$ \\ + & 1 & -1.860 & 0.385 & -5.190 & $< 0.01$ \\ + & 2 & -1.420 & 0.549 & -3.130 & 0.030 \\ + & 3 & -1.010 & 0.691 & -2.720 & 0.080 \\ + \midrule + \multirow{4}{*}{$\log\text{INC}_t$} + & 0 & -2.290 & 0.217 & -11.140 & $<0.01$ \\ + & 1 & -1.960 & 0.345 & -7.510 & $< 0.01$ \\ + & 2 & -1.490 & 0.524 & -5.120 & $< 0.01$ \\ + & 3 & -1.310 & 0.587 & -3.290 & 0.020 \\ + \midrule + \multirow{4}{*}{$\log\text{INV}_t$} + & 0 & -1.200 & 0.625 & -8.390 & $< 0.01$ \\ + & 1 & -1.370 & 0.565 & -5.570 & $< 0.01$ \\ + & 2 & -1.360 & 0.570 & -3.300 & 0.020 \\ + & 3 & -1.220 & 0.619 & -3.100 & 0.032 \\ + \bottomrule + \end{tabular} + } + \caption{ADF preliminary test (null hypothesis: random walk with drift).} + \label{tab:adf} +\end{table}% +\end{center} +\begin{center} +\begin{figure}[htbp!] + \centering + \includegraphics[scale=0.8]{figures/tsgraph.pdf} + \caption{log-consumption/investment/income graphs (level variables and first differences). Made with \CRANpkg{ggplot}.} + \label{fig:plotemp} + +\end{figure} +\end{center} + +\begin{landscape} +\begin{table}[ht!] 
+\resizebox{1.6\textwidth}{!}{ +\begin{tabular}{c lll lll lll} + +& +\multicolumn{3}{c}{Model I}& +\multicolumn{3}{c}{Model II}& +\multicolumn{3}{c}{Model III}\\ + +\cmidrule{2-10} +& \multicolumn{1}{c}{ARDL} & \multicolumn{2}{c}{VECM} +& \multicolumn{1}{c}{ARDL} & \multicolumn{2}{c}{VECM} +& \multicolumn{1}{c}{ARDL} & \multicolumn{2}{c}{VECM}\\ + +& \multicolumn{1}{c}{$\Delta\log\text{C}_t$} & \multicolumn{1}{c}{$\Delta\log\text{INV}_t$} & \multicolumn{1}{c}{$\Delta\log\text{INC}_t$} +& \multicolumn{1}{c}{$\Delta\log\text{INC}_t$} & \multicolumn{1}{c}{$\Delta\log\text{C}_t$} & \multicolumn{1}{c}{$\Delta\log\text{INV}_t$} +& \multicolumn{1}{c}{$\Delta\log\text{INV}_t$} & \multicolumn{1}{c}{$\Delta\log\text{C}_t$} & \multicolumn{1}{c}{$\Delta\log\text{INC}_t$}\\ + + \midrule + + $\log\text{C}_{t-1}$ & + \makecell{-0.307 ***\\ (0.055)} & & & + \makecell{0.168 *\\ (0.081)} & \makecell{-0.0011\\ (0.0126)}& \makecell{0.1286*\\ (0.0540)}& + \makecell{0.611 . \\ (0.339)}& + \makecell{-0.2727***\\ (0.0704)} & \makecell{-0.0508\\ (0.0796)} \\ + + $\log\text{INC}_{t-1}$ & + \makecell{0.297 ***\\ (0.055)} & \makecell{0.124 *\\ (0.054)} & \makecell{-0.017\\ (0.014)} & + \makecell{-0.183*\\ (0.079)} & & & + \makecell{-0.491\\ (0.340)} & + \makecell{ 0.2619***\\ (0.0681)}&\makecell{ 0.0464\\ (0.0772)}\\ + + $\log\text{INV}_{t-1}$ & + \makecell{-0.001\\ (0.011)} & \makecell{-0.152 *\\ (0.063)} & \makecell{0.016\\ (0.017)} & + \makecell{0.0209\\ (0.0135)} & \makecell{-0.00107\\ (0.0142)} & \makecell{-0.1531*\\ (0.0607)} & + \makecell{-0.1212*\\ (0.060)} & & \\ + + \midrule + + $\Delta\log\text{C}_{t-1}$ & + \makecell{-0.248 **\\ (0.079)} & \makecell{0.899 *\\ (0.442)} & \makecell{0.211 .\\ (0.113)}& + \makecell{0.375***\\ (0.1086)} & &\makecell{0.9288*\\ (0.442)} & + \makecell{1.113 *\\ (0.441)}& + & \makecell{0.2072 . \\ (0.1142)}\\ + + $\Delta\log\text{C}_{t-2}$ + & & \makecell{0.744 \\ (0.431)} & & & & \makecell{0.8049 . 
\\ (0.4345)}& & & \\ + + $\Delta\log\text{INC}_{t-1}$ & + & & & \makecell{-0.1404\\ (0.1095)} & & & + & & \\ + + $\Delta\log\text{INC}_{t-2}$ & + & & + & &\makecell{0.2675**\\ (0.0958)} & + & &\makecell{0.1522.\\ (0.0912)} & \\ + + $\Delta\log\text{INV}_{t-1}$ & + & \makecell{-0.18\\ (0.111)} & \makecell{0.035\\ (0.029)} & + & & \makecell{-0.189 . \\ (0.1097)} & + \makecell{-0.175\\ (0.1075)} & + & \makecell{0.0479 . \\ (0.0282)} \\ + + $\Delta\log\text{INV}_{t-2}$ & + & & + \makecell{0.049 .\\ (0.027) }& & \makecell{0.0591*\\ (0.0245) }& + & &\makecell{0.0578*\\ (0.0223) } & \makecell{0.0562*\\ (0.0266)} \\ + + \midrule + + $\Delta\log\text{C}_t$ & + & & & + \makecell{0.7070***\\ (0.1093)} & & & + \makecell{1.8540***\\ (0.5425)} & & \\ + + $\Delta\log\text{INC}_t$ & + \makecell{0.471***\\ (0.074)} & & & + & & & + \makecell{-0.445***\\ (0.4726)} & & \\ + + $\Delta\log\text{INV}_t$ & + \makecell{0.065**\\ (0.019)} & & & + \makecell{-0.0230\\ (0.025)} & & + & & & \\ + + const. & + \makecell{0.048 ***\\ (0.013) } & + \makecell{0.036\\ (0.066)} & \makecell{0.033 *\\ (0.017)} & + \makecell{0.002 \\ (0.018)} & + \makecell{0.0266 . \\ (0.0155) } & \makecell{0.023 \\ (0.0666) } & + \makecell{-0.056 \\ (0.072) } & + \makecell{0.0517**\\ (0.0157)}& \makecell{0.0378*\\ (0.0177)}\\ + \hline + J-test&&\multicolumn{2}{c}{\rule{0pt}{1em}$rk(\mathbf{A_{xx}})=2$}&&\multicolumn{2}{c}{$rk(\mathbf{A_{xx}})=2$}&&\multicolumn{2}{c}{$rk(\mathbf{A_{xx}})=2$}\\ + \bottomrule + \end{tabular}% + } + \caption{Conditional ARDL and VECM results for the consumption/income/investment dataset, along with rank of the $\mathbf A_{xx}$ matrix via the Johansen (J) test.\\ + Significance codes: (***) 1\%; (**) 5\%; (.) 
10\%.} + \label{tab:est}% +\end{table}% +\end{landscape} + +% Table generated by Excel2LaTeX from sheet 'Foglio1' +\begin{table}[htbp] + \centering + \resizebox{\textwidth}{!}{ + \begin{tabular}{ccccccccc} + & & & & \multicolumn{2}{c}{PSS / SMG Threshold} & & \multicolumn{2}{c}{Outcome} \\ + \midrule + \multicolumn{1}{c}{Model} & \multicolumn{1}{c}{Lags} & Test & \multicolumn{1}{c}{Boot. Critical Values} & \multicolumn{1}{c}{I(0) 5\%} & \multicolumn{1}{c}{I(1) 5\%} & \multicolumn{1}{c}{Statistic} & \multicolumn{1}{c}{Boot} & \multicolumn{1}{c}{Bound} \\ + \midrule + + \multirow{3}{*}{I} & \multirow{3}{*}{(1,0,0)} & $F_{ov}$ & + 3.79 & 3.79 & 4.85 & 10.75 + & \multirow{3}{*}{Y} & \multirow{3}{*}{Y} \\ + & & $t$ & + -2.88& -2.86 & -3.53 & -5.608 & & \\ & & $F_{ind}$ & + 4.92 & 3.01& 5.42 & 15.636 & & \\ + + \midrule + + \multirow{3}{*}{II} & \multirow{3}{*}{(1,1,0)} & $F_{ov}$ & + 5.79 & 3.79 & 4.85 & 2.867 & + \multirow{3}{*}{N} & \multirow{3}{*}{U} \\ + & & $t$ & + -3.69 & -2.86 & -3.53 & -2.315 & & \\& & $F_{ind}$ & + 7.38 & 3.01 & 5.42 & 3.308 & & \\ + + \midrule + + \multirow{3}{*}{III} & \multirow{3}{*}{(1,1,0)} & $F_{ov}$ & + 5.50 & 3.79 & 4.85 & 3.013& + \multirow{3}{*}{N} & \multirow{3}{*}{U} \\ + & & $t$ & + -3.32 & -2.86 &-3.53 &-2.020 & & \\& & $F_{ind}$ & + 6.63 & 3.01&5.42 & 4.189 & & \\ + \bottomrule + \end{tabular}% + } + \caption{ + Cointegration analysis for the three ARDL equations in the German macroeconomic data. The optimal number of ARDL lags in the short-run - in the form $(y,x_1,x_2)$, matching the model definition - bootstrap critical values, bound test thresholds and test statistics for each test are shown (case III).\\ The outcome columns draw conclusions on each type of model (bootstrap or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of type 1, D2 = degenerate of type 2, U = inconclusive inference. 
+ \label{tab:cointbig}} +\end{table}% +\subsection{An application on Italian Macroeconomic Data} +Following \citet{bertelli2022bootstrap}, the relationship between foreign direct investment [FDI], exports [EXP], and gross domestic product [GDP] in Italy is investigated. +The data of these three yearly variables have been retrieved from the World Bank Database and cover the period from 1970 to 2020. In the analysis, the log of the variables has been used and [EXP] and [FDI] have been adjusted using the GDP deflator. Figure \ref{fig:plotemp2} displays these series over the sample period. + +\begin{center} +\begin{figure}[htbp!] + \centering + \includegraphics[scale=0.7]{figures/tsgraph2.pdf} + \caption{log-GDP/export/investment graphs (level variables and first differences). Made with \CRANpkg{ggplot}.} + \label{fig:plotemp2} +\end{figure} +\end{center} + +\noindent Table \ref{tab:gdp1} shows the outcomes of the ADF test performed on each variable, which ensures that the integration order is not higher than one for all variables. Table \ref{tab:cointbig2} shows the results of bound and bootstrap tests performed in ARDL model by taking each variable, in turn, as the dependent one. +The following ARDL equations have been estimated: +\begin{enumerate}[I] + \item First ARDL equation (GDP | EXP, FDI): + \begin{align} + \Delta \log \text{GDP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{GDP}_{t-1} - {a}_{y.x_1}\log \text{EXP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{EXP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t} + \end{align}. + For this model, a degenerate case of the first type can be observed, while the simpler bound testing procedure does not signal cointegration. 
+ \item Second ARDL equation (EXP | GDP, FDI): + \begin{align} + \Delta \log \text{EXP}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{EXP}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{FDI}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{EXP}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{FDI}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{GDP}_{t}+ + \omega_2 \Delta\log \text{FDI}_{t}+\nu_{t}. + \end{align} + For this model, the ARDL bootstrap test indicates absence of cointegration, while the bound testing approach is inconclusive for the $F_{ind}$ test. + \item Third ARDL equation (FDI | GDP, EXP): + \begin{align} + \Delta \log \text{FDI}_{t}&=\alpha_{0.y} - + a_{yy} \log \text{FDI}_{t-1} - {a}_{y.x_1}\log \text{GDP}_{t-1} - {a}_{y.x_2}\log \text{EXP}_{t-1} +\\\nonumber + &\sum_{j=1}^{p-1}\gamma_{y.j} \Delta\log \text{FDI}_{t-j} + + \sum_{j=1}^{s-1}\gamma_{x_1.j} \Delta\log \text{GDP}_{t-j} + + \sum_{j=1}^{r-1}\gamma_{x_2.j} \Delta\log \text{EXP}_{t-j} +\\\nonumber + &\omega_1 \Delta\log \text{GDP}_{t}+ + \omega_2 \Delta\log \text{EXP}_{t}+\nu_{t}. + \end{align} + For this model, the long-run cointegrating relationship is confirmed using both bootstrap and bound testing. No spurious cointegration is detected. +\end{enumerate} +The code to load the data and perform the analysis (e.g. for Model I) is: +\begin{example} + data("ita_macro") + BCT_res_GDP = boot_ardl(data = ita_macro, + yvar = "LGDP", + xvar = c("LEXP", "LFI"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) +\end{example} +For the sake of simplicity, the conditional ARDL and VECM marginal model outputs included in each cointegrating analysis are omitted. 
The summary for the cointegration tests for Model I is called via +\begin{example} + summary(BCT_res_GDP, out = "ARDL") # extract lags + summary(BCT_res_GDP, out ="cointARDL") # ARDL cointegration +\end{example} +This empirical application further highlights the importance of dealing with inconclusive inference via the bootstrap procedure, while naturally including the effect of conditioning in the ARDL model, as highlighted in \citet{bertelli2022bootstrap}. +\begin{table}[htbp] + \centering + \resizebox{\textwidth}{!}{ + \begin{tabular}{lrrrrrrrrrrrr} + & \multicolumn{4}{c}{No Drift, No Trend} & \multicolumn{4}{c}{Drift, No Trend} & \multicolumn{4}{c}{Drift and Trend} \\ +\cmidrule{2-13} Variable & \multicolumn{1}{l}{Lag = 0} & \multicolumn{1}{l}{Lag = 1} & \multicolumn{1}{l}{Lag = 2} & \multicolumn{1}{l}{Lag = 3} & \multicolumn{1}{l}{Lag = 0} & \multicolumn{1}{l}{Lag = 1} & \multicolumn{1}{l}{Lag = 2} & \multicolumn{1}{l}{Lag = 3} & \multicolumn{1}{l}{Lag = 0} & \multicolumn{1}{l}{Lag = 1} & \multicolumn{1}{l}{Lag = 2} & \multicolumn{1}{l}{Lag = 3} \\ + \midrule$\log \text{GDP}_t$ & 0.99 & 0.974 & 0.941 & 0.796 & $<0.01$ & $<0.01$ & $<0.01$ & 0.084 & 0.99 & 0.99 & 0.99 & 0.99 \\ + $\log \text{FDI}_t$ & 0.572 & 0.599 & 0.675 & 0.725 & $<0.01$ & 0.0759 & 0.3199 & 0.5174 & $<0.01$ & 0.013 & 0.151 & 0.46 \\ + $\log \text{EXP}_t$ & 0.787 & 0.71 & 0.698 & 0.684 & 0.479 & 0.288 & 0.467 & 0.433 & 0.629 & 0.35 & 0.463 & 0.379 \\ +\midrule $\Delta\log \text{GDP}_t$ & $<0.01$ & $<0.01$64 & 0.0429 & 0.0402 & $<0.01$ & 0.0861 & 0.3989 & 0.4267 & $<0.01$ & $<0.01$ & 0.0166 & 0.017 \\ + $\Delta\log \text{FDI}_t$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ \\ + $\Delta\log \text{EXP}_t$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & $<0.01$ & 0.0336 & 0.0315 \\ + \bottomrule + \end{tabular}% + } + \caption{ADF preliminary test for the second example.} + 
\label{tab:gdp1}% +\end{table}% +\begin{table}[htbp] + \centering + \resizebox{\textwidth}{!}{ + \begin{tabular}{ccc ccc ccc} + & & & & \multicolumn{2}{c}{PSS / SMG Threshold} & & \multicolumn{2}{c}{Outcome} \\ + \midrule + \multicolumn{1}{c}{Model} & \multicolumn{1}{c}{Lags} & Test &\multicolumn{1}{c}{Boot. Critical Values} & \multicolumn{1}{c}{I(0) 5\%} & \multicolumn{1}{c}{I(1) 5\%} & Statistic & \multicolumn{1}{c}{Boot} & \multicolumn{1}{c}{Bound} \\ + \midrule +\multirow{3}{*}{I} & \multirow{3}{*}{(1,1,0)} +& $F_{ov}$ & 3.730 & 4.070 & 5.190 & 9.758 & \multirow{3}{*}{D1} & \multirow{3}{*}{N}\\ + & +& $t$ & -2.020 & -2.860 & -3.530 & -2.338 & & \\ + & +& $F_{ind}$ & 3.710 & 3.220 & 5.620 & 2.273 & & \\ +\midrule +\multirow{3}{*}{II} & \multirow{3}{*}{(1,0,0)} +& $F_{ov}$ & 5.400 & 4.070 & 5.190 & 2.649 & \multirow{3}{*}{N} & \multirow{3}{*}{U} \\ & +& $t$ & -3.380 & -2.860 & -3.530 & -1.889 & & \\ & & + $F_{ind}$ & 5.630 & 3.220 & 5.620 & 3.481 & & \\ + +\midrule +\multirow{3}{*}{III} + & \multirow{3}{*}{(1,0,0)} +& $F_{ov}$ & 5.360 & 4.070 & 5.190 & 6.716 & \multirow{3}{*}{Y} & \multirow{3}{*}{Y} \\ + & +& $t$ & -3.550 & -2.860 & -3.530 & -4.202 & & \\& +& $F_{ind}$ & 6.500 & 3.220 & 5.620 & 7.017 & & \\ + + \bottomrule + \end{tabular}% + } + \caption{Cointegration analysis for the three ARDL equations in the Italian macroeconomic data. 
The optimal number of ARDL lags in the short-run - in the form $(y,x_1,x_2)$, matching the model definition - bootstrap critical values, bound test thresholds and test statistics for each test are shown (case III).\\ The outcome columns draw conclusions on each type of model (bootstrap or bound): Y = cointegrated, N = not cointegrated, D1 = degenerate of type 1, D2 = degenerate of type 2, U = inconclusive inference.} + \label{tab:cointbig2}% +\end{table}% +\section{Conclusion}\label{sec:end} +The \CRANpkg{bootCT} package allows the user to perform bootstrap cointegration tests in ARDL models by overcoming the problem of inconclusive inference which is a well-known drawback of standard bound tests. +The package makes use of different functions. +The function \code{boot\_ardl} performs the bootstrap tests, and it acts as a wrapper of both the bootstrap and the standard bound tests, including also the Johansen test on the independent variables of the model. Finally, it also performs the bound $F$-test on the lagged independent variables, so far not available in other extant \code{R} packages. +The function \code{sim\_vecm\_ardl}, which allows the simulation of multivariate time series data following a user-defined DGP, enriches the available procedures for multivariate data generation, while the function \code{lag\_mts} provides a supporting tool in building datasets of lagged variables for any practical purpose. Finally, the use of Rcpp functions gives a technical advantage in terms of computational speed, performing the bootstrap analysis within an acceptable time frame. 
+ +\newpage +\section{Appendix}\label{sec:appendix} +\subsection{Section A - the methodological framework of (conditional) VECM and ARDL models} \label{sec:appendixa} +Expanding the matrix polynomial $\mathbf{A}(z)$ about $z=1$ yields +\begin{equation}\label{eq:polyamat} +\mathbf{A}(z)=\mathbf{A}(1)z+(1-z)\boldsymbol{\Gamma}(z), +\end{equation} +where +\begin{equation} +\mathbf{A}(1)=\mathbf{I}_{K+1}-\sum_{j=1}^{p}\mathbf{A}_{j} +\end{equation} +\begin{equation}\label{eq:polygamma} +\boldsymbol{\Gamma}(z)=\mathbf{I}_{K+1}-\sum_{i=1}^{p-1}\boldsymbol{\Gamma}_{i}z^i, \enspace \enspace \boldsymbol{\Gamma}_{i}=-\sum_{j=i+1}^{p}\mathbf{A}_j. +\end{equation} +The VECM model \eqref{eq:vecm} follows accordingly, and +\begin{equation}\label{eq:vecmint} +\boldsymbol{\alpha}_0=\mathbf{A}(1)\boldsymbol{\mu}+(\boldsymbol{\Gamma}(1)-\mathbf{A}(1))\boldsymbol{\eta}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\mathbf{A}(1)\boldsymbol{\eta}. +\end{equation} +Assume that $\mathbf{A}(1)$ is singular and that +the variables $\mathbf{x}_{t}$ are cointegrated. 
This entails the following +\begin{align}\label{eq:factt} + \mathbf{A}(1)=&\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}_{yx}'} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx}} +\end{bmatrix}=\underset{(K+1,r+1)}{\mathbf{B}}\underset{(r+1,K+1)}{\mathbf{C}'}=\begin{bmatrix}b_{yy} & \mathbf{b}_{yx}'\\ \mathbf{b}_{xy} & \mathbf{B}_{xx} \end{bmatrix}\begin{bmatrix}c_{yy} & \mathbf{c}_{yx}'\\ \mathbf{c}_{xy} & \mathbf{C}_{xx}'\end{bmatrix}= \nonumber\\ +=&\begin{bmatrix}b_{yy}c_{yy}+\mathbf{b}_{yx}'\mathbf{c}_{xy} & b_{yy}\mathbf{c}_{yx}'+\mathbf{b}_{yx}'\mathbf{C}_{xx}'\\ +\mathbf{b}_{xy}c_{yy}+\mathbf{B}_{xx}\mathbf{c}_{xy} & \mathbf{b}_{xy}\mathbf{c}_{yx}'+ \mathbf{A}_{xx} \end{bmatrix}, \enspace \enspace \enspace rk(\mathbf{A}(1))=rk(\mathbf{B})=rk(\mathbf{C}), +\end{align} +where $\mathbf{B}$ and $\mathbf{C}$ are full column rank matrices arising from the rank-factorization of $\mathbf{A}(1)=\mathbf{B}\mathbf{C}'$ with $\mathbf{C}$ matrix of the long-run relationships of the process +and $\mathbf{B}_{xx}$, $\mathbf{C}_{xx}$ arising from the rank factorization of $\mathbf{A}_{xx}=\mathbf{B}_{xx}\mathbf{C}_{xx}'$, with $rk(\mathbf{A}_{xx})=rk(\mathbf{B}_{xx})=rk(\mathbf{C}_{xx})=r$ \footnote{ If the explanatory variables are stationary $\mathbf{A}_{xx}$ is non-singular ($rk(\mathbf{A}_{xx})=K$), while when they are integrated but without cointegrating relationship $\mathbf{A}_{xx}$ is a null matrix }. 
\\ +By partitioning the vectors $\boldsymbol{\alpha}_{0}$, $\boldsymbol{\alpha}_{1}$, the matrix $\mathbf{A}(1)$ and the polynomial matrix $\boldsymbol{\Gamma}(L)$ conformably to $\mathbf{z}_{t}$, as follows +\begin{equation}\label{eq:alphapart} +\boldsymbol{\alpha}_0=\begin{bmatrix} +\underset{(1,1)}{\alpha_{0y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{0x}} +\end{bmatrix}, \enspace \enspace \enspace \boldsymbol{\alpha}_1=\begin{bmatrix} +\underset{(1,1)}{\alpha_{1y}} \\ \underset{(K,1)}{\boldsymbol{\alpha}_{1x} } +\end{bmatrix} +\end{equation} +\begin{equation}\label{eq:coeffpart} +\mathbf{A}(1)=\begin{bmatrix} +\underset{(1,K+1)}{\mathbf{a}'_{(y)}} \\ \underset{(K,K+1)}{\mathbf{A}_{(x)}} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{a_{yy}} & \underset{(1,K)}{\mathbf{a}'_{yx}} \\ \underset{(K,1)}{\mathbf{a}_{xy}} & \underset{(K,K)}{\mathbf{A}_{xx} } +\end{bmatrix}, +\enspace \enspace \enspace +\boldsymbol{\Gamma}(L)=\begin{bmatrix} +\underset{(1,K+1)}{\boldsymbol{\gamma}'_{y}(L)} \\ \underset{(K,K+1)}{\boldsymbol{\Gamma}_{(x)}(L)} +\end{bmatrix} +=\begin{bmatrix} +\underset{(1,1)}{\gamma_{yy}(L)} & \underset{(1,K)}{\boldsymbol{\gamma}'_{yx}(L)} \\ \underset{(K,1)}{\boldsymbol{\gamma}_{xy}(L)} & \underset{(K,K)}{\boldsymbol{\Gamma}_{xx}(L) } +\end{bmatrix} +\end{equation}, +and substituting \eqref{eq:epsilonx} into \eqref{eq:vecm} yields +\begin{equation}\label{eq:condsys} +\Delta\mathbf{z}_t=\begin{bmatrix} +\Delta y_{t} \\ \Delta\mathbf{x}_{t} +\end{bmatrix}=\begin{bmatrix} +\alpha_{0.y} \\ \boldsymbol{\alpha}_{0x} +\end{bmatrix} + \begin{bmatrix} +\alpha_{1.y} \\ \boldsymbol{\alpha}_{1x} +\end{bmatrix}t- \begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}\begin{bmatrix} +y_{t-1} \\ \mathbf{x}_{t-1} +\end{bmatrix} + \begin{bmatrix} +\boldsymbol{\gamma}'_{y.x}(L) \\ \boldsymbol{\Gamma}_{(x)}(L) +\end{bmatrix}\Delta\mathbf{z}_t+\begin{bmatrix} +\boldsymbol{\omega}'\Delta\mathbf{x}_{t} \\ \mathbf{0} +\end{bmatrix}+\begin{bmatrix} 
+{\nu}_{yt} \\ \boldsymbol{\varepsilon}_{xt} +\end{bmatrix} +\end{equation}, +where +\begin{equation}\label{eq:condintt} +\alpha_{0.y}=\alpha_{0y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{0x}, \enspace \enspace \enspace \alpha_{1.y}=\alpha_{1y}-\boldsymbol{\omega}'\boldsymbol{\alpha}_{1x} +\end{equation} +\begin{equation}\label{eq:condAmat} +\mathbf{a}'_{(y).x}=\mathbf{a}'_{(y)}-\boldsymbol{\omega}'\mathbf{A}_{(x)}, \enspace \enspace \enspace \boldsymbol{\gamma}'_{y.x}(L)=\boldsymbol{\gamma}_{y}'(L)-\boldsymbol{\omega}'\boldsymbol{\Gamma}_{(x)}(L). +\end{equation} +According to \eqref{eq:condsys}, the long-run relationships of the VECM turn out to be now included in the matrix +\begin{equation}\label{eq:condAmat2} +\begin{bmatrix} +\mathbf{a}'_{(y).x} \\ \mathbf{A}_{(x)} +\end{bmatrix}=\begin{bmatrix} +a_{yy}-\boldsymbol{\omega}'\mathbf{a}_{xy} & \mathbf{a}_{yx}'-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{a}_{xy}&\mathbf{A}_{xx} +\end{bmatrix}. +\end{equation} +To rule out the presence of long-run relationships between $y_{t}$ and $\mathbf{x}_{t}$ in the marginal model, +the $\mathbf{x}_{t}$ variables are assumed to be exogenous with respect to the ARDL parameters, that is $\mathbf{a}_{xy}$ is assumed to be a null vector. +Accordingly, the long-run matrix in \eqref{eq:condAmat2} becomes +\begin{equation}\label{eq:cond} +\widetilde{\mathbf{A}}=\begin{bmatrix}a_{yy} & \mathbf{a}'_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx} \\ \mathbf{0} & \mathbf{A}_{xx} +\end{bmatrix}=\begin{bmatrix} +a_{yy} & \widetilde{\mathbf{a}}_{y.x}' \\ \mathbf{0}&\mathbf{A}_{xx}\end{bmatrix} =\begin{bmatrix} +b_{yy}c_{yy} & b_{yy}\mathbf c_{yx}'+(\mathbf{b}_{yx}'-\boldsymbol{\omega}'\mathbf{B}_{xx})\mathbf{C}_{xx}' \\ \mathbf{0}& \mathbf{B}_{xx}\mathbf{C}_{xx}'\end{bmatrix}. 
+\end{equation} +After these algebraic transformations, the ARDL equation for $\Delta y_{t}$ can be rewritten as in \eqref{eq:ardl}.\\ +In light of the factorization \eqref{eq:factt} +of the matrix $\mathbf{A}(1)$, the long-run equilibrium vector $\boldsymbol{\theta}$ can be expressed as +\begin{equation}\label{eq:thetat} +\boldsymbol{\theta}'= +-\frac{1}{a_{yy}}\underset{(1,r+1)}{\left[b_{yy}\enspace\enspace(\mathbf{b}_{yx}'-\boldsymbol{\omega}'\mathbf{B}_{xx})\right]} +\underset{(r+1,K)}{\begin{bmatrix} \mathbf{c}'_{yx}\\ \mathbf{C}'_{xx} \end{bmatrix}}, +\end{equation} +where $\widetilde{\mathbf{a}}'_{y.x}=\mathbf{a}'_{yx}-\boldsymbol{\omega}'\mathbf{A}_{xx}$.\\ +Bearing in mind that $\mathbf{C}'_{xx}$ is the cointegrating matrix for the variables $\mathbf{x}_t$, the equation \eqref{eq:thetat} leads to the following conclusion +\begin{equation}\label{eq:rank} +rk\begin{bmatrix}\mathbf{c}'_{yx}\\ \mathbf{C}'_{xx}\end{bmatrix}=\begin{cases} +r \to \enspace y_{t} \sim I(0) \\ +r+1 \to \enspace y_{t} \sim I(1) +\end{cases}, +\end{equation} +where $r=rk(\mathbf{A}_{xx})$ and $0 \leq r\leq K$. \\ + +\subsection{Section B - Intercept and trend specifications}\label{sec:appendixb} +\citet{pesaran2001} introduced five different specifications for the ARDL model, which depend on the deterministic components that can be absent or restricted to the values they assume in the parent VAR model. 
In this connection, note that, in light of \eqref{eq:vecmint}, the drift and the trend coefficient in the conditional VECM \eqref{eq:condsys} are defined as +\begin{equation} +\boldsymbol{\alpha_{0}}^{c}=\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} , \enspace \enspace +\boldsymbol{\alpha_{1}}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}, +\end{equation} +where $\widetilde{\mathbf{A}}(1)$ is as in \eqref{eq:cond} and $\widetilde{\boldsymbol{\Gamma}}(1)=\begin{bmatrix} \boldsymbol{\gamma}_{y.x}'(1) \\ \boldsymbol{\Gamma}_{(x)}(1) \end{bmatrix}$.\\ +Accordingly, after partitioning the mean and the drift vectors as +\begin{equation} +\underset{(1,K+1)}{\boldsymbol{\mu}'}=[\underset{(1,1)}{\mu_{y}},\underset{(1,K)}{\boldsymbol{\mu}_x'}], \enspace \underset{(1,K+1)}{\boldsymbol{\eta}'}=[\underset{(1,1)}{\eta_{y}},\underset{(1,K)}{\boldsymbol{\eta}_{x}'}], +\end{equation} +the intercept and the coefficient of the trend of the ARDL equation \eqref{eq:ardl} are defined as +\begin{equation} +\alpha_{0.y}^{EC} += \mathbf{e}_{1}'\boldsymbol{\alpha_{0}}^{c} +=a_{yy}\mu_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\mu}_{x}+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}=a_{yy}(\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x})+\boldsymbol{\gamma}'_{y.x}(1)\boldsymbol{\eta}, \enspace +\boldsymbol{\theta}'=-\frac{\widetilde{\mathbf{a}}'_{y.x}}{a_{yy}} +\end{equation} +\begin{equation} +\enspace \enspace \alpha_{1.y}^{EC}=\mathbf{e}_{1}'\boldsymbol{\alpha_{1}}^{c}= +a_{yy}\eta_{y}-\widetilde{\mathbf{a}}'_{y.x}\boldsymbol{\eta}_{x}=a_{yy}(\eta_{y}-\boldsymbol{\theta'}\boldsymbol{\eta}_{x}), +\end{equation} +where $\mathbf{e}_{1}$ is the $K+1$ first elementary vector.\\ +In the error correction term +\begin{equation} +EC_{t-1}=y_{t-1}-\theta_{0}-\theta_{1}t-\boldsymbol{\theta}'\mathbf{x}_{t-1} +\end{equation} +the parameters that partake in the calculation of intercept and trend are +\begin{equation} 
+\theta_{0}=\mu_{y}-\boldsymbol{\theta}'\boldsymbol{\mu}_{x}, \enspace \theta_{1}=\eta_{y}-\boldsymbol{\theta}'\boldsymbol{\eta}_{x}. +\end{equation} +In particular, these latter are not null only when they are assumed to be restricted in the model specification.\\ +The five specifications proposed by ~\citet{pesaran2001} are +\begin{enumerate}[I] +\item \textit{No intercept and no trend}: +\begin{equation} +\boldsymbol{\mu}=\boldsymbol{\eta}=\mathbf{0}. +\end{equation} +It follows that +\begin{equation} +\theta_{0}=\theta_{1}=\alpha_{0.y}=\alpha_{1.y}=0. +\end{equation} +Accordingly, the model is as in \eqref{eq:case1}. + +\item \textit{Restricted intercept and no trend}: +\begin{equation} +\boldsymbol{\alpha}_{0}^{c}= \widetilde{\mathbf{A}}(1)\boldsymbol{\mu},\enspace \enspace \boldsymbol{\eta}=\mathbf{0}, +\end{equation} +which entails +\begin{equation} +\theta_0 \neq 0 \enspace\enspace\alpha_{0.y}^{EC}=a_{yy}\theta_{0}, \enspace \enspace +\alpha_{0.y}=\theta_{1}=\alpha_{1.y}=0. +\end{equation} +Therefore, the intercept stems from the EC term of the ARDL equation. The model is specified as in \eqref{eq:case2} + +\item \textit{Unrestricted intercept and no trend}: +\begin{equation} +\boldsymbol{\alpha}_{0}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\mu}, \enspace \enspace \boldsymbol{\eta}=\mathbf{0}. +\end{equation} +Thus, +\begin{equation} +\alpha_{0.y}\neq 0,\enspace \enspace \theta_{0}=\theta_{1}=\alpha_{1.y}=0. +\end{equation} +Accordingly, the model is as in \eqref{eq:case3}. 
+ +\item \textit{Unrestricted intercept, restricted trend}: +\begin{equation}\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta}\enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}=\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}, +\end{equation} +which entails +\begin{equation} +\alpha_{0.y} \neq 0,\enspace \enspace +\theta_{0}=0 \enspace \enspace +\theta_{1}\neq 0\enspace\enspace +\alpha_{1.y}^{EC}=a_{yy}\theta_1\enspace\enspace +\alpha_{1.y}=0. +\end{equation} +Accordingly, the trend stems from the EC term of the ARDL equation. The model is as in \eqref{eq:case4}. +\item \textit{Unrestricted intercept, unrestricted trend}: +\begin{equation} +\boldsymbol{\alpha_{0}}^{c}\neq\widetilde{\mathbf{A}}(1)(\boldsymbol{\mu}-\boldsymbol{\eta})+\widetilde{\boldsymbol{\Gamma}}(1)\boldsymbol{\eta} \enspace \enspace {\boldsymbol{\alpha}}_{1}^{c}\neq\widetilde{\mathbf{A}}(1)\boldsymbol{\eta}. +\end{equation} +Accordingly, +\begin{equation} \alpha_{0.y} \neq 0 \enspace \enspace\alpha_{1.y} \neq 0, \enspace \enspace\theta_{0}=\theta_{1}=0. +\end{equation} +The model is as in \eqref{eq:case5}. +\end{enumerate} +\newpage + + + +\bibliography{vacca-zoia-bertelli} + +\address{Gianmarco Vacca\\ + Department of Economic Policy. Università Cattolica del Sacro Cuore\\ + Largo Gemelli, 1, Milan.\\ + Italy\\ + (0000-0002-8996-5524)\\ + \email{gianmarco.vacca@unicatt.it}} + +\address{Maria Zoia\\ + Department of Economic Policy. 
Università Cattolica del Sacro Cuore\\ + Largo Gemelli, 1, Milan.\\ + Italy\\ + (0000-0002-8169-781X)\\ + \email{maria.zoia@unicatt.it}} + + \address{Stefano Bertelli\\ + CRO Area, Internal Validation and Controls Department, Operational Risk and ICAAP Internal Systems, + Intesa Sanpaolo, Milan\\ + Viale Stelvio, 55/57, Milan.\\ + Italy\\ + \email{stefano.bertelli@intesasanpaolo.com}} + diff --git a/_articles/RJ-2024-003/vacca_zoia_bertelli.R b/_articles/RJ-2024-003/vacca_zoia_bertelli.R new file mode 100644 index 0000000000..0003771fd5 --- /dev/null +++ b/_articles/RJ-2024-003/vacca_zoia_bertelli.R @@ -0,0 +1,561 @@ +#WORKING CODE R-JOURNAL + +install.packages(c("dplyr", + "ggplot2", + "Rmisc", + "reshape2", + "bootCT", + "tseries", + "urca", + "aTSA")) +library(dplyr) +library(ggplot2) +library(Rmisc) +library(aTSA) +library(reshape2) +library(bootCT) +library(tseries) +library(urca) + +# Multiple plot function + +multiplot = function(..., plotlist=NULL, file, cols=1, layout=NULL) { + library(grid) + + # Make a list from the ... 
arguments and plotlist + plots = c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout = matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx = as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} + +########################################### +# SIMULATE DATA FOR FIGURE 1 AND FIGURE 2 # +########################################### +corrm = matrix(c( 0, 0, 0, + 0.25, 0, 0, + 0.4, -0.25, 0), nrow = 3, ncol = 3, byrow = T) + +Corrm = (corrm + t(corrm)) + diag(3) + +sds = diag(c(1.3, 1.2, 1)) + +sigma.in = (sds %*% Corrm %*% t(sds)) + +gamma1 = matrix(c(0.6, 0,0.2, + 0.1,-0.3, 0, + 0,-0.3,0.2), nrow = 3, ncol = 3,byrow=T) +gamma2= gamma1*0.3 + +omegat = sigma.in[1,-1]%*%solve(sigma.in[-1,-1]) +axx.in = matrix(c( 0.3, 0.5, + -0.4, 0.3), nrow = 2, ncol = 2, byrow = T) +ayx.uc.in = c(0.4,0.4) +ayy.in = 0.6 + +data.vecm.ardl_1 = + sim_vecm_ardl(nobs=200, + case = 1, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = ayy.in, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(0,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_2 = + sim_vecm_ardl(nobs=200, + case = 2, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = ayy.in, + mu.in = rep(2,3), + 
eta.in = rep(0,3), + azero.in = rep(0,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_3 = + sim_vecm_ardl(nobs=200, + case = 3, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = ayy.in, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(2,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_4 = + sim_vecm_ardl(nobs=200, + case = 4, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = ayy.in, + mu.in = rep(0,3), + eta.in = rep(0.4,3), + azero.in = rep(2,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_5 = + sim_vecm_ardl(nobs=200, + case = 5, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = ayy.in, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(0.8,3), + aone.in = rep(0.4,3), + burn.in = 100, + seed.in = 999) + +df1 = data.vecm.ardl_1$data +meltdf1 = melt(df1,id="time") +p1 = ggplot(meltdf1,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE I") + theme_bw()+ + ylim(c(-10,12)) + +df2 = data.vecm.ardl_2$data +meltdf2 = melt(df2,id="time") +p2 = ggplot(meltdf2,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE II") + theme_bw()+ + ylim(c(-10,12)) + +df3 = data.vecm.ardl_3$data +meltdf3 = melt(df3,id="time") +p3 = ggplot(meltdf3,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE III") + theme_bw()+ + ylim(c(-10,12)) + +df4 = data.vecm.ardl_4$data +meltdf4 = melt(df4,id="time") +p4 = ggplot(meltdf4,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE IV") + theme_bw()+ + ylim(c(-15,100)) + +df5 = data.vecm.ardl_5$data +meltdf5 = melt(df5,id="time") +p5 = ggplot(meltdf5,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE V") + theme_bw()+ + 
ylim(c(-100,150)) + +multiplot(p1,p2,p3,p4,p5, cols=1) + +# Degeneracy of second type +data.vecm.ardl_1 = + sim_vecm_ardl(nobs=200, + case = 1, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = 0, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(0,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_2 = + sim_vecm_ardl(nobs=200, + case = 2, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = 0, + mu.in = rep(0.3,3), + eta.in = rep(0,3), + azero.in = rep(0,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_3 = + sim_vecm_ardl(nobs=200, + case = 3, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = 0, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(0.3,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_4 = + sim_vecm_ardl(nobs=200, + case = 4, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = 0, + mu.in = rep(0,3), + eta.in = rep(0.4,3), + azero.in = rep(0.3,3), + aone.in = rep(0,3), + burn.in = 100, + seed.in = 999) + +data.vecm.ardl_5 = + sim_vecm_ardl(nobs=200, + case = 5, + sigma.in = sigma.in, + gamma.in = list(gamma1,gamma2), + axx.in = axx.in, + ayx.uc.in = ayx.uc.in, + ayy.in = 0, + mu.in = rep(0,3), + eta.in = rep(0,3), + azero.in = rep(0.3,3), + aone.in = rep(0.3,3), + burn.in = 100, + seed.in = 999) + +df1 = data.vecm.ardl_1$data +meltdf1 = melt(df1,id="time") +p1 = ggplot(meltdf1,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE I") + theme_bw() + +df2 = data.vecm.ardl_2$data +meltdf2 = melt(df2,id="time") +p2 = ggplot(meltdf2,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE II") + theme_bw() + +df3 = data.vecm.ardl_3$data +meltdf3 = melt(df3,id="time") 
+p3 = ggplot(meltdf3,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE III") + theme_bw() + +df4 = data.vecm.ardl_4$data +meltdf4 = melt(df4,id="time") +p4 = ggplot(meltdf4,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE IV") + theme_bw() + +df5 = data.vecm.ardl_5$data +meltdf5 = melt(df5,id="time") +p5 = ggplot(meltdf5,aes(x=time,y=value,colour=variable,group=variable)) + geom_line() + ggtitle("CASE V") + theme_bw() + +multiplot(p1,p2,p3,p4,p5, cols=1) + +######################## +# TIME ELAPSED TABLE 1 # +######################## +# DO NOT RUN, IT TAKES A LOT OF TIME +# DATA GENERATION +corrm = matrix(c(0, 0, 0, + 0.25, 0, 0, + 0.4, -0.25, 0), + nrow = 3, ncol = 3, byrow = T) + +Corrm = (corrm + t(corrm)) + diag(3) + +sds = diag(c(1.3, 1.2, 1)) + +sigma.in = (sds %*% Corrm %*% t(sds)) + +gamma1 = matrix(c(0.3,0.2,0.2, + 0.1,-0.2,0.1, + 0.3,-0.1,0), + nrow = 3, ncol = 3,byrow = T) +gamma2= gamma1*0.3 +gammax=list(gamma1,gamma2) + +omegat = sigma.in[1,-1]%*%solve(sigma.in[-1,-1]) +axx.in = matrix(c(0.3, 0.5, -0.4, 0.3), nrow = 2, ncol = 2, byrow = T) +ayxC.in = omegat%*%axx.in + +data_sim=alist(n50=,n80=,n100=,n200=,n500=) +num=c(50,80,100,200,500) +set.seed(100) +for(j in 1:5){ + data_sim[[j]]=list() + for(k in 1:100){ + data_sim[[j]][[k]] = + sim_vecm_ardl(nobs = num[j], + case = 2, + sigma.in = sigma.in, + gamma.in = gammax, + axx.in = axx.in, + ayx.uc.in = c(0.6,0.5), + ayy.in = 0.7, + mu.in = rep(0.3,3), + eta.in = rep(0,3), + azero.in = rep(0,3), + aone.in = rep(0,3), + burn.in=num[j]/2) + print(j) + } +} + +# APPLYING BOOTSTRAP AND RECORDING TIME +res_sim=list() +timerec=list() +bootr=c(200,500,1000,2000,5000) +set.seed(1) + +for(m in 1:length(bootr)){ + res_sim[[m]]=list() + timerec[[m]]=list() + + for(j in 1:length(num)){ + res_sim[[m]][[j]]=list() + begin=Sys.time() + + for(k in 1:100){ + res_sim[[m]][[j]][[k]] = boot_ardl(data = data_sim[[j]][[k]]$data[,-4], + nboot = bootr[m], + case = 2, 
+ fix.ardl = rep(2,3), + fix.vecm = 2, + print = T) + print(k) + } + + end=Sys.time() + print(j) + print(m) + timerec[[m]][[j]]=end-begin + #save.image(file="timerec_res.RData",compress=T) + } + +} + + +####################################### +# APPLICATION ON GERMAN MACRO DATASET # +####################################### + +# LOAD DATA +data("ger_macro") + +# DATA PREPARATION +colnames(ger_macro) +LNDATA=apply(ger_macro[,-1],2,log) +col_ln=paste0("LN",colnames(ger_macro)[-1]) +LNDATA=as.data.frame(LNDATA) +colnames(LNDATA)=col_ln +LNDATA=LNDATA%>%select(LNCONS,LNINCOME,LNINVEST) +LNDATA$DATE=ger_macro$DATE + +# ADF TEST IN LEVELS Table 2 +aTSA::adf.test(LNDATA$LNINVEST) +aTSA::adf.test(LNDATA$LNINCOME) +aTSA::adf.test(LNDATA$LNCONS) + +# CREATE DIFFERENCE +lagdf1 = lag_mts(as.matrix(LNDATA[,-4]), k = c(1,1,1)) +dlagdf0 = na.omit(LNDATA[,-4] - lagdf1) +colnames(dlagdf0)=paste0("D_",colnames(LNDATA)[-4]) +dlagdf0$DATE=ger_macro$DATE[-1] + +# ADF TEST IN DIFFERENCE Table 2 +aTSA::adf.test(na.omit(dlagdf0$D_LNCONS)) +aTSA::adf.test(na.omit(dlagdf0$D_LNINVEST)) +aTSA::adf.test(na.omit(dlagdf0$D_LNINCOME)) + +# PLOT FOR FIGURE 5 +dfmelt = melt(LNDATA, id = "DATE") +dfmelt = dfmelt%>%arrange(variable,DATE) + +p1 = ggplot(dfmelt, + aes(x = DATE, y = value, colour = variable, group = variable)) + + geom_line() + ggtitle("Level Variables (log-scale)") + theme_bw() + +diff.dfmelt = melt(dlagdf0, id = "DATE") +diff.dfmelt = diff.dfmelt%>%arrange(variable,DATE) + +p2 = ggplot(diff.dfmelt, + aes(x = DATE, y = value, colour = variable, group = variable)) + + geom_line() + ggtitle("Diff. 
Variables (log-scale)") + theme_bw() + +plot(p1) +plot(p2) + +# ARDL BOOT +set.seed(999) + +# Time elapsed +time0 = Sys.time() + +# MODEL I +BCT_res_CONS = boot_ardl(data = LNDATA, + yvar = "LNCONS", + xvar = c("LNINCOME","LNINVEST"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +# MODEL II +BCT_res_INC = boot_ardl(data = LNDATA, + yvar = "LNINCOME", + xvar = c("LNCONS","LNINVEST"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +# MODEL III +BCT_res_INV = boot_ardl(data = LNDATA, + yvar = "LNINVEST", + xvar = c("LNCONS","LNINCOME"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +time1=Sys.time() +runtime = time1-time0 + +# SUMMARY WITH OPTIONS +summary(BCT_res_CONS,out = "ARDL") # Table 3 ARDL, extract lags column for Table 4 +summary(BCT_res_INC,out = "ARDL") # Table 3 ARDL, extract lags column for Table 4 +summary(BCT_res_INV,out = "ARDL") # Table 3 ARDL, extract lags column for Table 4 +summary(BCT_res_CONS,out = "VECM") # Table 3 VECM +summary(BCT_res_INC,out = "VECM") # Table 3 VECM +summary(BCT_res_INV,out = "VECM") # Table 3 VECM +summary(BCT_res_CONS,out = "cointVECM") # Table 3 VECM bottom row +summary(BCT_res_INC,out = "cointVECM") # Table 3 VECM bottom row +summary(BCT_res_INV,out = "cointVECM") # Table 3 VECM bottom row +summary(BCT_res_CONS,out = "cointARDL") # Table 4 +summary(BCT_res_INC,out = "cointARDL") # Table 4 +summary(BCT_res_INV,out = "cointARDL") # Table 4 + +######################################## +# APPLICATION ON ITALIAN MACRO DATASET # +######################################## + +# LOAD DATA +data("ita_macro") + +# ADF TEST IN LEVELS +aTSA::adf.test(ita_macro$LGDP) +aTSA::adf.test(ita_macro$LEXP) +aTSA::adf.test(ita_macro$LFI) + +# CREATE DIFFERENCE +lagdf1 = lag_mts(as.matrix(ita_macro[,-1]), k = c(1,1,1)) +dlagdf0 = na.omit(ita_macro[,-1] 
- lagdf1) +colnames(dlagdf0) = paste0("D_", colnames(ita_macro)[-1]) +dlagdf0$YEAR = ita_macro$YEAR[-1] + +# ADF TEST IN DIFFERENCE +aTSA::adf.test(na.omit(dlagdf0$D_LGDP)) +aTSA::adf.test(na.omit(dlagdf0$D_LEXP)) +aTSA::adf.test(na.omit(dlagdf0$D_LFI)) + +# PLOT FOR FIGURE 6 +dfmelt = melt(ita_macro, id = "YEAR") +dfmelt = dfmelt%>%arrange(variable,YEAR) + +p1 = ggplot(dfmelt, + aes(x = YEAR, y = value, colour = variable, group = variable)) + + geom_line() + ggtitle("Level Variables (log-scale)") + theme_bw() + +diff.dfmelt = melt(dlagdf0, id = "YEAR") +diff.dfmelt = diff.dfmelt%>%arrange(variable,YEAR) + +p2 = ggplot(diff.dfmelt, + aes(x = YEAR, y = value, colour = variable, group = variable)) + + geom_line() + ggtitle("Diff. Variables (log-scale)") + theme_bw() + +plot(p1) +plot(p2) + +# ARDL BOOT +set.seed(999) + +# MODEL I +BCT_res_GDP = boot_ardl(data = ita_macro, + yvar = "LGDP", + xvar = c("LEXP","LFI"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +# MODEL II +BCT_res_EXP = boot_ardl(data = ita_macro, + yvar = "LEXP", + xvar = c("LGDP","LFI"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +# MODEL III +BCT_res_FDI = boot_ardl(data = ita_macro, + yvar = "LFI", + xvar = c("LGDP","LEXP"), + maxlag = 5, + a.ardl = 0.1, + a.vecm = 0.1, + nboot = 2000, + case = 3, + a.boot.H0 = c(0.05), + print = T) + +# SUMMARY WITH OPTIONS TABLE 6 +summary(BCT_res_GDP,out="ARDL") # extract lags +summary(BCT_res_EXP,out="ARDL") # extract lags +summary(BCT_res_FDI,out="ARDL") # extract lags +summary(BCT_res_GDP,out="cointARDL") # ARDL cointegration +summary(BCT_res_EXP,out="cointARDL") # ARDL cointegration +summary(BCT_res_FDI,out="cointARDL") # ARDL cointegration \ No newline at end of file diff --git a/_articles/RJ-2024-004/RJ-2024-004.Rmd b/_articles/RJ-2024-004/RJ-2024-004.Rmd new file mode 100644 index 0000000000..a8adab360e --- /dev/null +++ 
b/_articles/RJ-2024-004/RJ-2024-004.Rmd @@ -0,0 +1,1270 @@ +--- +title: Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models + with QAPE 2.0 Package +abstract: | + The paper presents a new R package + [**qape**](https://CRAN.R-project.org/package=qape) for prediction, + accuracy estimation of various predictors and Monte Carlo simulation + studies of properties of both predictors and estimators of accuracy + measures. It allows to predict any population and subpopulation + characteristics of the response variable based on the Linear Mixed + Model (LMM). The response variable can be transformed, e.g. to + logarithm and the data can be in the cross-sectional or longitudinal + framework. Three bootstrap algorithms are developed: parametric, + residual and double, allowing to estimate the prediction accuracy. + Analyses can also include Monte Carlo simulation studies of properties + of the methods used. Unlike other packages, in the prediction process + the user can flexibly define the predictor, the model, the + transformation function of the response variable, the predicted + characteristics and the method of accuracy estimation. 
+author: +- name: Alicja Wolny--Dominiak + affiliation: Department of Statistical and Mathematical Methods in Economics + address: + - University of Economics in Katowice + - 50, 1 Maja Street + - 40--287 Katowice + - Poland + - | + [alicja.wolny-dominiak@uekat.pl](alicja.wolny-dominiak@uekat.pl){.uri} + - | + [web.ue.katowice.pl/woali/](web.ue.katowice.pl/woali/){.uri} +- name: Tomasz Ża̧dło + affiliation: Department of Statistics, Econometrics and Mathematics + address: + - University of Economics in Katowice + - 50, 1 Maja Street + - 40--287 Katowice + - Poland + - | + [tomasz.zadlo@uekat.pl](tomasz.zadlo@uekat.pl){.uri} + - | + [web.ue.katowice.pl/zadlo/](web.ue.katowice.pl/zadlo/){.uri} +date: '2025-01-10' +date_received: '2022-08-12' +journal: + firstpage: 67 + lastpage: 82 +volume: 16 +issue: 1 +slug: RJ-2024-004 +citation_url: https://rjournal.github.io/ +packages: + cran: + - qape + - sae + - msae + - saery + - JoSAE + - emdi + - HLMdiag + - lme4 + bioc: [] +preview: preview.png +bibliography: wolny-zadlo.bib +CTV: ~ +legacy_pdf: yes +legacy_converted: yes +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js + md_extension: -tex_math_single_backslash +draft: no + +--- + +::::: article +## Introduction {#intro} + +One of the tasks in application of mixed models in the real-life +problems is the prediction of random effects. Then, the predicted values +give the possibility for further prediction, e.g. characteristics of +interest such as sum, mean or quantiles or the future value of the +response variable for cross-sectional or longitudinal data. + +Three main predictors of these characteristics are proposed in the +literature: Empirical Best Linear Unbiased Predictors - EBLUPs (see e.g. +[@henderson1950estimation] and [@royall1976linear]), PLUG-IN predictors +(see e.g. 
[@boubeta2016empirical], [@chwila2019properties], +[@hobza2016empirical]) and Empirical Best Predictors - EBPs (see e.g. +[@molina2010small]). Each assumes the LMM to model the response +variable. + +The numerous successful applications of these three predictors for +cross-sectional and longitudinal data can be found in the model approach +in survey sampling, including the small area estimation. In paper +[@fay1979estimates] the Authors introduce the prediction of the mean +income for small places based on the special case of the LMM model +called Fay-Herriot model and the EBLUP. The analysis of poverty is +extended in many works, e.g. in [@molina2010small] and +[@christiaensen2012]. In turn, in [@SAE1988] the Authors analyse the +total crop areas based on survey and satellite data using EBLUPs. The +proposed LMM model is known as the Battese-Harter-Fuller model. The +predictors are also exploited in the subject of experience rating in +non-life insurance, see [@frees1999] and [@buhlmann2005], where the +longitudinal data are under consideration. The insurance premium for the +next period for every policy in the insurance portfolio is predicted. + +A major challenge in this type of prediction is the estimation of the +prediction accuracy measure. Most often it is the Root Mean Squared +Error (RMSE), which is given in analytical form or can be e.g. estimated +using bootstrap. A feature of the distribution of the squared prediction +error is usually a very strong positive asymmetry. Because the mean is +not recommended as the appropriate measure of the central tendency in +such distributions, the alternative prediction accuracy measure called +the Quantile of Absolute Prediction Errors (QAPE), proposed by +[@zadlo2013parametric] and [@zadlo2020bootstrap], can be applied. + +There is a variety of R packages to calculate the considered predictors +together with the accuracy measure of prediction, usually the RMSE. 
The +package [**sae**](https://CRAN.R-project.org/package=sae), see [@sae], +provides EBLUPs based on Fay-Herriot and Battese-Harter-Fuller models. +In turn, the multivariate EBLUP for Fay-Herriot models is implemented in +[**msae**](https://CRAN.R-project.org/package=msae), see [@msae]. +Several EBLUPs introduced in [@rao1994small] are implemented in package +[**saery**](https://CRAN.R-project.org/package=saery) introduced by +[@saery], likewise in +[**JoSAE**](https://CRAN.R-project.org/package=JoSAE), see [@josae], but +with additional heteroscedasticity analysis. The EBP is provided in the +package [**emdi**](https://CRAN.R-project.org/package=emdi) described in +[@kreutzmann2019r]. + +A new package in this area is our proposed package +[**qape**](https://CRAN.R-project.org/package=qape). It allows the +prediction of flexibly defined characteristics of the response variable +using the above three predictors, assuming an appropriate LMM. A novel +feature of the package +[**qape**](https://CRAN.R-project.org/package=qape), compared to those +already in place, is the ability of bootstrap estimation of the +prediction accuracy measures, both the RMSE and QAPE. Three types of +bootstrap procedures are provided: parametric, residual and double. + +There are three groups of functions in this package: predictors values +calculation, bootstrap estimation of RMSE and QAPE measures, and Monte +Carlo (MC) analysis of properties of predictors and prediction accuracy +estimators. The prediction is based on a LMM model defined by the user +and allows to predict the population characteristics of the response +variable, which can be defined by a linear combination (in the case of +EBLUP), by any R function (e.g. `sum`) or any function defined by the +user (in the case of the EBP and PLUG-IN predictors). The package allows +for full flexibility in defining: the model, the predicted +characteristic, and the transformation of the response variable. 
+ +This paper is organized as follows. Firstly, the background of the LMM +is presented together with the theoretical foundations of the prediction +including prediction accuracy measures. Then, the package functionality +in the area of prediction is presented and illustrated. A short +application based on `radon` data, a cross-sectional dataset available +in [**HLMdiag**](https://CRAN.R-project.org/package=HLMdiag) package, to +predict three subpopulation characteristics is shown. Subsequently, the +theoretical background of the prediction accuracy measures estimation +based on bootstrap is presented. Implementations of bootstrap algorithms +in [**qape**](https://CRAN.R-project.org/package=qape) are briefly +introduced. Finally, the procedure of the model-based Monte Carlo +simulation study is discussed. The paper ends with a conclusion. + +## Prediction accuracy measures {#PAM} + +We consider the problem of prediction of any given function of the +population vector $\mathbf{Y}$ of the response variable: +$$\label{theta} +\theta = f_{\theta}(\mathbf{Y}) (\#eq:theta)$$ +under the LMM. It covers linear combinations of $\mathbf{Y}$ (such as +one future realization of the response variable or population and +subpopulation means and totals) but also other population and +subpopulation characteristics such quantiles and variability measures. + +To assess the accuracy of the particular predictor $\hat \theta$, +firstly, the prediction error is defined as $U=\hat{\theta}-\theta$. +Therefore, the well-known RMSE has the following formula: +$$\label{eq0} + RMSE(\hat{\theta})=\sqrt{E(\hat{\theta}-\theta)^{2}}=\sqrt{E({{U}^{2}})}. (\#eq:eq0)$$ +The alternative to the RMSE based on the mean could be the QAPE based on +quantiles. 
It represents the $p$th quantile of the absolute prediction +error $|U|$, see [@zadlo2013parametric] and [@zadlo2020bootstrap], and +it is given by: +$$\label{eq1} + QAPE_p(\hat{\theta}) = \inf \left\{ {x:P\left( {\left| {{\hat{\theta}-\theta}} \right| \le x} \right) \ge p} \right\} =\inf \left\{ {x:P\left( {\left| {{U}} \right| \le x} \right) \ge p} \right\} (\#eq:eq1)$$ +This measure informs that at least $p100\%$ of observed absolute +prediction errors are smaller than or equal to $QAPE_p(\hat{\theta})$, +while at least $(1-p)100\%$ of them are higher than or equal to +$QAPE_p(\hat{\theta})$. Quantiles reflect the relation between the +magnitude of the error and the probability of its realization. It means +that using the QAPE, it is possible to make a full description of the +distribution of prediction errors instead of using the average +(reflected by the RMSE). Furthermore, the MSE is the mean of positively +(usually very strongly) skewed squared prediction errors, where the mean +should not be used as a measure of the central tendency of positively +skewed distributions. + +The above described accuracy prediction measures RMSE and QAPE can be +estimated using the bootstrap techniques. Their estimators as well as +the bootstrap distributions of the prediction errors based on any +(assumed or misspecified) model are provided in +[**qape**](https://CRAN.R-project.org/package=qape) package, including +algorithms where the parallel computing is used. + +In the [**qape**](https://CRAN.R-project.org/package=qape) package, the +whole prediction process has its own specific procedure, which can be +presented in the following steps. + +::: {#Proc1 .procedure} +**Procedure 1**. *The process of prediction, accuracy measures +estimation and Monte Carlo simulation analyses in +[**qape**](https://CRAN.R-project.org/package=qape) * + +1. *Define the characteristics of the response variable to predict,* + +2. *provide the information on sample and population values,* + +3. 
*define the LMM,* + +4. *estimate parameters of the LMM,* + +5. *predict the random variable $\theta$ using the chosen class of + predictors,* + +6. *estimate the prediction accuracy measures RMSE and QAPE using one + of the developed bootstrap algorithms,* + +7. *conduct simulation analyses of properties of predictors and + accuracy measures estimators under any (also misspecified) LMM + model.* +::: + +## The prediction under LMM + +The main functions of the +[**qape**](https://CRAN.R-project.org/package=qape) package provide the +bootstrap estimation of prediction accuracy measures. However, it must +be preceded by the prediction process, including the choice of the LMM +and the predictor. + +### The model + +Let $\mathbf{Y}$ denote the vector of response variables +$Y_1, Y_2,..., Y_N$. Assuming, without a loss of generality, that only +the first $n$ realizations of $Y_i$ are observed, $\mathbf{Y}$ can be +decomposed as $\mathbf{Y}= +\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{Y}_r^T +\end{bmatrix}^T$ , where $\mathbf{Y}_s$ and $\mathbf{Y}_r$ are of +dimension $n \times 1$ and $(N - n) \times 1$, respectively. In all +notations, the subscript \"s\" is used for observed realizations of the +variable of interest and \"r\" for the unobserved ones. Two known +matrices of auxiliary variables are also considered, denoted by +$\mathbf{X}$ and $\mathbf{Z}$, which are associated with fixed and +random effects, respectively. The $\mathbf{X}$ matrix is of dimension +$N \times p$, and it consists of $p$ regression variables. It can be +decomposed like $\mathbf{Y}$ as follows: $\mathbf{X}= +\begin{bmatrix} + \mathbf{X}_s^T & \mathbf{X}_r^T +\end{bmatrix}^T$, where matrices $\mathbf{X}_s$ and $\mathbf{X}_r$, both +known, are of dimension $n \times p$ and $(N-n) \times p$, respectively. 
+Similarly, the $\mathbf{Z}$ matrix of dimension $N \times h$ can be +written as follows: $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_s^T & \mathbf{Z}_r^T +\end{bmatrix}^T$, where matrices $\mathbf{Z}_s$ and $\mathbf{Z}_r$, both +known, are of dimension $n \times h$ and $(N-n) \times h$, respectively. + +Then, let $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ denotes the +LMM of the following form (e.g. [@rao2015small], p. 98): +$$\label{LMM} + \left\{ \begin{array}{c} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}\mathbf{v}+\mathbf{e} \\ + E(\mathbf{e})=\mathbf{0}, E(\mathbf{v})=\mathbf{0} \\ + Var(\mathbf{e})=\mathbf{R}(\pmb{\delta}), Var(\mathbf{v})=\mathbf{G}(\pmb{\delta}) + \end{array} \right. (\#eq:LMM)$$ +The vector of parameters in model (\@ref(eq:LMM)) is then +$\boldsymbol{\psi}=\begin{bmatrix} + \boldsymbol{\beta}^T & \pmb{\delta}^T +\end{bmatrix}^T$, where $\boldsymbol{\beta}$ is a vector of fixed +effects of dimension $p \times 1$ and $\pmb{\delta}$ is a vector of +variance components. The random part of the model is described by the +known matrix $\mathbf{Z}$, a vector $\mathbf{v}$ of random effects of +dimension $h \times 1$ and a vector $\mathbf{e}$ of random components of +dimension $N\times 1$, where $\mathbf{e}$ and $\mathbf{v}$ are assumed +to be independent. The vector of random components $\mathbf{e}$ will be +decomposed similarly to the vector $\mathbf{Y}$, i.e. +$\mathbf{e}=\begin{bmatrix} + \mathbf{e}_s^T & \mathbf{e}_r^T +\end{bmatrix}^T$. + +In the residual bootstrap implemented in +[**qape**](https://CRAN.R-project.org/package=qape), there is a need to +re-write the LMM model to take account of the specific structure of +data, i.e. the grouping variables taken into account in the random part +of the model. 
In this case, without a loss of the generality, the LMM +model can be written as follows: +$$\label{LMMa} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}_1\mathbf{v}_1+...+\mathbf{Z}_l\mathbf{v}_l+...+\mathbf{Z}_L\mathbf{v}_L+\mathbf{e}, (\#eq:LMMa)$$ +where $\mathbf{v}_1,\dots,\mathbf{v}_l,\dots,\mathbf{v}_L$ are +independent vectors of random effects assumed for different divisions of +the $\mathbf{Y}$ vector (under different grouping of the data) and +$\mathbf{Z}_1, \dots, \mathbf{Z}_l, \dots, \mathbf{Z}_L$ are known +matrices of auxiliary variables associated with random effects. Writing +in (\@ref(eq:LMMa)): $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_1 & \dots & \mathbf{0} & \dots & \mathbf{0} \\ + \vdots & \ddots & & & \vdots \\ + \mathbf{0} & \dots & \mathbf{Z}_l & \dots & \mathbf{0} \\ + \vdots & & & \ddots & \vdots \\ + \mathbf{0} & \dots & \mathbf{0} & \dots & \mathbf{Z}_L \\ +\end{bmatrix}$ and $\mathbf{v}= +\begin{bmatrix} + \mathbf{v}_1^T & \dots & \mathbf{v}_l^T & \dots & \mathbf{v}_L^T \\ +\end{bmatrix}^T$ the LMM model is obtained. Let + +$$\label{vl} +\mathbf{v}_l=\left[ \mathbf{v}_{l1}^T \dots \mathbf{v}_{lk}^T \dots \mathbf{v}_{lK_l}^T \right]^T (\#eq:vl)$$ +be of dimension $K_l J_l \times 1$, where $\mathbf{v}_{lk}$ is of +dimension $J_l \times 1$ for all $k=1,...,K_l$ and $K_l$ is the number +of random effects at the $l$th level of grouping. Hence, $\mathbf{Z}_l$ +is $N \times K_l J_l$. For example, if the random regression coefficient +model is considered with two random coefficients where both random +effects are subpopulation-specific, where $D$ is the number of +subpopulations, then $L=1$, $K_1=2$ and $J_1=D$. + +### Predictors + +In the [**qape**](https://CRAN.R-project.org/package=qape) package, in +the general case the predicted characteristic is given by any function +of response variables: +$$\label{ftheta} +\theta = f_{\theta}(\mathbf{Y}). 
(\#eq:ftheta)$$ +Under the $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ model it +could be predicted using one of three predictors: + +1. Empirical Best Linear Unbiased Predictor (EBLUP), + +2. Empirical Best Predictor (EBP) under nested error LMM, + +3. PLUG-IN predictor under the LMM. + +The first predictor (EBLUP) allows to predict the linear combination of +the response variables: +$$\label{l.theta} +\theta = f_{\theta}(\mathbf{Y}) = \boldsymbol{\gamma}^T \mathbf{Y}= \boldsymbol{\gamma}_s^T \mathbf{Y}_s + \boldsymbol{\gamma}_r^T \mathbf{Y}_r, (\#eq:l-theta)$$ +where $\boldsymbol{\gamma}$ is a vector of weights. In this case, the +predicted characteristic $\theta$ is basically the linear combination of +the response variable. For example, if one of the elements of +$\boldsymbol{\gamma}$ equals 1 and the rest of the elements equals 0, +then one realization of the response variable is predicted. If all +elements in $\boldsymbol{\gamma}$ vector equal 1, then $\theta$ becomes +the sum of all $Y_i$'s in the whole considered population dataset. The +two-stage EBLUP corresponds to the Best Linear Unbiased Predictor (BLUP) +introduced in [@henderson1950estimation] and [@royall1976linear] as: +$$\label{BLUP} + \hat{\theta}^{BLUP} (\pmb{\delta}) = {\boldsymbol{\gamma}}_s^T \mathbf{Y}_s + \hat{\theta}_r(\pmb{\delta}), (\#eq:BLUP)$$ +where the predictor of the linear combination +$\boldsymbol{\gamma}_r^T \mathbf{Y}_r$ of unobserved random variables is +given by +$\hat{\theta}_r(\pmb{\delta})={\boldsymbol{\gamma }}_r^T {{\mathbf{X}}_r}{\tilde{\boldsymbol{\beta}} }(\pmb{\delta}) +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{\tilde{v}}}(\pmb{\delta})$, +where $\tilde{\boldsymbol{\beta}}(\pmb{\delta})$ is the Best Linear +Unbiased Estimator of $\boldsymbol{\beta}$ and +$\tilde{\mathbf{v}}(\pmb{\delta})$ is the Best Linear Unbiased Predictor +of $\mathbf{v}$, both presented in (\@ref(eq:LMM)). As shown by +[@zadlo2017EBLUP] p. 
8094, if +$Cov(\mathbf{e}_r, \mathbf{e}_s)=\mathbf{0}$, then the predictor +(\@ref(eq:BLUP)) is the BLUP of $\theta$ defined as the linear +combination (\@ref(eq:l-theta)). Even if +$Cov(\mathbf{e}_r, \mathbf{e}_s) \neq \mathbf{0}$, the predictor +$\hat{\theta}_r(\pmb{\delta})$ is the Best Linear Unbiased Predictor of +the following linear combination of $\boldsymbol{\beta}$ and +$\mathbf{v}$: +${\boldsymbol{\gamma }}_r^T{{\mathbf{X}}_r}{ {\boldsymbol{\beta}} } +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{{v}}}$. +The EBLUP $\hat\theta^{EBLUP}$ is obtained by replacing the vector of +variance components $\pmb{\delta}$ in BLUP (\@ref(eq:BLUP)) with the +estimator $\hat{\pmb{\delta}}$. If (a) the expectation of the predictor +is finite, (b) $\hat{\pmb{\delta}}$ is any even, translation-invariant +estimator of $\pmb{\delta}$, (c) the distributions of both random +effects and random components are symmetric around $\mathbf{0}$ (not +necessarily normal), the EBLUP remains unbiased, as proved by +[@kackar1981unbiasedness]. + +To introduce the second predictor, called EBP, considered e.g. by +[@molina2010small], firstly, the Best Predictor (BP) $\hat{\theta}^{BP}$ +of characteristic $\theta(\mathbf{Y})$ has to be defined. It is computed +by minimizing the Mean Squared Error +$MSE(\hat\theta )=E(\hat\theta - \theta)^2$ and can be written as +$\hat\theta^{BP} = E(\theta|\mathbf{Y}_s)$. It means that the +conditional distribution of $\mathbf{Y}_r|\mathbf{Y}_s$ must be known to +compute its value while at least the parameters of this distribution, +denoted by $\boldsymbol{\psi}$ in (\@ref(eq:LMM)), are unknown. The EBP +$\hat\theta^{EBP}$ is obtained by replacing these parameters with +estimators $\hat{\boldsymbol{\psi}}$. Its value can be computed +according to the Monte Carlo procedure presented in the supplementary +document for this paper. + +The last predictor is the PLUG-IN predictor defined as (e.g. 
+
+[@chwila2019properties]):
+$$\hat{\theta}^{PLUG-IN}=\theta(\begin{bmatrix}
+ \mathbf{Y}_s^T & \mathbf{\hat{Y}}_r^T
+ \end{bmatrix}^T),$$
+where $\mathbf{\hat{Y}}_r$ is the vector of fitted values of unobserved
+random variables under the assumed model (any model specified by the
+statistician). Under the LMM and if the linear combination of
+$\mathbf{Y}$ is predicted, the PLUG-IN predictor is the EBLUP, but
+generally, it is not optimal. However, it was shown in simulation
+studies that it can have similar or even higher accuracy compared to
+empirical (estimated) best predictors, where the best predictors
+minimize the prediction mean squared errors (cf. e.g.
+[@boubeta2016empirical], [@chwila2019properties],
+[@hobza2016empirical]). Moreover, the PLUG-IN predictor is less
+computationally demanding than the EBP.
+
+### Predictors in [**qape**](https://CRAN.R-project.org/package=qape)
+
+To deal with the LMM model, the
+[**qape**](https://CRAN.R-project.org/package=qape) package uses the
+`lmer()` function from the
+[**lme4**](https://CRAN.R-project.org/package=lme4) package, see
+[@lme4]. Assuming (\@ref(eq:LMM)) and based on $\mathbf{Y}_s$, the
+vector of model parameters
+$\boldsymbol{\psi} = [\boldsymbol{\beta}^T, \pmb{\delta}^T]^T$ is
+estimated using the Restricted Maximum Likelihood Method (REML), known
+to be robust to non-normality, see e.g. [@jiang1996reml], and
+$\hat{\boldsymbol{\psi}}$ is obtained.
+
+In order to obtain the predictor of $\theta$, one of the three
+[**qape**](https://CRAN.R-project.org/package=qape) functions can be
+applied: `EBLUP()`, `ebpLMMne()` or `plugInLMM()`. Firstly, the
+characteristic of response variables of interest has to be defined. It
+is actually obvious for EBLUP, which can be used only to predict the
+population/subpopulation linear combination (e.g. the sum) by using the
+argument `gamma` equivalent to the population vector of weights
+$\boldsymbol{\gamma}$ in (\@ref(eq:l-theta)). 
For the other two predictors,
+the EBP and the PLUG-IN, the input argument called `thetaFun` has to be
+given (see $f_{\theta}(.)$ in (\@ref(eq:ftheta))). Function `thetaFun`
+could define one characteristic or a vector of characteristics, for
+example:
+
+``` r
+> thetaFun1 <- function(x) median(x)
+> thetaFun2 <- function(x) c(sum(x), mean(x), sd(x))
+```
+
+Secondly, two groups of input arguments, common to all three predictors,
+have to be provided:
+
+- group 1 - arguments defining the sample and the population
+
+    - `YS` - values of the dependent variable in the sample
+      ($\mathbf{Y}_s$),
+
+    - `reg` - the population matrix of auxiliary variables named in
+      `fixed.part`, `random.part` and `division`,
+
+    - `con` - the population $0-1$ vector with $1$s for elements in
+      the sample and $0$s for elements which are not in the sample,
+
+- group 2 - arguments defining the model
+
+    - `fixed.part` - fixed-effects terms declared as in `lme4::lmer`
+      function,
+
+    - `random.part` - random-effects terms declared as in `lme4::lmer`
+      function,
+
+    - `weights` - the population vector of weights.
+
+The weights make it possible to include heteroscedasticity of random
+components in the LMM.
+
+In `EBLUP()` and `plugInLMM()` the random-effects terms of the LMM have
+to be declared as the input argument `random.part`. The form of the
+`ebpLMMne` predictor, in turn, requires defining in the `ebpLMMne()`
+function the so-called `division` argument instead of `random.part`.
+This input represents the variable dividing the population dataset into
+subsets, which are taken into account in the nested error linear mixed
+model with '`division`'-specific random components (presented in
+supplementary document for this paper).
+
+In the process of prediction, it is often necessary to perform data
+transformation before estimating the model parameters. An example is the
+logarithmic scaling of the variable of interest. 
The +[**qape**](https://CRAN.R-project.org/package=qape) package offers the +possibility for declaring the argument `backTrans` to conduct the data +back-transformation. Hence, a very flexible solution is used which +allows to use any transformation of the response variable such that the +back-transformation can be defined. This argument (available in R or +defined by the user function) should be the back-transformation function +of the already transformed dependent variable used to define the model, +e.g. for log-transformed `YS` used as the response variable: + +``` r +> backTrans <- function(x) exp(x) +``` + +The main output is the value of predictor `thetaP`. For each class of +predictors, there are two S3 methods registered for existing generic +functions `print` and `summary`. The full list of output arguments is +presented in detail in the `qape-manual` file, cf. [@qape]. + +### Radon data and the model + +In order to demonstrate the functionality of the package's main +functions, in the following examples the `radon` dataset available in +[**HLMdiag**](https://CRAN.R-project.org/package=HLMdiag) package +([@HLMdiag]) is analyzed. It contains the results of a survey measuring +radon concentrations in 919 owner-occupied homes in 85 counties of +Minnesota (see Figure \@ref(fig:map)). A study was conducted in +1987-1988 by the Minnesota Department of Health, showing that indoor +radon levels are higher in Minnesota compared to typical levels in the +U.S. In the data, the response variable `log.radon` (denoted in +(\@ref(eq:radon-model)) by $log(Y_{ic})$) is the radon measurement in +logarithms of picoCurie per liter. 
The independent variables, on the +other hand, are: `uranium` ($x_{1ic}$) the average county-level soil +uranium content, `basement` ($x_{2ic}$) the 0-1 variable indicating the +level of the home at which the radon measurement was taken - 0 for +basement, 1 for the first floor, and `county` (denoted by subscript $c$ +in (\@ref(eq:radon-model))) is county ID. + +```{r map, echo=FALSE , fig.cap="The maps of characteristics of radon concentration in counties in picoCurie per liter. The gray colour means that the value is NA (Not Available)", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("mapaAll.png")) +``` + +In all considered examples, the prediction for the county no. 26 +(`county == 26`) is conducted and it is assumed that the observations in +this county from the first floor (`basement == 1`) are not available +(see Figure \@ref(fig:boxplot)). + +```{r boxplot, echo=FALSE , fig.cap="The distributions of radon concentration in picoCurie per liter in counties. The red line indicates county no. 26", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("boxAll.png")) +``` + +The `radon` dataset is widely discussed in the literature. In the paper +[@nero1994statistically], the Authors used an ordinary regression model +to predict county geometric means of radon concentration using surficial +soil radium data from the National Uranium Resource Evaluation. In turn, +the paper [@price1996bayesian] focuses on the prediction of the +geometric mean of radon for each county, but using a Bayesian approach. 
+For the `radon` data we use the following model +$$\label{radon.model} + log(Y_{ic}) = \beta_1 x_{1ic} + (\beta_2 + v_{1c}) x_{2ic} + \beta_0 + v_{2c} + e_{ic}, (\#eq:radon-model)$$ +where $i=1,2,\dots,N$, $c=1,2,\dots, C$, $N = 919$ observations, +$C = 85$ counties, $\beta_1$, $\beta_2$ and $\beta_0$ are unknown fixed +effects, $v_{1c}$ and $v_{2c}$ are random effects, $e_{ic}$ are random +components, $v_{1c}$, and $e_{ic}$ are mutually independent, $v_{2c}$ +and $e_{ic}$ are mutually independent too, $Cor(v_{1c}, v_{2c}) = \rho$, +$v_{1c} \sim (0, \sigma^2_{v_1})$, $v_{2c} \sim (0, \sigma^2_{v_2})$ and +$e_{ic} \sim (0, \sigma^2_e)$. As can easily be seen, the considered +model is the random coefficient model with two correlated +`county`-specific random effects. Its syntax written using the package +[**lme4**](https://CRAN.R-project.org/package=lme4) notation is as +follows: + +``` r +radon.model <- lmer(log.radon ~ basement + uranium + (basement | county), data = radon) +``` + +This and similar LMMs are considered, analyzed, and used for the +considered dataset in many publications, with a good overview presented +in [@gelman_data_2006]. In [@gelman2006bayesian], based on their +preceding research [@price1996bayesian], [@gelman1999analysis], +[@peck_should_2005], a very similar model but with additional +multivariate normality assumptions is studied, verified and chosen as +fitting well to the data within a Bayesian framework. The same model as +in [@gelman2006bayesian] with its special cases is considered in +[@cantoni2021review] but within the frequentist approach. Based on 25 +measures of explained variation and model selection, the Authors +conclude that the same model as considered in our paper (with additional +normality assumption, however, which is not used in all cases considered +in that paper), \"seems the best\" [@cantoni2021review p. 10] for the +`radon` data. 
Further tests of the model are presented by
+[@loy2013diagnostics], [@loy2015you] and [@loy2017model] (see also
+[@cook2007interactive] for the introduction of the methodology) showing
+among others: the normality and homoscedasticity of random components,
+the normality of the distribution of the random slope but -- what is
+important for our further considerations -- the lack of the normality of
+the random intercept. Since the problem of choosing and verifying a
+model for the considered dataset is widely discussed in the literature,
+we will focus on the issues that are new in this case, namely the
+problem of prediction and estimation of the prediction accuracy as well
+as the Monte Carlo analysis of predictors' properties.
+
+### Example 1
+
+This example shows the prediction procedure in the package
+[**qape**](https://CRAN.R-project.org/package=qape). In the first step,
+it is needed to define all the input arguments that will then be passed
+to the prediction functions.
+
+``` r
+> Ypop <- radon$log.radon # the population vector of the dependent variable
+> # It is assumed that observations from the first floor
+> # in county no. 26 are not available:
+> con <- rep(1, nrow(radon))
+> con[radon$county == 26 & radon$basement == 1] <- 0
+> YS <- Ypop[con == 1] # sample vector of the dependent variable
+> reg <- dplyr::select(radon, -log.radon) # the population matrix of auxiliary variables
+> fixed.part <- 'basement + uranium' # the fixed part of the considered model
+> random.part <- '(basement|county)' # the random part of the considered model
+> # The vector of weights to define
+> # the predicted linear combination - the mean for county == 26:
+> gamma <-
++ (1 / sum((radon$county == 26))) * ifelse((radon$county == 26), 1, 0)
+> estMSE <- TRUE # to include the naive MSE estimator of the EBLUP in the output
+```
+
+Then the functions corresponding to each predictor can be used. 
First,
+the EBLUP prediction in the package
+[**qape**](https://CRAN.R-project.org/package=qape) is presented. As the
+EBLUP is limited to the linear combination of random variables, the
+predicted characteristic is simply the arithmetic mean. To be precise,
+it is the mean of logarithms of measurements (instead of the mean of
+measurements), because the EBLUP can be used only under the linear
+(linearized) models. As in the LMM the homoscedasticity of random
+components is assumed, the input argument `weights = NULL` is set up.
+
+``` r
+> myeblup <- EBLUP(YS, fixed.part, random.part, reg, con, gamma, weights = NULL, estMSE)
+> # the value of the predictor of the arithmetic mean
+> # of logarithms of radon measurements:
+> myeblup$thetaP
+[1] 1.306916
+> myeblup$neMSE # the value of the naive MSE estimator
+[1] 0.002292732
+```
+
+Hence, the predicted value of the arithmetic mean of logarithms of radon
+measurements equals $1.306916$ log picoCurie per liter. The estimated
+root of prediction MSE equals $\sqrt{0.002292732} \approx 0.048$ log
+picoCurie per liter, but -- what is important -- it is the value of the
+naive RMSE estimator [as defined by @rao2015small p. 106], which means
+that it ignores the decrease of accuracy due to the estimation of model
+parameters.
+
+The second part of this example shows the prediction of the arithmetic
+mean, geometric mean and median of radon measurements (not logarithm of
+radon measurements) in county no. 26 with the use of the PLUG-IN
+predictor. It requires the setting of two input arguments: `thetaFun`
+and `backTrans`. 
+
+``` r
+> thetaFun <- function(x) {
++ c(mean(x[radon$county == 26]), psych::geometric.mean(x[radon$county == 26]),
++ median(x[radon$county == 26]))
++ }
+> backTransExp <- function(x) exp(x) # back-transformation
+> myplugin <- plugInLMM(YS, fixed.part, random.part, reg, con, weights = NULL,
++ backTrans = backTransExp, thetaFun)
+> # values of the predictor of arithmetic mean, geometric mean
+> # and median of radon measurements:
+> myplugin$thetaP
+[1] 3.694761 4.553745 3.900000
+```
+
+In this case we can conclude that the predicted values of the
+arithmetic mean, geometric mean and median in county no. 26 equal:
+$3.694761$, $4.553745$ and $3.9$ picoCurie per liter, respectively. The
+problem of prediction accuracy estimation will be discussed in the next
+sections of the paper.
+
+The [**qape**](https://CRAN.R-project.org/package=qape) package allows
+to use the Empirical Best Predictor (EBP) (see the supplementary
+document for this paper) as well. It provides predicted values of any
+function of the variable of interest, as the PLUG-IN predictor. However,
+this requires stronger assumptions to be met. The EBP procedure
+available in [**qape**](https://CRAN.R-project.org/package=qape) package
+is prepared under the assumption of the normality of the variable of
+interest after any transformation. However, in the case of the
+considered model for logarithms of radon measurements, the assumption is
+not met as we mentioned before based on the results presented in the
+literature. 
It can also be verified using `normCholTest` function +(available in [**qape**](https://CRAN.R-project.org/package=qape) +package) as follows: + +``` r +> normCholTest(radon.model, shapiro.test)$p.value +[1] 2.589407e-08 +``` + +Moreover, due to the fact of very time-consuming iterative procedure +used to compute the EBP for the general case, in the +[**qape**](https://CRAN.R-project.org/package=qape) package the function +`ebpLMMne` uses a very fast procedure working only for nested error +Linear Mixed Models (see [@molina2010small]). + +The prediction of any function of the random variables based on +cross-sectional data has been considered. Its special case, not +presented above but widely discussed in the econometric literature, is +the prediction of one random variable, in this case a radon measurement +for one non-observed owner-occupied home. Furthermore, the +[**qape**](https://CRAN.R-project.org/package=qape) package is also +designed for prediction based on longitudinal data for current or future +periods as shown in examples for the `EBLUP`, `plugInLMM` and `ebpLMMne` +functions in the `qape-manual` file, cf. [@qape]. + +## Bootstrap procedures + +The [**qape**](https://CRAN.R-project.org/package=qape) package provides +three main types of bootstrap algorithms: the parametric bootstrap, the +residual bootstrap and the double-bootstrap. + +The parametric bootstrap procedure is implemented according to +[@gonzales2007] and [@gonzales2008] and could be described in the +following steps: + +1. based on $n$ observations of the dependent and independent variables + ($\mathbf{Y}_s$, $\mathbf{X}_s$ and $\mathbf{Z}_s$) estimate + $\boldsymbol{\psi}$ to obtain the vector of estimates + $\boldsymbol{\hat{\psi}}$, + +2. 
generate $B$ realizations $y_{i}^{*(b)}$ of $Y_{i}$, under the + $LMM(\mathbf{X}, \mathbf{Z}, \hat{\boldsymbol{\psi}})$ and + multivariate normality of random effects and random components + obtaining\ + $\mathbf{y}^{*(b)}=\begin{bmatrix} + y_{1}^{*(b)} & ... & y_{i}^{*(b)} &... & y_{N}^{*(b)} + \end{bmatrix}^T$, where $i=1, 2, ... ,N$ and $b=1, 2, ... ,B$, + +3. decompose the vector $\mathbf{y}^{*(b)}$ as follows $\begin{bmatrix} + \mathbf{y}_s^{*(b)T} & \mathbf{y}_r^{*(b)T} + \end{bmatrix}^T$, + +4. in the $b$th iteration ($b=1,2,...,B$) + + 1. compute the bootstrap realization + $\theta^{*(b)}=\theta^{*(b)}(\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}})$ + of random variable $\theta$, + + 2. obtain the vector of estimates $\boldsymbol{\hat{\psi}}^{*(b)}$ + using $\mathbf{y}_s^{*(b)}$ and compute the bootstrap + realization of predictor $\hat{\theta}$ denoted by + $\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})$ + based on + $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\hat{\psi}}^{*(b)})$, + + 3. compute bootstrap realizations of prediction error $U^*$ denoted + by $u^*$ and for the $b$th iteration given by: + $$\label{u*b} + u^{*(b)}=\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})-\theta^{*(b)} + (\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}}) =\hat{\theta}^{*(b)}-\theta^{*(b)}, (\#eq:u*b)$$ + +5. compute the parametric bootstrap estimators of prediction accuracy + measures: RMSE and QAPE replacing prediction errors $U$ in + (\@ref(eq:eq0)) and (\@ref(eq:eq1)) by their bootstrap realizations. + +Another possible method to estimate the prediction accuracy measures is +the residual bootstrap. In what follows, we use the notation +$srswr(\mathbf{A}, m)$ to indicate the outcome of taking a simple random +sample with replacement of size $m$ of rows of matrix $\mathbf{A}$. If +$\mathbf{A}$ is a vector, it simplifies to a simple random sample with +replacement of size $m$ of elements of $\mathbf{A}$. 
+ +To obtain the algorithm of the residual bootstrap, it is enough to +replace step 2 of the parametric bootstrap procedure presented above +with the following procedure of the population data generation based on +(\@ref(eq:LMMa)): + +- generate $B$ population vectors of the variable of interest, denoted + by $\mathbf{y}^{*(b)}$ as + $$\label{LMMboot} + \mathbf{y}^{*(b)}=\mathbf{X}\hat{\boldsymbol{\beta}} + \mathbf{Z}_1\mathbf{v}^{*(b)}_1+...+\mathbf{Z}_l\mathbf{v}^{*(b)}_l+...+\mathbf{Z}_L\mathbf{v}^{*(b)}_L+\mathbf{e}^{*(b)}, (\#eq:LMMboot)$$ + where $\hat{\boldsymbol{\beta}}$ is an estimator (e.g. REML) of + ${\boldsymbol{\beta}}$, $\mathbf{e}^{*(b)}$ is a vector of dimension + $N \times 1$ defined as + $srswr(col_{1 \leq i \leq n } \hat{{e}}_{i}, N)$, where + $\hat{{e}}_{i}$ ($i=1,2,...,n$) are residuals, $\mathbf{v}^{*(b)}_l$ + (for $1,2,...,L$) is the vector of dimension $K_l J_l \times 1$ + built from the columns of the matrix: $srswr \left( + \left[ \begin{array}{ccccc} + \hat{\mathbf{v}}_{l1} & + \dots & + \hat{\mathbf{v}}_{lk} & + \dots & + \hat{\mathbf{v}}_{lK_l} + \end{array} + \right], J_l + \right)$ of dimension $J_l \times K_l$, where + $\hat{\mathbf{v}}_{lk}$ are estimates of elements of random effects + vector (\@ref(eq:vl)). + +The next 3--5 steps in this procedure are analogous to steps in the +parametric bootstrap procedure. + +In the above-described step, it can be seen that if more than one vector +of random effect is assumed at the $l$th level of grouping, then the +elements are not sampled with replacement independently. In this case, +rows of the matrix formed by these vectors are sampled with replacement. + +The residual bootstrap algorithm can also be performed with so-called +\"correction procedure\". This procedure, which can improve the +properties of the residual bootstrap estimators due to the +underdispersion of the uncorrected residual bootstrap distributions, is +presented in the supplementary document for this paper. 
+ +## Bootstrap in [**qape**](https://CRAN.R-project.org/package=qape) + +Two bootstrap procedures are implemented in separate functions: +`bootPar()` (the parametric bootstrap) and `bootRes()` (the residual +bootstrap). According to the general Procedure [1](#Proc1), the step +preceding the bootstrap procedure in both functions is the definition of +the predictor object. It must be one of the following: `EBLUP`, +`ebpLMMne` or `plugInLMM`. This object has to be passed to `bootPar()` +or `bootRes()` as the input parameter `predictor`. The other input +parameters are intuitive: `B` - the number of bootstrap iterations and +`p` - order of quantiles in the estimated QAPEs. + +The additional input parameter in `bootRes()` is a logical condition +called `correction`, which makes it possible to include an additional +correction term for both random effects and random components, presented +in the supplementary document for this paper, to avoid the problem of +underdispersion of residual bootstrap distributions. + +The main output values in both functions are basically the measures: +`estRMSE` and `estQAPE` computed based on (\@ref(eq:eq0)) and +(\@ref(eq:eq1)), respectively, where prediction errors are replaced by +their bootstrap realizations. There is also the output `error` being the +vector of bootstrap realizations of prediction errors, which is useful +e.g. in in-depth analysis of the prediction accuracy and for graphical +presentation of results. To estimate these accuracy measures, we use +below the residual bootstrap with the correction procedure. + +As previously stated, our package utilizes the `lmer()` function from +the [**lme4**](https://CRAN.R-project.org/package=lme4) package for +estimating model parameters. However, this function has been known to +generate convergence warnings in certain situations, listed for example +by [@lme4] p. 25, when the estimated variances of random effects are +close to zero. 
Such scenarios may occur when models are estimated for +smaller or medium-sized datasets, when complex variance-covariance +structures are assumed, or when the grouping variable considered for +random effects has only a few levels. Although we have not observed such +issues estimating model parameters based on the original dataset +required to compute values of the predictors in previous sections, +bootstrapping or Monte Carlo simulations are more complex cases. This is +because, based on the estimates of model parameters, the values of the +dependent variables are generated $B$ times, and then model parameters +are estimated in each out of $B$ iterations. Therefore, in at least some +iterations, dependent variable values may be randomly generated giving +realizations, where the variance of the random effect is relatively +close to zero. As a result, estimates of model parameters can be +obtained; however, convergence issues implying warnings may occur. In +such cases, there are at least two possible solutions. The first option +is to discard iterations with warnings, which would imply that the +dependent variable would not follow the assumed model as required, but +instead only its conditional version with relatively high values of +variances of random effects. It will imply overdispersed bootstrap +distribution of random effects, which will affect the bias of the +bootstrap estimators of accuracy measures. The second option is to +consider all generated realizations, despite convergence warnings, as +long as the parameters can be estimated for all iterations. We opted for +the latter solution, as argued in [@lme4] p. 25, who noted that \"being +able to fit a singular model is an advantage: when the best fitting +model lies on the boundary of a constrained space\". + +### Example 2 + +The analyses presented in Example 1 are continued. We extend the +previous results to include the issue of estimating the prediction +accuracy of the considered predictors. 
The use of functions for this +estimation primarily requires an object of class predictor, here +\"myplugin\". + +``` r +> class(myplugin) +[1] "plugInLMM" +``` + +The short chunk of the R code presents the residual bootstrap estimators +of the RMSE (`estRMSE`) and the QAPE (`estQAPE`) of the PLUG-IN +predictors (`plugin`) of previously analyzed three characteristics of +radon measurements in county no. 26: the arithmetic mean, geometric mean +and median. In this and subsequent examples we make the computations for +relatively high number of iterations allowing, in our opinion, to get +reliable results. These results are also used to prepare Figure +\@ref(fig:hist). However, the computations are time-consuming. The +supplementary R file contains the same chunks of the code but the number +of iterations applied is smaller in order to execute the code swiftly. + +``` r +> # accuracy measures estimates based on +> # the residual bootstrap with the correction: +> B <- 500 # number of bootstrap iterations +> p <- c(0.75, 0.9) # orders of Quantiles of Absolute Prediction Error +> set.seed(1056) +> residBoot <- bootRes(myplugin, B, p, correction = TRUE) +> # values of estimated RMSEs of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, respectively: +> residBoot$estRMSE +[1] 0.1848028 0.2003681 0.2824359 +> # values of estimated QAPEs +> # (of order 0.75 in the first row, and of order 0.9 in the second row) +> # of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, +> # in the 1st, 2nd and 3rd column, respectively: +> residBoot$estQAPE + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +``` + +Let us concentrate on interpretations of estimators of accuracy measures +for the predictor of the geometric mean, i.e. the second value of +`residBoot$estRMSE`, and values in the second column of +`residBoot$estQAPE`. 
It is estimated that the average difference between +predicted values of the geometric mean and their unknown realizations +equals $0.2003681$ picoCurie per liter. Furthermore, it is estimated +that at least $75\%$ of absolute prediction errors of the predictor of +the geometric mean are smaller or equal to $0.2135476$ picoCurie per +liter and at least $25\%$ of absolute prediction errors of the predictor +are higher or equal to $0.2135476$ picoCurie per liter. Finally, it is +estimated that at least $90\%$ of absolute prediction errors of the +predictor of the geometric mean are smaller or equal to $0.3397411$ +picoCurie per liter and at least $10\%$ of absolute prediction errors of +the predictor are higher or equal to $0.3397411$ picoCurie per liter. +The distributions of bootstrap absolute prediction errors with values of +estimated RMSEs and QAPEs for the considered three prediction problems +are presented in Figure \@ref(fig:hist). + +```{r hist, echo=FALSE , fig.cap="The histograms of bootstrap absolute prediction errors for myplugin (for PLUG-IN predictors of the arithmetic mean, geometric mean and median) for B=500", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("histAll.png")) +``` + +Since the assumption of normality is not met, the parametric bootstrap +should not be used in this case. For this reason, we do not present the +results for this method below, although -- but for illustrative purposes +only -- they are presented in the supplementary R file. Moreover, these +analyses can also be conducted using `bootParFuture()` and +`bootResFuture()` functions where parallel computing algorithms are +applied. The input arguments and the output of these functions are the +same as in `bootPar()` and `bootRes()`. Examples based on these +functions are also included in the supplementary R file. 
+
+## Bootstrap under the misspecified model in [**qape**](https://CRAN.R-project.org/package=qape)
+
+The [**qape**](https://CRAN.R-project.org/package=qape) package also
+allows to use predictors under a model different from the assumed one
+(e.g. a simpler or more robust model), but estimate its accuracy under
+the assumed model. In this case, the parametric and residual bootstrap
+procedures are implemented in `bootParMis()` and `bootResMis()`
+functions. These functions allow to estimate the accuracy of two
+predictors under the model correctly specified for the first of them. Of
+course, it is expected that the estimated accuracy of the first
+predictor will be better than of the second one, but the key issue can
+be the difference between estimates of accuracy measures. A small
+difference, even to the second predictor's disadvantage, may be treated
+by the user as an argument for using the second predictor due to its
+properties, such as robustness or simplicity.
+
+The considered functions allow to estimate the accuracy of two
+predictors, which belong to the class `plugInLMM`, under the model used
+to define the first of them. The remaining arguments are the same as in
+`bootPar()` and `bootRes()` functions: `B` - the number of bootstrap
+iterations, and `p` - orders of QAPE estimates to be taken into account.
+
+The output results of `bootParMis()` and `bootResMis()` include --
+similarly to `bootPar()` and `bootRes()` functions -- estimates of the
+RMSEs and QAPEs of both predictors (denoted here by: `estRMSElmm`,
+`estRMSElmmMis`, `estQAPElmm` and `estQAPElmmMis`), and bootstrap
+realizations of their prediction errors (`errorLMM` and `errorLMMmis`).
+
+### Example 3
+
+In this example, we study the same accuracy measures as in Example 2,
+but the aim is to compare the predictor `myplugin` and another predictor
+defined under the misspecified LMM. First, the misspecified model has to
+be defined, and a relevant predictor has to be computed.
+ +``` r +> fixed.part.mis <- '1' +> random.part.mis <- '(1|county)' +> myplugin.mis <- plugInLMM(YS, fixed.part.mis, random.part.mis, reg, con, ++ weights = NULL, backTrans = backTransExp, thetaFun) +``` + +Having two objects: `myplugin` and `myplugin.mis`, one can proceed to a +comparison by estimating bootstrap prediction accuracy performed using +the residual bootstrap with correction procedure. In this case, we +estimate the prediction accuracy of these two predictors under the model +used to define the first of them. + +``` r +> set.seed(1056) +> residBootMis <- bootResMis(myplugin, myplugin.mis, B, p, correction = TRUE) +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmm +[1] 0.1848028 0.2003681 0.2824359 +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmmMis +[1] 0.1919184 0.3192304 0.2762137 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmm + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmmMis + [,1] [,2] [,3] +75% 0.2267062 0.3802836 0.3255197 +90% 0.2813787 0.4970726 0.4489399 +``` + +The results, presented above, were obtained for the same number of +bootstrap iterations as in Example 2 ($B = 500$). 
If we compare, under +the model defined in `plugin`, estimated RMSEs of `plugin` and +`plugin.mis` predictors of the geometric mean given by $0.2003681$ and +$0.3192304$ picoCurie per liter, respectively, we can state that the +estimated accuracy (measured by RMSE estimators) of the first predictor +is better comparing with the second one. If we are not interested in the +average accuracy measures but in the right tail of the distribution of +prediction errors, we can use estimates of QAPE of order 0.9 to compare +the accuracy. The result for the `plugin.mis` of the geometric mean +equals to $0.4970726$ picoCurie per liter, and it is higher comparing +with $0.3397411$ picoCurie per liter obtained for `plugin` for the same +prediction problem. Hence, in this case, the accuracy comparison based +both on the RMSE and QAPE leads to the same finding. + +In the previous paragraph, we have focused on the results for the case +of prediction of the geometric mean. If the comparison is made for the +case of prediction of the arithmetic mean (the first column of output +results) or the median (the third column of output results), we will +come to the same conclusion regarding the estimated accuracy of `plugin` +and `plugin.mis` as in the case of prediction of the geometric mean. + +Similarly to the residual bootstrap, the parametric bootstrap procedure +`paramBootMis` available in +[**qape**](https://CRAN.R-project.org/package=qape) package can be +performed. However, in the considered case the normality assumption is +not met (as discussed above) and the procedure is not recommended. The +appropriate chunk of the R code is presented in the supplementary R +file, but it is solely intended for illustrative purposes. + +## Monte Carlo simulation analyses + +In the previous section, our aim was to estimate the prediction accuracy +under correctly specified or misspecified model. 
In this section, we do +not estimate the accuracy, but we approximate the true prediction +accuracy under the specified model in the Monte Carlo simulation study. +The crucial difference is that in this case, the model parameters used +are obtained based on the whole population dataset, not the sample. If +the number of iterations is large enough, we can treat the computed +values of the measures as their true values, which are unknown in +practice. + +The last step of the analysis in +[**qape**](https://CRAN.R-project.org/package=qape) package presented in +Procedure [1](#Proc1) is the Monte Carlo (MC) simulation analysis of: + +- properties of predictors + +- and properties of parametric, residual and double bootstrap + estimators of accuracy measures. + +The whole Monte Carlo procedure is as follows. + +::: {#Proc2 .procedure} +**Procedure 2**. *Model-based Monte Carlo simulation analyses in +[**qape**](https://CRAN.R-project.org/package=qape) * + +1. *define the population vector of the dependent variable and the + population matrix of auxiliary variables,* + +2. *provide the information on the division of the population into the + sampled and non-sampled part,* + +3. *define $\theta$ - the characteristics of the response variable to + be predicted,* + +4. *define the predictors $\hat{\theta}$ and accuracy measures + estimators which properties are to be assessed,* + +5. *define the model to be used to generate realizations of the values + of the dependent variable and estimate its parameters based on + population data,* + +6. *For k=1, 2, \..., K* + + 1. *generate the population vector of the response variable based + on the assumed model,* + + 2. *based on population data, compute the characteristics $\theta$, + denoted by $\theta_k$,* + + 3. *based on sample data, estimate the parameters of the LMM,* + + 4. *based on sample data, compute values of predictors + $\hat{\theta}$, denoted by $\hat{\theta}_k$,* + + 5. 
*based on sample data, estimate the accuracy of $\hat{\theta}$
+ using bootstrap methods,*
+
+7. *End For*
+
+8. *compute accuracy measures of predictors using $\hat{\theta}_k$ and
+ $\theta_k$ (for $k=1,2, ..., K$),*
+
+9. *compute accuracy measures of estimators of prediction accuracy
+ measures.*
+:::
+
+## Monte Carlo analyses in [**qape**](https://CRAN.R-project.org/package=qape)
+
+In order to perform a Monte Carlo (MC) analysis on the properties of
+predictors, it is necessary to have access to the entire population data
+for both dependent and independent variables. The function `mcLMMmis()`
+can be used with the following arguments. Firstly, the population values
+of the dependent variable (after a necessary transformation) should be
+declared as `Ypop`. By using the `Ypop` values, we can estimate the
+model parameters based on the entire population data (assuming that they
+are known). This allows us to generate values of the dependent variable
+in the simulation study that can mimic its distribution in the entire
+population, not just in the sample. This approach ensures that our
+simulation study can be an accurate representation of the random process
+in the entire population, resembling the real-world scenario. Secondly,
+three predictors: `predictorLMMmis`, `predictorLMM`, `predictorLMM2`,
+which belong to the class `plugInLMM`, are to be defined. The first one
+is used only to define the (possibly misspecified) model used to
+generate population values of the response variables. Accuracy of
+`predictorLMM` and `predictorLMM2` is assessed in the simulation study.
+The next two arguments include the number of MC iterations `K` and
+orders `p` of QAPEs used to assess the prediction accuracy. Finally, it
+should be noted that it is possible to modify covariance matrices of
+random components and random effects based on the model defined in
+`predictorLMMmis`, which are used to generate values of the
+dependent variable.
It is possible by declaring values of `ratioR` and +`ratioG` arguments, which the diagonal elements of covariance matrices +of random components and random effects, respectively, are divided by. + +The output of this function covers the following statistics of both +predictors computed in the simulation study: relative biases (`rBlmm` +and `rBlmm2`), relative RMSEs (`rRMSElmm` and `rRMSElmm2`) and QAPEs +(`QAPElmm` and `QAPElmm2`). Simulation-based prediction errors of both +predictors (`errorLMM` and `errorLMM2`) are also taken into account. + +### Example 4 + +In the example, an MC simulation is carried out assuming the `myplugin` +predictor. The goal is to approximate the true accuracy of the +prediction assuming model (\@ref(eq:radon-model)). Hence, in the package +[**qape**](https://CRAN.R-project.org/package=qape), all input predictor +objects in the function `mcLMMmis` have to be defined as `myplugin`.   + +``` r +> # input arguments: +predictorLMMmis <- myplugin # to define the model +predictorLMM <- myplugin # which properties are assessed in the simulation study +predictorLMM2 <- myplugin # which properties are assessed in the sim. study +``` + +Except that no modification of covariance matrices has to be used. + +``` r +# diag. elements of the covariance matrix of random components are divided by: +ratioR <- 1 +# diag. elements of the covariance matrix of random effects are divided by: +ratioG <- 1 +``` + +We specify the number of Monte Carlo iterations. + +``` r +K <- 500 # the number of MC iterations +``` + +The analysis is conducted in the object `MC`. + +``` r +> set.seed(1086) +> MC <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2, ++ K, p, ratioR, ratioG) +> # relative bias of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26 (in %): +> MC$rBlmm +[1] -1.73208393 -0.04053178 -5.22355236 +``` + +Results of the relative biases are obtained. 
It is seen, that under the +assumed model the values of the considered predictor of the geometric +mean (the second value of `MC$rBlmm`) are smaller than possible +realizations of the geometric mean on average by $0.04053178\%$. In +turn, the relative RMSEs are as follows. + +``` r +> # relative RMSE of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26 (in %): +> MC$rRMSElmm +[1] 3.429465 4.665810 7.146678 +``` + +In the considered case, the average difference between predicted values +of the geometric mean and its possible realizations (the second value of +`MC$rRMSElmm`) equals $4.665810\%$. It should be noted that this value +can be treated as the true value of the relative RMSE (if the number of +iterations is large enough), not the estimated value obtained in +Examples 2 and 3. + +Finally, QAPEs of orders 0.75 and 0.9 are considered. + +``` r +> # QAPE of order 0.75 and 0.9 of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26: +> MC$QAPElmm + [,1] [,2] [,3] +75% 0.1491262 0.1989504 0.2919221 +90% 0.2895684 0.2959457 0.4728064 +``` + +Let us interpret the results presented in the second column of +`MC$QAPElmm`. At least $75\%$ ($90\%$) of absolute prediction errors of +the predictor of the geometric mean are smaller or equal to $0.1989504$ +($0.2959457$) picoCurie per liter and at least $25\%$ ($10\%$) of +absolute prediction errors of the predictor are higher or equal to +$0.1989504$ ($0.2959457$) picoCurie per liter. Similar to the values of +the rRMSEs in the previous code chunk, the values can be considered to +be true QAPE values, not the estimates presented in Examples 2 and 3. + +In Example 4, the accuracy of one predictor under the model used to +define this predictor was presented. A more complex version of the +simulation study, where the properties of two predictors are studied +under the model defined by the third predictor, is presented in the +supplementary R file. 
What is more, the
+[**qape**](https://CRAN.R-project.org/package=qape) package also allows
+to use `mcBootMis()` function to conduct MC analyses of properties of
+accuracy measure estimators (estimators of MSEs and QAPEs) of two
+predictors (which belong to the class `plugInLMM`) declared as
+arguments. The model used in the simulation study is declared in the
+first predictor, but the properties of accuracy measures estimators of
+both predictors are studied. Output results of `mcBootMis()` cover
+simulation results on properties of different accuracy measures
+estimators, including the relative biases and relative RMSEs of the
+parametric bootstrap MSE estimators of both predictors. The same
+simulation-based statistics but for parametric bootstrap QAPE estimators
+are also included. Other bootstrap methods, including the residual
+bootstrap with and without the correction procedure, are also taken into
+account. The full list of output arguments of `mcBootMis()` function is
+presented in `qape-manual` file, cf. [@qape].
+
+## Conclusions
+
+The package enables R users to make predictions and assess the accuracy
+under linear mixed models based on different methods in a fast and
+intuitive manner -- not only based on the RMSE but also based on
+Quantiles of Absolute Prediction Errors. It also covers functions which
+allow to conduct Monte Carlo simulation analyses of properties of the
+methods of the user's interest. Its main advantage, compared to other
+packages, is the considerable flexibility in terms of defining the model
+(as in the [**lme4**](https://CRAN.R-project.org/package=lme4) package)
+and the predicted characteristic, but also the transformation of the
+response variable.
+ +In our opinion, the package is useful for scientists, practitioners and +decision-makers in all areas of research where accurate estimates and +forecasts for different types of data (including cross-sectional and +longitudinal data) and for different characteristics play the crucial +role. We believe that it will be of special interest to survey +statisticians interested in the prediction for subpopulations with small +or even zero sample sizes, called small areas. +::::: diff --git a/_articles/RJ-2024-004/RJ-2024-004.html b/_articles/RJ-2024-004/RJ-2024-004.html new file mode 100644 index 0000000000..efc8f98f32 --- /dev/null +++ b/_articles/RJ-2024-004/RJ-2024-004.html @@ -0,0 +1,3895 @@ + + + + + + + + + + + + + + + + + + + + + + Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package

    + + + +

    The paper presents a new R package +qape for prediction, +accuracy estimation of various predictors and Monte Carlo simulation +studies of properties of both predictors and estimators of accuracy +measures. It allows to predict any population and subpopulation +characteristics of the response variable based on the Linear Mixed +Model (LMM). The response variable can be transformed, e.g. to +logarithm and the data can be in the cross-sectional or longitudinal +framework. Three bootstrap algorithms are developed: parametric, +residual and double, allowing to estimate the prediction accuracy. +Analyses can also include Monte Carlo simulation studies of properties +of the methods used. Unlike other packages, in the prediction process +the user can flexibly define the predictor, the model, the +transformation function of the response variable, the predicted +characteristics and the method of accuracy estimation.

    +
    + + + +
    +
    +

    1 Introduction

    +

    One of the tasks in application of mixed models in the real-life +problems is the prediction of random effects. Then, the predicted values +give the possibility for further prediction, e.g. characteristics of +interest such as sum, mean or quantiles or the future value of the +response variable for cross-sectional or longitudinal data.

    +

    Three main predictors of these characteristics are proposed in the +literature: Empirical Best Linear Unbiased Predictors - EBLUPs (see e.g. +(Henderson 1950) and (Royall 1976)), PLUG-IN predictors +(see e.g. (Boubeta et al. 2016), (Chwila and Żądło 2019), +(Hobza and Morales 2016)) and Empirical Best Predictors - EBPs (see e.g. +(Molina and Rao 2010)). Each assumes the LMM to model the response +variable.

    +

    The numerous successful applications of these three predictors for +cross-sectional and longitudinal data can be found in the model approach +in survey sampling, including the small area estimation. In paper +(Fay III and Herriot 1979) the Authors introduce the prediction of the mean +income for small places based on the special case of the LMM model +called Fay-Herriot model and the EBLUP. The analysis of poverty is +extended in many works, e.g. in (Molina and Rao 2010) and +(Christiaensen et al. 2012). In turn, in (Battese et al. 1988) the Authors analyse the +total crop areas based on survey and satellite data using EBLUPs. The +proposed LMM model is known as the Battese-Harter-Fuller model. The +predictors are also exploited in the subject of experience rating in +non-life insurance, see (Frees et al. 1999) and (Bühlmann and Gisler 2005), where the +longitudinal data are under consideration. The insurance premium for the +next period for every policy in the insurance portfolio is predicted.

    +

    A major challenge in this type of prediction is the estimation of the +prediction accuracy measure. Most often it is the Root Mean Squared +Error (RMSE), which is given in analytical form or can be e.g. estimated +using bootstrap. A feature of the distribution of the squared prediction +error is usually a very strong positive asymmetry. Because the mean is +not recommended as the appropriate measure of the central tendency in +such distributions, the alternative prediction accuracy measure called +the Quantile of Absolute Prediction Errors (QAPE), proposed by +(Żądło 2013) and (Wolny-Dominiak and Żądło 2020), can be applied.

    +

    There is a variety of R packages to calculate the considered predictors +together with the accuracy measure of prediction, usually the RMSE. The +package sae, see (Molina and Marhuenda 2015), +provides EBLUPs based on Fay-Herriot and Battese-Harter-Fuller models. +In turn, the multivariate EBLUP for Fay-Herriot models is implemented in +msae, see (Permatasari and Ubaidillah 2021). +Several EBLUPs introduced in (Rao and Yu 1994) are implemented in package +saery introduced by +(Lefler et al. 2014), likewise in +JoSAE, see (Breidenbach 2018), but +with additional heteroscedasticity analysis. The EBP is provided in the +package emdi described in +(Kreutzmann et al. 2019).

    +

    A new package in this area is our proposed package +qape. It allows the +prediction of flexibly defined characteristics of the response variable +using the above three predictors, assuming an appropriate LMM. A novel +feature of the package +qape, compared to those +already in place, is the ability of bootstrap estimation of the +prediction accuracy measures, both the RMSE and QAPE. Three types of +bootstrap procedures are provided: parametric, residual and double.

    +

    There are three groups of functions in this package: predictors values +calculation, bootstrap estimation of RMSE and QAPE measures, and Monte +Carlo (MC) analysis of properties of predictors and prediction accuracy +estimators. The prediction is based on a LMM model defined by the user +and allows to predict the population characteristics of the response +variable, which can be defined by a linear combination (in the case of +EBLUP), by any R function (e.g. sum) or any function defined by the +user (in the case of the EBP and PLUG-IN predictors). The package allows +for full flexibility in defining: the model, the predicted +characteristic, and the transformation of the response variable.

    +

    This paper is organized as follows. Firstly, the background of the LMM +is presented together with the theoretical foundations of the prediction +including prediction accuracy measures. Then, the package functionality +in the area of prediction is presented and illustrated. A short +application based on radon data, a cross-sectional dataset available +in HLMdiag package, to +predict three subpopulation characteristics is shown. Subsequently, the +theoretical background of the prediction accuracy measures estimation +based on bootstrap is presented. Implementations of bootstrap algorithms +in qape are briefly +introduced. Finally, the procedure of the model-based Monte Carlo +simulation study is discussed. The paper ends with a conclusion.

    +

    2 Prediction accuracy measures

    +

    We consider the problem of prediction of any given function of the +population vector \(\mathbf{Y}\) of the response variable: +\[\label{theta} +\theta = f_{\theta}(\mathbf{Y}) \tag{1}\] +under the LMM. It covers linear combinations of \(\mathbf{Y}\) (such as +one future realization of the response variable or population and +subpopulation means and totals) but also other population and +subpopulation characteristics such quantiles and variability measures.

    +

    To assess the accuracy of the particular predictor \(\hat \theta\), +firstly, the prediction error is defined as \(U=\hat{\theta}-\theta\). +Therefore, the well-known RMSE has the following formula: +\[\label{eq0} + RMSE(\hat{\theta})=\sqrt{E(\hat{\theta}-\theta)^{2}}=\sqrt{E({{U}^{2}})}. \tag{2}\] +The alternative to the RMSE based on the mean could be the QAPE based on +quantiles. It represents the \(p\)th quantile of the absolute prediction +error \(|U|\), see (Żądło 2013) and (Wolny-Dominiak and Żądło 2020), and +it is given by: +\[\label{eq1} + QAPE_p(\hat{\theta}) = \inf \left\{ {x:P\left( {\left| {{\hat{\theta}-\theta}} \right| \le x} \right) \ge p} \right\} =\inf \left\{ {x:P\left( {\left| {{U}} \right| \le x} \right) \ge p} \right\} \tag{3}\] +This measure informs that at least \(p100\%\) of observed absolute +prediction errors are smaller than or equal to \(QAPE_p(\hat{\theta})\), +while at least \((1-p)100\%\) of them are higher than or equal to +\(QAPE_p(\hat{\theta})\). Quantiles reflect the relation between the +magnitude of the error and the probability of its realization. It means +that using the QAPE, it is possible to make a full description of the +distribution of prediction errors instead of using the average +(reflected by the RMSE). Furthermore, the MSE is the mean of positively +(usually very strongly) skewed squared prediction errors, where the mean +should not be used as a measure of the central tendency of positively +skewed distributions.

    +

    The above described accuracy prediction measures RMSE and QAPE can be +estimated using the bootstrap techniques. Their estimators as well as +the bootstrap distributions of the prediction errors based on any +(assumed or misspecified) model are provided in +qape package, including +algorithms where the parallel computing is used.

    +

    In the qape package, the +whole prediction process has its own specific procedure, which can be +presented in the following steps.

    +
    +

    Procedure 1. The process of prediction, accuracy measures +estimation and Monte Carlo simulation analyses in +qape

    +
      +
    1. Define the characteristics of the response variable to predict,

    2. +
    3. provide the information on sample and population values,

    4. +
    5. define the LMM,

    6. +
    7. estimate parameters of the LMM,

    8. +
    9. predict the random variable \(\theta\) using the chosen class of +predictors,

    10. +
    11. estimate the prediction accuracy measures RMSE and QAPE using one +of the developed bootstrap algorithms,

    12. +
    13. conduct simulation analyses of properties of predictors and +accuracy measures estimators under any (also misspecified) LMM +model.

    14. +
    +
    +

    3 The prediction under LMM

    +

    The main functions of the +qape package provide the +bootstrap estimation of prediction accuracy measures. However, it must +be preceded by the prediction process, including the choice of the LMM +and the predictor.

    +

    The model

    +

    Let \(\mathbf{Y}\) denote the vector of response variables +\(Y_1, Y_2,..., Y_N\). Assuming, without a loss of generality, that only +the first \(n\) realizations of \(Y_i\) are observed, \(\mathbf{Y}\) can be +decomposed as \(\mathbf{Y}= +\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{Y}_r^T +\end{bmatrix}^T\) , where \(\mathbf{Y}_s\) and \(\mathbf{Y}_r\) are of +dimension \(n \times 1\) and \((N - n) \times 1\), respectively. In all +notations, the subscript "s" is used for observed realizations of the +variable of interest and "r" for the unobserved ones. Two known +matrices of auxiliary variables are also considered, denoted by +\(\mathbf{X}\) and \(\mathbf{Z}\), which are associated with fixed and +random effects, respectively. The \(\mathbf{X}\) matrix is of dimension +\(N \times p\), and it consists of \(p\) regression variables. It can be +decomposed like \(\mathbf{Y}\) as follows: \(\mathbf{X}= +\begin{bmatrix} + \mathbf{X}_s^T & \mathbf{X}_r^T +\end{bmatrix}^T\), where matrices \(\mathbf{X}_s\) and \(\mathbf{X}_r\), both +known, are of dimension \(n \times p\) and \((N-n) \times p\), respectively. +Similarly, the \(\mathbf{Z}\) matrix of dimension \(N \times h\) can be +written as follows: \(\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_s^T & \mathbf{Z}_r^T +\end{bmatrix}^T\), where matrices \(\mathbf{Z}_s\) and \(\mathbf{Z}_r\), both +known, are of dimension \(n \times h\) and \((N-n) \times h\), respectively.

    +

    Then, let \(LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})\) denotes the +LMM of the following form (e.g. (Rao and Molina 2015), p. 98): +\[\label{LMM} + \left\{ \begin{array}{c} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}\mathbf{v}+\mathbf{e} \\ + E(\mathbf{e})=\mathbf{0}, E(\mathbf{v})=\mathbf{0} \\ + Var(\mathbf{e})=\mathbf{R}(\pmb{\delta}), Var(\mathbf{v})=\mathbf{G}(\pmb{\delta}) + \end{array} \right. \tag{4}\] +The vector of parameters in model ((4)) is then +\(\boldsymbol{\psi}=\begin{bmatrix} + \boldsymbol{\beta}^T & \pmb{\delta}^T +\end{bmatrix}^T\), where \(\boldsymbol{\beta}\) is a vector of fixed +effects of dimension \(p \times 1\) and \(\pmb{\delta}\) is a vector of +variance components. The random part of the model is described by the +known matrix \(\mathbf{Z}\), a vector \(\mathbf{v}\) of random effects of +dimension \(h \times 1\) and a vector \(\mathbf{e}\) of random components of +dimension \(N\times 1\), where \(\mathbf{e}\) and \(\mathbf{v}\) are assumed +to be independent. The vector of random components \(\mathbf{e}\) will be +decomposed similarly to the vector \(\mathbf{Y}\), i.e. +\(\mathbf{e}=\begin{bmatrix} + \mathbf{e}_s^T & \mathbf{e}_r^T +\end{bmatrix}^T\).

    +

    In the residual bootstrap implemented in +qape, there is a need to +re-write the LMM model to take account of the specific structure of +data, i.e. the grouping variables taken into account in the random part +of the model. In this case, without a loss of the generality, the LMM +model can be written as follows: +\[\label{LMMa} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}_1\mathbf{v}_1+...+\mathbf{Z}_l\mathbf{v}_l+...+\mathbf{Z}_L\mathbf{v}_L+\mathbf{e}, \tag{5}\] +where \(\mathbf{v}_1,\dots,\mathbf{v}_l,\dots,\mathbf{v}_L\) are +independent vectors of random effects assumed for different divisions of +the \(\mathbf{Y}\) vector (under different grouping of the data) and +\(\mathbf{Z}_1, \dots, \mathbf{Z}_l, \dots, \mathbf{Z}_L\) are known +matrices of auxiliary variables associated with random effects. Writing +in ((5)): \(\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_1 & \dots & \mathbf{0} & \dots & \mathbf{0} \\ + \vdots & \ddots & & & \vdots \\ + \mathbf{0} & \dots & \mathbf{Z}_l & \dots & \mathbf{0} \\ + \vdots & & & \ddots & \vdots \\ + \mathbf{0} & \dots & \mathbf{0} & \dots & \mathbf{Z}_L \\ +\end{bmatrix}\) and \(\mathbf{v}= +\begin{bmatrix} + \mathbf{v}_1^T & \dots & \mathbf{v}_l^T & \dots & \mathbf{v}_L^T \\ +\end{bmatrix}^T\) the LMM model is obtained. Let

    +

    \[\label{vl} +\mathbf{v}_l=\left[ \mathbf{v}_{l1}^T \dots \mathbf{v}_{lk}^T \dots \mathbf{v}_{lK_l}^T \right]^T \tag{6}\] +be of dimension \(K_l J_l \times 1\), where \(\mathbf{v}_{lk}\) is of +dimension \(J_l \times 1\) for all \(k=1,...,K_l\) and \(K_l\) is the number +of random effects at the \(l\)th level of grouping. Hence, \(\mathbf{Z}_l\) +is \(N \times K_l J_l\). For example, if the random regression coefficient +model is considered with two random coefficients where both random +effects are subpopulation-specific, where \(D\) is the number of +subpopulations, then \(L=1\), \(K_1=2\) and \(J_1=D\).

    +

    Predictors

    +

    In the qape package, in +the general case the predicted characteristic is given by any function +of response variables: +\[\label{ftheta} +\theta = f_{\theta}(\mathbf{Y}). \tag{7}\] +Under the \(LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})\) model it +could be predicted using one of three predictors:

    +
      +
    1. Empirical Best Linear Unbiased Predictor (EBLUP),

    2. +
    3. Empirical Best Predictor (EBP) under nested error LMM,

    4. +
    5. PLUG-IN predictor under the LMM.

    6. +
    +

    The first predictor (EBLUP) allows to predict the linear combination of +the response variables: +\[\label{l.theta} +\theta = f_{\theta}(\mathbf{Y}) = \boldsymbol{\gamma}^T \mathbf{Y}= \boldsymbol{\gamma}_s^T \mathbf{Y}_s + \boldsymbol{\gamma}_r^T \mathbf{Y}_r, \tag{8}\] +where \(\boldsymbol{\gamma}\) is a vector of weights. In this case, the +predicted characteristic \(\theta\) is basically the linear combination of +the response variable. For example, if one of the elements of +\(\boldsymbol{\gamma}\) equals 1 and the rest of the elements equals 0, +then one realization of the response variable is predicted. If all +elements in \(\boldsymbol{\gamma}\) vector equal 1, then \(\theta\) becomes +the sum of all \(Y_i\)’s in the whole considered population dataset. The +two-stage EBLUP corresponds to the Best Linear Unbiased Predictor (BLUP) +introduced in (Henderson 1950) and (Royall 1976) as: +\[\label{BLUP} + \hat{\theta}^{BLUP} (\pmb{\delta}) = {\boldsymbol{\gamma}}_s^T \mathbf{Y}_s + \hat{\theta}_r(\pmb{\delta}), \tag{9}\] +where the predictor of the linear combination +\(\boldsymbol{\gamma}_r^T \mathbf{Y}_r\) of unobserved random variables is +given by +\(\hat{\theta}_r(\pmb{\delta})={\boldsymbol{\gamma }}_r^T {{\mathbf{X}}_r}{\tilde{\boldsymbol{\beta}} }(\pmb{\delta}) +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{\tilde{v}}}(\pmb{\delta})\), +where \(\tilde{\boldsymbol{\beta}}(\pmb{\delta})\) is the Best Linear +Unbiased Estimator of \(\boldsymbol{\beta}\) and +\(\tilde{\mathbf{v}}(\pmb{\delta})\) is the Best Linear Unbiased Predictor +of \(\mathbf{v}\), both presented in ((4)). As shown by +(Żądło 2017) p. 8094, if +\(Cov(\mathbf{e}_r, \mathbf{e}_s)=\mathbf{0}\), then the predictor +((9)) is the BLUP of \(\theta\) defined as the linear +combination ((8)). 
Even if +\(Cov(\mathbf{e}_r, \mathbf{e}_s) \neq \mathbf{0}\), the predictor +\(\hat{\theta}_r(\pmb{\delta})\) is the Best Linear Unbiased Predictor of +the following linear combination of \(\boldsymbol{\beta}\) and +\(\mathbf{v}\): +\({\boldsymbol{\gamma }}_r^T{{\mathbf{X}}_r}{ {\boldsymbol{\beta}} } +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{{v}}}\). +The EBLUP \(\hat\theta^{EBLUP}\) is obtained by replacing the vector of +variance components \(\pmb{\delta}\) in BLUP ((9)) with the +estimator \(\hat{\pmb{\delta}}\). If (a) the expectation of the predictor +is finite, (b) \(\hat{\pmb{\delta}}\) is any even, translation-invariant +estimator of \(\pmb{\delta}\), (c) the distributions of both random +effects and random components are symmetric around \(\mathbf{0}\) (not +necessarily normal), the EBLUP remains unbiased, as proved by +(Kackar and Harville 1981).

    +

    To introduce the second predictor, called EBP, considered e.g. by +(Molina and Rao 2010), firstly, the Best Predictor (BP) \(\hat{\theta}^{BP}\) +of characteristic \(\theta(\mathbf{Y})\) has to be defined. It is computed +by minimizing the Mean Squared Error +\(MSE(\hat\theta )=E(\hat\theta - \theta)^2\) and can be written as +\(\hat\theta^{BP} = E(\theta|\mathbf{Y}_s)\). It means that the +conditional distribution of \(\mathbf{Y}_r|\mathbf{Y}_s\) must be known to +compute its value while at least the parameters of this distribution, +denoted by \(\boldsymbol{\psi}\) in ((4)), are unknown. The EBP +\(\hat\theta^{EBP}\) is obtained by replacing these parameters with +estimators \(\hat{\boldsymbol{\psi}}\). Its value can be computed +according to the Monte Carlo procedure presented in the supplementary +document for this paper.

    +

    The last predictor is the PLUG-IN predictor defined as (e.g. +(Chwila and Żądło 2019)): +\[\hat{\theta}^{PLUG-IN}=\theta(\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{\hat{Y}}_r^T + \end{bmatrix}^T),\] +where \(\mathbf{\hat{Y}}_r\) is the vector of fitted values of unobserved +random variables under the assumed model (any model specified by the +statistician). Under the LMM and if the linear combination of +\(\mathbf{Y}\) is predicted, the PLUG-IN predictor is the EBLUP, but +generally, it is not optimal. However, it was shown in simulation +studies that it can have similar or even higher accuracy compared to +empirical (estimated) best predictors, where the best predictors +minimize the prediction mean squared errors (cf. e.g. +(Boubeta et al. 2016), (Chwila and Żądło 2019), +(Hobza and Morales 2016)). Moreover, the PLUG-IN predictor is less +computationally demanding than the EBP.

    +

    Predictors in qape

    +

    To deal with the LMM model, the qape package uses the lmer() function from the lme4 package, see (Bates et al. 2015). Assuming ((4)) and based on \(\mathbf{Y}_s\), the vector of model parameters \(\boldsymbol{\psi} = [\boldsymbol{\beta}^T, \pmb{\delta}^T]^T\) is estimated using the Restricted Maximum Likelihood Method (REML), known to be robust to non-normality, see e.g. (Jiang 1996), and \(\hat{\boldsymbol{\psi}}\) is obtained.

    +

    In order to obtain the predictor of \(\theta\), one of the three +qape functions can be +applied: EBLUP(), ebpLMMne() or plugInLMM(). Firstly, the +characteristic of response variables of interest has to be defined. It +is actually obvious for EBLUP, which can be used only to predict the +population/subpopulation linear combination (e.g. the sum) by using the +argument gamma equivalent to the population vector of weights +\(\boldsymbol{\gamma}\) in ((8)). For other two predictors, +the EBP and the PLUG-IN, the input argument called thetaFun has to be +given (see \(f_{\theta}(.)\) in ((7))). Function thetaFun +could define one characteristic or a vector of characteristics, for +example:

    +
    > thetaFun1 <- function(x) median(x)
    +> thetaFun2 <- function(x) c(sum(x), mean(x), sd(x))
    +

    Secondly, two groups of input arguments, common to all three predictors, +has to be provided:

    +
      +
    • group 1 - arguments defining the sample and the population

      +
        +
      • YS - values of the dependent variable in the sample +(\(\mathbf{Y}_s\)),

      • +
      • reg - the population matrix of auxiliary variables named in +fixed.part, random.part and division,

      • +
      • con - the population \(0-1\) vector with \(1\)s for elements in +the sample and \(0\)s for elements which are not in the sample,

      • +
    • +
    • group 2 - arguments defining the model

      +
        +
      • fixed.part - fixed-effects terms declared as in lme4::lmer function,

      • +
      • random.part - random-effects terms declared as in lme4::lmer function,

      • +
      • weights - the population vector of weights.

      • +
    • +
    +

    The weights make it possible to include heteroscedasticity of random +components in the LMM.

    +

    In EBLUP() and plugInLMM() the random-effects terms of the LMM have +to be declared as the input argument random.part. The form of the +ebpLMMne predictor, in turn, requires defining in the ebpLMMne() +function the so-called division argument instead of random.part. +This input represents the variable dividing the population dataset into +subsets, which are taken into account in the nested error linear mixed +model with ‘division’-specific random components (presented in +supplementary document for this paper).

    +

    In the process of prediction, it is often necessary to perform data +transformation before estimating the model parameters. An example is the +logarithmic scaling of the variable of interest. The +qape package offers the +possibility for declaring the argument backTrans to conduct the data +back-transformation. Hence, a very flexible solution is used which +allows to use any transformation of the response variable such that the +back-transformation can be defined. This argument (available in R or +defined by the user function) should be the back-transformation function +of the already transformed dependent variable used to define the model, +e.g. for log-transformed YS used as the response variable:

    +
    > backTrans <- function(x) exp(x)
    +

    The main output is the value of predictor thetaP. For each class of +predictors, there are two S3 methods registered for existing generic +functions print and summary. The full list of output arguments is +presented in detail in the qape-manual file, cf. (Wolny-Dominiak and Żądło 2023).

    +

    Radon data and the model

    +

    In order to demonstrate the functionality of the package’s main +functions, in the following examples the radon dataset available in +HLMdiag package +((Loy and Hofmann 2014)) is analyzed. It contains the results of a survey measuring +radon concentrations in 919 owner-occupied homes in 85 counties of +Minnesota (see Figure 1). A study was conducted in +1987-1988 by the Minnesota Department of Health, showing that indoor +radon levels are higher in Minnesota compared to typical levels in the +U.S. In the data, the response variable log.radon (denoted in +((10)) by \(log(Y_{ic})\)) is the radon measurement in +logarithms of picoCurie per liter. The independent variables, on the +other hand, are: uranium (\(x_{1ic}\)) the average county-level soil +uranium content, basement (\(x_{2ic}\)) the 0-1 variable indicating the +level of the home at which the radon measurement was taken - 0 for +basement, 1 for the first floor, and county (denoted by subscript \(c\) +in ((10))) is county ID.

    +
    +
    +graphic without alt text +

    +Figure 1: The maps of characteristics of radon concentration in counties in picoCurie per liter. The gray colour means that the value is NA (Not Available) +

    +
    +
    +

    In all considered examples, the prediction for the county no. 26 +(county == 26) is conducted and it is assumed that the observations in +this county from the first floor (basement == 1) are not available +(see Figure 2).

    +
    +
    +graphic without alt text +

    +Figure 2: The distributions of radon concentration in picoCurie per liter in counties. The red line indicates county no. 26 +

    +
    +
    +

    The radon dataset is widely discussed in the literature. In the paper +(Nero et al. 1994), the Authors used an ordinary regression model +to predict county geometric means of radon concentration using surficial +soil radium data from the National Uranium Resource Evaluation. In turn, +the paper (Price et al. 1996) focuses on the prediction of the +geometric mean of radon for each county, but using a Bayesian approach. +For the radon data we use the following model +\[\label{radon.model} + log(Y_{ic}) = \beta_1 x_{1ic} + (\beta_2 + v_{1c}) x_{2ic} + \beta_0 + v_{2c} + e_{ic}, \tag{10}\] +where \(i=1,2,\dots,N\), \(c=1,2,\dots, C\), \(N = 919\) observations, +\(C = 85\) counties, \(\beta_1\), \(\beta_2\) and \(\beta_0\) are unknown fixed +effects, \(v_{1c}\) and \(v_{2c}\) are random effects, \(e_{ic}\) are random +components, \(v_{1c}\), and \(e_{ic}\) are mutually independent, \(v_{2c}\) +and \(e_{ic}\) are mutually independent too, \(Cor(v_{1c}, v_{2c}) = \rho\), +\(v_{1c} \sim (0, \sigma^2_{v_1})\), \(v_{2c} \sim (0, \sigma^2_{v_2})\) and +\(e_{ic} \sim (0, \sigma^2_e)\). As can easily be seen, the considered +model is the random coefficient model with two correlated +county-specific random effects. Its syntax written using the package +lme4 notation is as +follows:

    +
    radon.model <-  lmer(log.radon ~ basement + uranium + (basement | county), data = radon)
    +

    This and similar LMMs are considered, analyzed, and used for the considered dataset in many publications, with a good overview presented in (Gelman and Hill 2006). In (Gelman and Pardoe 2006), based on their preceding research (Price et al. 1996), (Lin et al. 1999), (Price and Gelman 2005), a very similar model but with additional multivariate normality assumptions is studied, verified and chosen as fitting well to the data within a Bayesian framework. The same model as in (Gelman and Pardoe 2006) with its special cases is considered in (Cantoni et al. 2021) but within the frequentist approach. Based on 25 measures of explained variation and model selection, the Authors conclude that the same model as considered in our paper (with additional normality assumption, however, which is not used in all cases considered in that paper), "seems the best" (Cantoni et al. 2021 10) for the radon data. Further tests of the model are presented by (Loy 2013), (Loy and Hofmann 2015) and (Loy et al. 2017) (see also (Cook et al. 2007) for the introduction of the methodology) showing, among others, the normality and homoscedasticity of random components, the normality of the distribution of the random slope but – what is important for our further considerations – the lack of normality of the random intercept. Since the problem of choosing and verifying a model for the considered dataset is widely discussed in the literature, we will focus on the issues that are new in this case, namely the problem of prediction and estimation of the prediction accuracy as well as the Monte Carlo analysis of predictors’ properties.

    +

    Example 1

    +

    This example shows the prediction procedure in the package +qape. In the first step, +it is needed to define all the input arguments that will then be passed +to the prediction functions.

    +
    > Ypop <- radon$log.radon # the population vector of the dependent variable
    +> # It is assumed that observations from the first floor
    +> # in county no. 26 are not available: 
    +> con <- rep(1, nrow(radon))
    +> con[radon$county == 26 & radon$basement == 1] <- 0
    +> YS <- Ypop[con == 1] # sample vector of the dependent variable
    +> reg <- dplyr::select(radon, -log.radon) # the population matrix of auxiliary variables
    +> fixed.part <- 'basement + uranium' # the fixed part of the considered model
    +> random.part <- '(basement|county)' # the random part of the considered model
    +> # The vector of weights to define
    +> # the predicted linear combination -  the mean for county == 26:
    +> gamma <-
    ++   (1 / sum((radon$county == 26))) * ifelse((radon$county == 26), 1, 0)
    +> estMSE <- TRUE # to include the naive MSE estimator of the EBLUP in the output
    +

    Then the functions corresponding to each predictor can be used. First, the EBLUP prediction in the package qape is presented. As the EBLUP is limited to the linear combination of random variables, the predicted characteristic is simply the arithmetic mean. To be precise, it is the mean of logarithms of measurements (instead of the mean of measurements), because the EBLUP can be used only under the linear (linearized) models. As in the LMM the homoscedasticity of random components is assumed, the input argument weights = NULL is set.

    +
    > myeblup <- EBLUP(YS, fixed.part, random.part, reg, con, gamma,  weights = NULL, estMSE)
    +> # the value of the predictor of the arithmetic mean
    +> # of logarithms of radon measurements:
    +> myeblup$thetaP
    +[1] 1.306916
    +> myeblup$neMSE # the value of the naive MSE estimator
    +[1] 0.002292732
    +

    Hence, the predicted value of the arithmetic mean of logarithms of radon +measurements equals \(1.306916\) log picoCurie per liter. The estimated +root of prediction MSE equals \(\sqrt{0.002292732} \approx 0.048\) log +picoCurie per liter, but – what is important – it is the value of the +naive RMSE estimator (as defined by Rao and Molina 2015 106), which means +that it ignores the decrease of accuracy due to the estimation of model +parameters.

    +

    The second part of this example shows the prediction of the arithmetic +mean, geometric mean and median of radon measurements (not logarithm of +radon measurements) in county no. 26 with the use of the PLUG-IN +predictor. It requires the setting of two input arguments: thetaFun +and backTrans.

    +
    > thetaFun <- function(x) {
    ++   c(mean(x[radon$county == 26]), psych::geometric.mean(x[radon$county == 26]),
    ++     median(x[radon$county == 26]))
    ++   }
    +> backTransExp <- function(x) exp(x) # back-transformation
    +> myplugin <- plugInLMM(YS, fixed.part, random.part, reg, con, weights = NULL,
    ++                     backTrans = backTransExp, thetaFun)
    +> # values of the predictor of arithmetic mean, geometric mean
    +> # and median of radon measurements:
    +> myplugin$thetaP
    +[1] 3.694761 4.553745 3.900000
    +

    In this case we can conclude that the predicted values of the arithmetic mean, geometric mean and median in county no. 26 equal: \(3.694761\), \(4.553745\) and \(3.9\) picoCurie per liter, respectively. The problem of prediction accuracy estimation will be discussed in the next sections of the paper.

    +

    The qape package allows +to use the Empirical Best Predictor (EBP) (see the supplementary +document for this paper) as well. It provides predicted values of any +function of the variable of interest, as the PLUG-IN predictor. However, +this requires stronger assumptions to be met. The EBP procedure +available in qape package +is prepared under the assumption of the normality of the variable of +interest after any transformation. However, in the case of the +considered model for logarithms of radon measurements, the assumption is +not met as we mentioned before based on the results presented in the +literature. It can also be verified using normCholTest function +(available in qape +package) as follows:

    +
    > normCholTest(radon.model, shapiro.test)$p.value
    +[1] 2.589407e-08
    +

    Moreover, due to the very time-consuming iterative procedure used to compute the EBP in the general case, in the qape package the function ebpLMMne uses a very fast procedure working only for nested error Linear Mixed Models (see (Molina and Rao 2010)).

    +

    The prediction of any function of the random variables based on +cross-sectional data has been considered. Its special case, not +presented above but widely discussed in the econometric literature, is +the prediction of one random variable, in this case a radon measurement +for one non-observed owner-occupied home. Furthermore, the +qape package is also +designed for prediction based on longitudinal data for current or future +periods as shown in examples for the EBLUP, plugInLMM and ebpLMMne +functions in the qape-manual file, cf. (Wolny-Dominiak and Żądło 2023).

    +

    4 Bootstrap procedures

    +

    The qape package provides +three main types of bootstrap algorithms: the parametric bootstrap, the +residual bootstrap and the double-bootstrap.

    +

    The parametric bootstrap procedure is implemented according to +(González-Manteiga et al. 2007) and (González-Manteiga et al. 2008) and could be described in the +following steps:

    +
      +
    1. based on \(n\) observations of the dependent and independent variables +(\(\mathbf{Y}_s\), \(\mathbf{X}_s\) and \(\mathbf{Z}_s\)) estimate +\(\boldsymbol{\psi}\) to obtain the vector of estimates +\(\boldsymbol{\hat{\psi}}\),

    2. +
    3. generate \(B\) realizations \(y_{i}^{*(b)}\) of \(Y_{i}\), under the +\(LMM(\mathbf{X}, \mathbf{Z}, \hat{\boldsymbol{\psi}})\) and +multivariate normality of random effects and random components +obtaining
      +\(\mathbf{y}^{*(b)}=\begin{bmatrix} + y_{1}^{*(b)} & ... & y_{i}^{*(b)} &... & y_{N}^{*(b)} + \end{bmatrix}^T\), where \(i=1, 2, ... ,N\) and \(b=1, 2, ... ,B\),

    4. +
    5. decompose the vector \(\mathbf{y}^{*(b)}\) as follows \(\begin{bmatrix} + \mathbf{y}_s^{*(b)T} & \mathbf{y}_r^{*(b)T} + \end{bmatrix}^T\),

    6. +
    7. in the \(b\)th iteration (\(b=1,2,...,B\))

      +
        +
      1. compute the bootstrap realization +\(\theta^{*(b)}=\theta^{*(b)}(\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}})\) +of random variable \(\theta\),

      2. +
      3. obtain the vector of estimates \(\boldsymbol{\hat{\psi}}^{*(b)}\) +using \(\mathbf{y}_s^{*(b)}\) and compute the bootstrap +realization of predictor \(\hat{\theta}\) denoted by +\(\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})\) +based on +\(LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\hat{\psi}}^{*(b)})\),

      4. +
      5. compute bootstrap realizations of prediction error \(U^*\), denoted by \(u^*\), for the \(b\)th iteration given by:
\[\label{u*b}
  u^{*(b)}=\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})-\theta^{*(b)}(\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}}) =\hat{\theta}^{*(b)}-\theta^{*(b)}, \tag{12}\]

      6. +
    8. +
    9. compute the parametric bootstrap estimators of prediction accuracy +measures: RMSE and QAPE replacing prediction errors \(U\) in +((2)) and ((3)) by their bootstrap realizations.

    10. +
    +

    Another possible method to estimate the prediction accuracy measures is +the residual bootstrap. In what follows, we use the notation +\(srswr(\mathbf{A}, m)\) to indicate the outcome of taking a simple random +sample with replacement of size \(m\) of rows of matrix \(\mathbf{A}\). If +\(\mathbf{A}\) is a vector, it simplifies to a simple random sample with +replacement of size \(m\) of elements of \(\mathbf{A}\).

    +

    To obtain the algorithm of the residual bootstrap, it is enough to +replace step 2 of the parametric bootstrap procedure presented above +with the following procedure of the population data generation based on +((5)):

    +
      +
    • generate \(B\) population vectors of the variable of interest, denoted by \(\mathbf{y}^{*(b)}\) as
\[\label{LMMboot}
  \mathbf{y}^{*(b)}=\mathbf{X}\hat{\boldsymbol{\beta}} + \mathbf{Z}_1\mathbf{v}^{*(b)}_1+...+\mathbf{Z}_l\mathbf{v}^{*(b)}_l+...+\mathbf{Z}_L\mathbf{v}^{*(b)}_L+\mathbf{e}^{*(b)}, \tag{11}\]
where \(\hat{\boldsymbol{\beta}}\) is an estimator (e.g. REML) of \({\boldsymbol{\beta}}\), \(\mathbf{e}^{*(b)}\) is a vector of dimension \(N \times 1\) defined as \(srswr(col_{1 \leq i \leq n } \hat{{e}}_{i}, N)\), where \(\hat{{e}}_{i}\) (\(i=1,2,...,n\)) are residuals, \(\mathbf{v}^{*(b)}_l\) (for \(l=1,2,...,L\)) is the vector of dimension \(K_l J_l \times 1\) built from the columns of the matrix: \(srswr \left(
  \left[ \begin{array}{ccccc}
  \hat{\mathbf{v}}_{l1} &
  \dots &
  \hat{\mathbf{v}}_{lk} &
  \dots &
  \hat{\mathbf{v}}_{lK_l}
  \end{array}
  \right], J_l
  \right)\) of dimension \(J_l \times K_l\), where \(\hat{\mathbf{v}}_{lk}\) are estimates of elements of random effects vector ((6)).
    • +
    +

    The next 3–5 steps in this procedure are analogous to steps in the +parametric bootstrap procedure.

    +

    In the above-described step, it can be seen that if more than one vector +of random effect is assumed at the \(l\)th level of grouping, then the +elements are not sampled with replacement independently. In this case, +rows of the matrix formed by these vectors are sampled with replacement.

    +

    The residual bootstrap algorithm can also be performed with so-called +"correction procedure". This procedure, which can improve the +properties of the residual bootstrap estimators due to the +underdispersion of the uncorrected residual bootstrap distributions, is +presented in the supplementary document for this paper.

    +

    5 Bootstrap in qape

    +

    Two bootstrap procedures are implemented in separate functions: +bootPar() (the parametric bootstrap) and bootRes() (the residual +bootstrap). According to the general Procedure 1, the step +preceding the bootstrap procedure in both functions is the definition of +the predictor object. It must be one of the following: EBLUP, +ebpLMMne or plugInLMM. This object has to be passed to bootPar() +or bootRes() as the input parameter predictor. The other input +parameters are intuitive: B - the number of bootstrap iterations and +p - order of quantiles in the estimated QAPEs.

    +

    The additional input parameter in bootRes() is a logical condition +called correction, which makes it possible to include an additional +correction term for both random effects and random components, presented +in the supplementary document for this paper, to avoid the problem of +underdispersion of residual bootstrap distributions.

    +

    The main output values in both functions are basically the measures: +estRMSE and estQAPE computed based on ((2)) and +((3)), respectively, where prediction errors are replaced by +their bootstrap realizations. There is also the output error being the +vector of bootstrap realizations of prediction errors, which is useful +e.g. in in-depth analysis of the prediction accuracy and for graphical +presentation of results. To estimate these accuracy measures, we use +below the residual bootstrap with the correction procedure.

    +

    As previously stated, our package utilizes the lmer() function from +the lme4 package for +estimating model parameters. However, this function has been known to +generate convergence warnings in certain situations, listed for example +by (Bates et al. 2015) p. 25, when the estimated variances of random effects are +close to zero. Such scenarios may occur when models are estimated for +smaller or medium-sized datasets, when complex variance-covariance +structures are assumed, or when the grouping variable considered for +random effects has only a few levels. Although we have not observed such +issues estimating model parameters based on the original dataset +required to compute values of the predictors in previous sections, +bootstrapping or Monte Carlo simulations are more complex cases. This is +because, based on the estimates of model parameters, the values of the +dependent variables are generated \(B\) times, and then model parameters +are estimated in each out of \(B\) iterations. Therefore, in at least some +iterations, dependent variable values may be randomly generated giving +realizations, where the variance of the random effect is relatively +close to zero. As a result, estimates of model parameters can be +obtained; however, convergence issues implying warnings may occur. In +such cases, there are at least two possible solutions. The first option +is to discard iterations with warnings, which would imply that the +dependent variable would not follow the assumed model as required, but +instead only its conditional version with relatively high values of +variances of random effects. It will imply overdispersed bootstrap +distribution of random effects, which will affect the bias of the +bootstrap estimators of accuracy measures. The second option is to +consider all generated realizations, despite convergence warnings, as +long as the parameters can be estimated for all iterations. We opted for +the latter solution, as argued in (Bates et al. 2015) p. 
25, who noted that "being +able to fit a singular model is an advantage: when the best fitting +model lies on the boundary of a constrained space".

    +

    Example 2

    +

    The analyses presented in Example 1 are continued. We extend the +previous results to include the issue of estimating the prediction +accuracy of the considered predictors. The use of functions for this +estimation primarily requires an object of class predictor, here +"myplugin".

    +
    > class(myplugin)
    +[1] "plugInLMM"
    +

    The short chunk of R code presents the residual bootstrap estimators of the RMSE (estRMSE) and the QAPE (estQAPE) of the PLUG-IN predictors (plugin) of the three previously analyzed characteristics of radon measurements in county no. 26: the arithmetic mean, geometric mean and median. In this and subsequent examples we make the computations for a relatively high number of iterations, allowing, in our opinion, reliable results to be obtained. These results are also used to prepare Figure 3. However, the computations are time-consuming. The supplementary R file contains the same chunks of the code but the number of iterations applied is smaller in order to execute the code swiftly.

    +
    > # accuracy measures estimates based on
    +> # the residual bootstrap with the correction:
    +> B <- 500 # number of bootstrap iterations
    +> p <- c(0.75, 0.9) # orders of Quantiles of Absolute Prediction Error
    +> set.seed(1056)
    +> residBoot <- bootRes(myplugin, B, p, correction = TRUE)
    +> # values of estimated RMSEs of the predictor of three characteristics:
    +> # the arithmetic mean, geometric mean and median of radon measurements, respectively:
    +> residBoot$estRMSE
    +[1] 0.1848028 0.2003681 0.2824359
    +> # values of estimated QAPEs
    +> # (of order 0.75 in the first row, and of order 0.9 in the second row)
    +> # of the predictor of three characteristics:
    +> # the arithmetic mean, geometric mean and median of radon measurements,
    +> # in the 1st, 2nd and 3rd column, respectively:
    +> residBoot$estQAPE
    +         [,1]      [,2]      [,3]
    +75% 0.1533405 0.2135476 0.2908988
    +90% 0.2813886 0.3397411 0.4374534
    +

    Let us concentrate on interpretations of estimators of accuracy measures +for the predictor of the geometric mean, i.e. the second value of +residBoot$estRMSE, and values in the second column of +residBoot$estQAPE. It is estimated that the average difference between +predicted values of the geometric mean and their unknown realizations +equals \(0.2003681\) picoCurie per liter. Furthermore, it is estimated +that at least \(75\%\) of absolute prediction errors of the predictor of +the geometric mean are smaller or equal to \(0.2135476\) picoCurie per +liter and at least \(25\%\) of absolute prediction errors of the predictor +are higher or equal to \(0.2135476\) picoCurie per liter. Finally, it is +estimated that at least \(90\%\) of absolute prediction errors of the +predictor of the geometric mean are smaller or equal to \(0.3397411\) +picoCurie per liter and at least \(10\%\) of absolute prediction errors of +the predictor are higher or equal to \(0.3397411\) picoCurie per liter. +The distributions of bootstrap absolute prediction errors with values of +estimated RMSEs and QAPEs for the considered three prediction problems +are presented in Figure 3.

    +
    +
    +graphic without alt text +

    +Figure 3: The histograms of bootstrap absolute prediction errors for myplugin (for PLUG-IN predictors of the arithmetic mean, geometric mean and median) for B=500 +

    +
    +
    +

    Since the assumption of normality is not met, the parametric bootstrap +should not be used in this case. For this reason, we do not present the +results for this method below, although – but for illustrative purposes +only – they are presented in the supplementary R file. Moreover, these +analyses can also be conducted using bootParFuture() and +bootResFuture() functions where parallel computing algorithms are +applied. The input arguments and the output of these functions are the +same as in bootPar() and bootRes(). Examples based on these +functions are also included in the supplementary R file.

    +

    6 Bootstrap under the misspecified model in qape

    +

    The qape package also +allows to use predictors under a model different from the assumed one +(e.g. a simpler or more robust model), but estimate its accuracy under +the assumed model. In this case, the parametric and residual bootstrap +procedures are implemented in bootParMis() and bootResMis() +functions. These functions allow to estimate the accuracy of two +predictors under the model correctly specified for the first of them. Of +course, it is expected that the estimated accuracy of the first +predictor will be better than of the second one, but the key issue can +be the difference between estimates of accuracy measures. A small +difference, even to the second predictor’s disadvantage, may be treated +by the user as an argument for using the second predictor due to its +properties, such as robustness or simplicity.

    +

    The considered functions allow to estimate the accuracy of two +predictors, which belong to the class plugInLMM, under the model used +to define the first of them. The remaining arguments are the same as in +bootPar() and bootRes() functions: B - the number of bootstrap +iterations, and p - orders of QAPE estimates to be taken into account.

    +

    The output results of bootParMis() and bootResMis() include – +similarly to bootPar() and bootRes() functions – estimates of the +RMSEs and QAPEs of both predictors (denoted here by: estRMSElmm, +estRMSElmmMis, estQAPElmm and estQAPElmmMis), and bootstrap +realizations of their prediction errors (errorLMM and errorLMMmis).

    +

    Example 3

    +

    In this example, we study the same accuracy measures as in Example 2, +but the aim is to compare the predictor myplugin and other predictor +defined under the misspecified LMM. First, the misspecified model has to +be defined, and a relevant predictor has to be computed.

    +
    > fixed.part.mis <- '1'
    +> random.part.mis <- '(1|county)'
    +> myplugin.mis <- plugInLMM(YS, fixed.part.mis, random.part.mis, reg, con,
    ++                         weights = NULL, backTrans = backTransExp, thetaFun)
    +

    Having two objects: myplugin and myplugin.mis, one can proceed to a +comparison by estimating bootstrap prediction accuracy performed using +the residual bootstrap with correction procedure. In this case, we +estimate the prediction accuracy of these two predictors under the model +used to define the first of them.

    +
    > set.seed(1056)
    +> residBootMis <- bootResMis(myplugin, myplugin.mis, B, p, correction = TRUE)
    +> # residual bootstrap with the correction RMSE estimators
    +> # of 'plugin' of: arithmetic mean, geometric mean and median
    +> # of radon measurements in county 26:
    +> residBootMis$estRMSElmm
    +[1] 0.1848028 0.2003681 0.2824359
    +> # residual bootstrap with the correction RMSE estimators
    +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median
    +> # of radon measurements in county 26:
    +> residBootMis$estRMSElmmMis
    +[1] 0.1919184 0.3192304 0.2762137
    +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9
    +> # of 'plugin' of: arithmetic mean, geometric mean and median
    +> # of radon measurements in county 26:
    +> residBootMis$estQAPElmm
    +         [,1]      [,2]      [,3]
    +75% 0.1533405 0.2135476 0.2908988
    +90% 0.2813886 0.3397411 0.4374534
    +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9
    +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median
    +> # of radon measurements in county 26:
    +> residBootMis$estQAPElmmMis
    +         [,1]      [,2]      [,3]
    +75% 0.2267062 0.3802836 0.3255197
    +90% 0.2813787 0.4970726 0.4489399
    +

    The results, presented above, were obtained for the same number of +bootstrap iterations as in Example 2 (\(B = 500\)). If we compare, under +the model defined in plugin, estimated RMSEs of plugin and +plugin.mis predictors of the geometric mean given by \(0.2003681\) and +\(0.3192304\) picoCurie per liter, respectively, we can state that the +estimated accuracy (measured by RMSE estimators) of the first predictor +is better comparing with the second one. If we are not interested in the +average accuracy measures but in the right tail of the distribution of +prediction errors, we can use estimates of QAPE of order 0.9 to compare +the accuracy. The result for the plugin.mis of the geometric mean +equals to \(0.4970726\) picoCurie per liter, and it is higher comparing +with \(0.3397411\) picoCurie per liter obtained for plugin for the same +prediction problem. Hence, in this case, the accuracy comparison based +both on the RMSE and QAPE leads to the same finding.

    +

    In the previous paragraph, we have focused on the results for the case +of prediction of the geometric mean. If the comparison is made for the +case of prediction of the arithmetic mean (the first column of output +results) or the median (the third column of output results), we will +come to the same conclusion regarding the estimated accuracy of plugin +and plugin.mis as in the case of prediction of the geometric mean.

    +

    Similarly to the residual bootstrap, the parametric bootstrap procedure +paramBootMis available in +qape package can be +performed. However, in the considered case the normality assumption is +not met (as discussed above) and the procedure is not recommended. The +appropriate chunk of the R code is presented in the supplementary R +file, but it is solely intended for illustrative purposes.

    +

    7 Monte Carlo simulation analyses

    +

    In the previous section, our aim was to estimate the prediction accuracy +under correctly specified or misspecified model. In this section, we do +not estimate the accuracy, but we approximate the true prediction +accuracy under the specified model in the Monte Carlo simulation study. +The crucial difference is that in this case, the model parameters used +are obtained based on the whole population dataset, not the sample. If +the number of iterations is large enough, we can treat the computed +values of the measures as their true values, which are unknown in +practice.

    +

    The last step of the analysis in +qape package presented in +Procedure 1 is the Monte Carlo (MC) simulation analysis of:

    +
      +
    • properties of predictors

    • +
    • and properties of parametric, residual and double bootstrap +estimators of accuracy measures.

    • +
    +

    The whole Monte Carlo procedure is as follows.

    +
    +

    Procedure 2. Model-based Monte Carlo simulation analyses in +qape

    +
      +
    1. define the population vector of the dependent variable and the +population matrix of auxiliary variables,

    2. +
    3. provide the information on the division of the population into the +sampled and non-sampled part,

    4. +
    5. define \(\theta\) - the characteristics of the response variable to +be predicted,

    6. +
    7. define the predictors \(\hat{\theta}\) and accuracy measures +estimators which properties are to be assessed,

    8. +
    9. define the model to be used to generate realizations of the values +of the dependent variable and estimate its parameters based on +population data,

    10. +
    11. For k=1, 2, ..., K

      +
        +
      1. generate the population vector of the response variable based +on the assumed model,

      2. +
      3. based on population data, compute the characteristics \(\theta\), +denoted by \(\theta_k\),

      4. +
      5. based on sample data, estimate the parameters of the LMM,

      6. +
      7. based on sample data, compute values of predictors +\(\hat{\theta}\), denoted by \(\hat{\theta}_k\),

      8. +
      9. based on sample data, estimate the accuracy of \(\hat{\theta}\) +using bootstrap methods,

      10. +
    12. +
    13. End For

    14. +
    15. compute accuracy measures of predictors using \(\hat{\theta}_k\) and +\(\theta_k\) (for \(k=1,2, ..., K\)),

    16. +
    17. compute accuracy measures of estimators of prediction accuracy +measures.

    18. +
    +
    +

    8 Monte Carlo analyses in qape

    +

    In order to perform a Monte Carlo (MC) analysis on the properties of +predictors, it is necessary to have access to the entire population data +for both dependent and independent variables. The function mcLMMmis() +can be used with the following arguments. Firstly, the population values +of the dependent variable (after a necessary transformation) should be +declared as Ypop. By using the Ypop values, we can estimate the +model parameters based on the entire population data (assuming that they +are known). This allows us to generate values of the dependent variable +in the simulation study that can mimic its distribution in the entire +population, not just in the sample. This approach ensures that our +simulation study can be an accurate representation of the random process +in the entire population, resembling the real-world scenario. Secondly, +three predictors: predictorLMMmis, predictorLMM, predictorLMM2, +which belong to the class plugInLMM, are to be defined. The first one +is used only to define the (possibly misspecified) model used to +generate population values of the response variables. Accuracy of +predictorLMM and predictorLMM2 is assessed in the simulation study. +The next two arguments include the number of MC iterations K and +orders p of QAPEs used to assess the prediction accuracy. Finally, it +should be noted that it is possible to modify covariance matrices of +random components and random effects based on the model defined in +predictorLMMmis, which are used to generate values of the +dependent variable. It is possible by declaring values of ratioR and +ratioG arguments, which the diagonal elements of covariance matrices +of random components and random effects, respectively, are divided by.

    +

    The output of this function covers the following statistics of both +predictors computed in the simulation study: relative biases (rBlmm +and rBlmm2), relative RMSEs (rRMSElmm and rRMSElmm2) and QAPEs +(QAPElmm and QAPElmm2). Simulation-based prediction errors of both +predictors (errorLMM and errorLMM2) are also taken into account.

    +

    Example 4

    +

    In the example, an MC simulation is carried out assuming the myplugin +predictor. The goal is to approximate the true accuracy of the +prediction assuming model (10). Hence, in the package +qape, all input predictor +objects in the function mcLMMmis have to be defined as myplugin.  

    +
    > # input arguments:
    +predictorLMMmis <- myplugin # to define the model
    +predictorLMM <- myplugin # which properties are assessed in the simulation study
    +predictorLMM2 <- myplugin  # which properties are assessed in the sim. study
    +

    Except that no modification of covariance matrices has to be used.

    +
    # diag. elements of the covariance matrix of random components are divided by:
    +ratioR <- 1
    +# diag. elements of the covariance matrix of random effects are divided by:
    +ratioG <- 1
    +

    We specify the number of Monte Carlo iterations.

    +
    K <- 500 # the number of MC iterations
    +

    The analysis is conducted in the object MC.

    +
    > set.seed(1086)
    +> MC <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2,
    ++                        K, p, ratioR, ratioG)
    +> # relative bias of 'predictorLMM'
    +> # of the arithmetic mean, geometric mean and median in county 26 (in %):
    +> MC$rBlmm
    +[1] -1.73208393 -0.04053178 -5.22355236
    +

    Results of the relative biases are obtained. It is seen, that under the +assumed model the values of the considered predictor of the geometric +mean (the second value of MC$rBlmm) are smaller than possible +realizations of the geometric mean on average by \(0.04053178\%\). In +turn, the relative RMSEs are as follows.

    +
    > # relative RMSE of 'predictorLMM'
    +> # of the arithmetic mean, geometric mean and median in county 26 (in %):
    +> MC$rRMSElmm
    +[1] 3.429465 4.665810 7.146678
    +

    In the considered case, the average difference between predicted values +of the geometric mean and its possible realizations (the second value of +MC$rRMSElmm) equals \(4.665810\%\). It should be noted that this value +can be treated as the true value of the relative RMSE (if the number of +iterations is large enough), not the estimated value obtained in +Examples 2 and 3.

    +

    Finally, QAPEs of orders 0.75 and 0.9 are considered.

    +
    > # QAPE of order 0.75 and 0.9 of 'predictorLMM'
    +> # of the arithmetic mean, geometric mean and median in county 26:
    +> MC$QAPElmm
    +         [,1]      [,2]      [,3]
    +75% 0.1491262 0.1989504 0.2919221
    +90% 0.2895684 0.2959457 0.4728064
    +

    Let us interpret the results presented in the second column of +MC$QAPElmm. At least \(75\%\) (\(90\%\)) of absolute prediction errors of +the predictor of the geometric mean are smaller or equal to \(0.1989504\) +(\(0.2959457\)) picoCurie per liter and at least \(25\%\) (\(10\%\)) of +absolute prediction errors of the predictor are higher or equal to +\(0.1989504\) (\(0.2959457\)) picoCurie per liter. Similar to the values of +the rRMSEs in the previous code chunk, the values can be considered to +be true QAPE values, not the estimates presented in Examples 2 and 3.

    +

    In Example 4, the accuracy of one predictor under the model used to +define this predictor was presented. A more complex version of the +simulation study, where the properties of two predictors are studied +under the model defined by the third predictor, is presented in the +supplementary R file. What is more, the +qape package also allows +to use mcBootMis() function to conduct MC analyses of properties of +accuracy measure estimators (estimators of MSEs and QAPEs) of two +predictors (which belong to the class plugInLMM) declared as +arguments. The model used in the simulation study is declared in the +first predictor, but the properties of accuracy measures estimators of +both predictors are studied. Output results of mcBootMis() covers +simulation results on properties of different accuracy measures +estimators, including the relative biases and relative RMSEs of the +parametric bootstrap MSE estimators of both predictors. The same +simulation-based statistics but for parametric bootstrap QAPE estimators +are also included. Other bootstrap methods, including the residual +bootstrap with and without the correction procedure, are also taken into +account. The full list of output arguments of mcBootMis() function are +presented in qape-manual file, cf. (Wolny-Dominiak and Żądło 2023).

    +

    9 Conclusions

    +

    The package enables R users to make predictions and assess the accuracy +under linear mixed models based on different methods in a fast and +intuitive manner – not only based on the RMSE but also based on +Quantiles of Absolute Prediction Errors. It also covers functions which +allow to conduct Monte Carlo simulation analyses of properties of the +methods of users' interest. Its main advantage, compared to other +packages, is the considerable flexibility in terms of defining the model +(as in the lme4 package) +and the predicted characteristic, but also the transformation of the +response variable.

    +

    In our opinion, the package is useful for scientists, practitioners and +decision-makers in all areas of research where accurate estimates and +forecasts for different types of data (including cross-sectional and +longitudinal data) and for different characteristics play the crucial +role. We believe that it will be of special interest to survey +statisticians interested in the prediction for subpopulations with small +or even zero sample sizes, called small areas.

    +
    +
    +

    10 Supplementary materials

    +

    Supplementary materials are available in addition to this article. They can be downloaded at +RJ-2024-004.zip

    +

    11 CRAN packages used

    +

    qape, sae, msae, saery, JoSAE, emdi, HLMdiag, lme4

    +

    12 CRAN Task Views implied by cited packages

    +

    Econometrics, Environmetrics, MixedModels, OfficialStatistics, Psychometrics, SpatioTemporal

    +

    13 Note

    +

    This article is converted from a Legacy LaTeX article using the +texor package. +The pdf version is the official version. To report a problem with the html, +refer to CONTRIBUTE on the R Journal homepage.

    +
    +
    +D. Bates, M. Mächler, B. Bolker and S. Walker. Fitting linear mixed-effects models using lme4. Journal of Statistical Software, 67(1): 1–48, 2015. DOI 10.18637/jss.v067.i01. +
    +
    +G. E. Battese, R. M. Harter and W. A. Fuller. An error-components model for prediction of county crop areas using survey and satellite data. Journal of the American Statistical Association, 83(401): 28–36, 1988. +
    +
    +M. Boubeta, M. J. Lombardı́a and D. Morales. Empirical best prediction under area-level poisson mixed models. Test, 25(3): 548–569, 2016. +
    +
    +J. Breidenbach. JoSAE: Unit-level and area-level small area estimation. 2018. URL https://CRAN.R-project.org/package=JoSAE. R package version 0.3.0. +
    +
    +H. Bühlmann and A. Gisler. A course in credibility theory and its applications. Springer, 2005. +
    +
    +E. Cantoni, N. Jacot and P. Ghisletta. Review and comparison of measures of explained variation and model selection in linear mixed-effects models. Econometrics and Statistics, 2021. +
    +
    +L. Christiaensen, P. Lanjouw, J. Luoto and D. Stifel. Small area estimation-based prediction methods to track poverty: Validation and applications. The Journal of Economic Inequality, 10(2): 267–297, 2012. +
    +
    +A. Chwila and T. Żądło. On properties of empirical best predictors. Communications in Statistics-Simulation and Computation, 1–34, 2019. +
    +
    +D. Cook, D. F. Swayne and A. Buja. Interactive and dynamic graphics for data analysis: With r and GGobi. Springer, 2007. +
    +
    +R. E. Fay III and R. A. Herriot. Estimates of income for small places: An application of james-stein procedures to census data. Journal of the American Statistical Association, 74(366a): 269–277, 1979. +
    +
    +E. W. Frees, V. R. Young and Y. Luo. A longitudinal data analysis interpretation of credibility models. Insurance: Mathematics and Economics, 24(3): 229–247, 1999. +
    +
    +A. Gelman and J. Hill. Data Analysis Using Regression and Multilevel/Hierarchical Models. 1st edition Cambridge ; New York: Cambridge University Press, 2006. +
    +
    +A. Gelman and I. Pardoe. Bayesian measures of explained variance and pooling in multilevel (hierarchical) models. Technometrics, 48(2): 241–251, 2006. +
    +
    +W. González-Manteiga, M. J. Lombardı́a, I. Molina, D. Morales and L. Santamarı́a. Bootstrap mean squared error of small-area EBLUP. Journal of Statistical Computation and Simulation, 78: 443–462, 2008. +
    +
    +W. González-Manteiga, M. J. Lombardı́a, I. Molina, D. Morales and L. Santamarı́a. Estimation of the mean squared error of predictors of small area linear parameters under a logistic mixed model. Computational Statistics & Data Analysis, 51: 2720–2733, 2007. +
    +
    +C. R. Henderson. Estimation of genetic parameters. Biometrics, 6(2): 186–187, 1950. +
    +
    +T. Hobza and D. Morales. Empirical best prediction under unit-level logit mixed models. Journal of official statistics, 32(3): 661–692, 2016. +
    +
    +J. Jiang. REML estimation: Asymptotic behavior and related topics. The Annals of Statistics, 24(1): 255–286, 1996. +
    +
    +R. N. Kackar and D. A. Harville. Unbiasedness of two-stage estimation and prediction procedures for mixed linear models. Communications in statistics-theory and methods, 10(13): 1249–1261, 1981. +
    +
    +A.-K. Kreutzmann, S. Pannier, N. Rojas-Perilla, T. Schmid, M. Templ and N. Tzavidis. The r package emdi for estimating and mapping regionally disaggregated indicators. Journal of Statistical Software, 91: 2019. +
    +
    +M. D. E. Lefler, D. M. Gonzalez and A. P. Martin. Saery: Small area estimation for rao and yu model. 2014. URL https://CRAN.R-project.org/package=saery. R package version 1.0. +
    +
    +C. Lin, A. Gelman, P. N. Price and D. H. Krantz. Analysis of local decisions using hierarchical modeling, applied to home radon measurement and remediation. Statistical Science, 14(3): 305–337, 1999. +
    +
    +A. Loy. Diagnostics for mixed/hierarchical linear models. 2013. +
    +
    +A. Loy and H. Hofmann. Are you normal? The problem of confounded residual structures in hierarchical linear models. Journal of Computational and Graphical Statistics, 24(4): 1191–1209, 2015. +
    +
    +A. Loy and H. Hofmann. HLMdiag: A suite of diagnostics for hierarchical linear models in R. Journal of Statistical Software, 56(5): 1–28, 2014. URL https://www.jstatsoft.org/article/view/v056i05. +
    +
    +A. Loy, H. Hofmann and D. Cook. Model choice and diagnostics for linear mixed-effects models using statistics on street corners. Journal of Computational and Graphical Statistics, 26(3): 478–492, 2017. +
    +
    +I. Molina and Y. Marhuenda. sae: An R package for small area estimation. The R Journal, 7(1): 81–98, 2015. URL https://journal.r-project.org/archive/2015/RJ-2015-007/RJ-2015-007.pdf. +
    +
    +I. Molina and J. Rao. Small area estimation of poverty indicators. Canadian Journal of Statistics, 38(3): 369–385, 2010. +
    +
    +A. Nero, S. Leiden, D. Nolan, P. Price, S. Rein, K. Revzan, H. Woolenberg and A. Gadgil. Statistically based methodologies for mapping of radon 'actual' concentrations: The case of Minnesota. Radiation Protection Dosimetry, 56(1-4): 215–219, 1994. +
    +
    +N. Permatasari and A. Ubaidillah. Msae: Multivariate fay herriot models for small area estimation. 2021. URL https://CRAN.R-project.org/package=msae. R package version 0.1.4. +
    +
    +P. N. Price and A. Gelman. Should you measure the radon concentration in your home? In Statistics: A Guide to the Unknown, 4th edition pages. 149–170 2005. Belmont, CA: Duxbury Press. ISBN 978-0-534-37282-8. +
    +
    +P. N. Price, A. V. Nero and A. Gelman. Bayesian prediction of mean indoor radon concentrations for minnesota counties. Health Physics, 71(6): 922–936, 1996. +
    +
    +J. N. Rao and I. Molina. Small area estimation. John Wiley & Sons, 2015. +
    +
    +J. N. Rao and M. Yu. Small-area estimation by combining time-series and cross-sectional data. Canadian Journal of Statistics, 22(4): 511–528, 1994. +
    +
    +R. M. Royall. The linear least-squares prediction approach to two-stage sampling. Journal of the American Statistical Association, 71(355): 657–664, 1976. +
    +
    +A. Wolny-Dominiak and T. Żądło. On bootstrap estimators of some prediction accuracy measures of loss reserves in a non-life insurance company. Communications in Statistics-Simulation and Computation, 1–16, 2020. +
    +
    +A. Wolny-Dominiak and T. Żądło. Qape: Quantile of absolute prediction errors. 2023. URL https://CRAN.R-project.org/package=qape. R package version 2.0. +
    +
    +T. Żądło. On parametric bootstrap and alternatives of MSE. In Proceedings of 31st international conference mathematical methods in economics, pages. 1081–1086 2013. +
    +
    +T. Żądło. On prediction of population and subpopulation characteristics for future periods. Communications in Statistics-Simulation and Computation, 461(10): 8086–8104, 2017. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Wolny--Dominiak & Ża̧dło, "Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-004,
    +  author = {Wolny--Dominiak, Alicja and Ża̧dło, Tomasz},
    +  title = {Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-004},
    +  doi = {10.32614/RJ-2024-004},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {67-82}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-004/RJ-2024-004.pdf b/_articles/RJ-2024-004/RJ-2024-004.pdf new file mode 100644 index 0000000000..de65183731 Binary files /dev/null and b/_articles/RJ-2024-004/RJ-2024-004.pdf differ diff --git a/_articles/RJ-2024-004/RJ-2024-004.zip b/_articles/RJ-2024-004/RJ-2024-004.zip new file mode 100644 index 0000000000..a4db7ecc36 Binary files /dev/null and b/_articles/RJ-2024-004/RJ-2024-004.zip differ diff --git a/_articles/RJ-2024-004/RJournal.sty b/_articles/RJ-2024-004/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-004/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-004/RJwrapper.md b/_articles/RJ-2024-004/RJwrapper.md new file mode 100644 index 0000000000..04c80227e6 --- /dev/null +++ 
b/_articles/RJ-2024-004/RJwrapper.md @@ -0,0 +1,1239 @@ +--- +abstract: | + The paper presents a new R package + [**qape**](https://CRAN.R-project.org/package=qape) for prediction, + accuracy estimation of various predictors and Monte Carlo simulation + studies of properties of both predictors and estimators of accuracy + measures. It allows to predict any population and subpopulation + characteristics of the response variable based on the Linear Mixed + Model (LMM). The response variable can be transformed, e.g. to + logarithm and the data can be in the cross-sectional or longitudinal + framework. Three bootstrap algorithms are developed: parametric, + residual and double, allowing to estimate the prediction accuracy. + Analyses can also include Monte Carlo simulation studies of properties + of the methods used. Unlike other packages, in the prediction process + the user can flexibly define the predictor, the model, the + transformation function of the response variable, the predicted + characteristics and the method of accuracy estimation. +address: +- | + Alicja Wolny--Dominiak\ + Department of Statistical and Mathematical Methods in Economics\ + University of Economics in Katowice\ + 50, 1 Maja Street\ + 40--287 Katowice\ + Poland\ +- | + Tomasz Ża̧dło\ + Department of Statistics, Econometrics and Mathematics\ + University of Economics in Katowice\ + 50, 1 Maja Street\ + 40--287 Katowice\ + Poland\ +author: +- by Alicja Wolny--Dominiak and Tomasz Ża̧dło +bibliography: +- wolny-zadlo.bib +title: Prediction, Bootstrapping and Monte Carlo Analyses Based on + Linear Mixed Models with QAPE 2.0 Package +--- + +::::: article +## Introduction {#intro} + +One of the tasks in application of mixed models in the real-life +problems is the prediction of random effects. Then, the predicted values +give the possibility for further prediction, e.g. 
characteristics of +interest such as sum, mean or quantiles or the future value of the +response variable for cross-sectional or longitudinal data. + +Three main predictors of these characteristics are proposed in the +literature: Empirical Best Linear Unbiased Predictors - EBLUPs (see e.g. +[@henderson1950estimation] and [@royall1976linear]), PLUG-IN predictors +(see e.g. [@boubeta2016empirical], [@chwila2019properties], +[@hobza2016empirical]) and Empirical Best Predictors - EBPs (see e.g. +[@molina2010small]). Each assumes the LMM to model the response +variable. + +The numerous successful applications of these three predictors for +cross-sectional and longitudinal data can be found in the model approach +in survey sampling, including the small area estimation. In paper +[@fay1979estimates] the Authors introduce the prediction of the mean +income for small places based on the special case of the LMM model +called Fay-Herriot model and the EBLUP. The analysis of poverty is +extended in many works, e.g. in [@molina2010small] and +[@christiaensen2012]. In turn, in [@SAE1988] the Authors analyse the +total crop areas based on survey and satellite data using EBLUPs. The +proposed LMM model is known as the Battese-Harter-Fuller model. The +predictors are also exploited in the subject of experience rating in +non-life insurance, see [@frees1999] and [@buhlmann2005], where the +longitudinal data are under consideration. The insurance premium for the +next period for every policy in the insurance portfolio is predicted. + +A major challenge in this type of prediction is the estimation of the +prediction accuracy measure. Most often it is the Root Mean Squared +Error (RMSE), which is given in analytical form or can be e.g. estimated +using bootstrap. A feature of the distribution of the squared prediction +error is usually a very strong positive asymmetry. 
Because the mean is +not recommended as the appropriate measure of the central tendency in +such distributions, the alternative prediction accuracy measure called +the Quantile of Absolute Prediction Errors (QAPE), proposed by +[@zadlo2013parametric] and [@zadlo2020bootstrap], can be applied. + +There is a variety of R packages to calculate the considered predictors +together with the accuracy measure of prediction, usually the RMSE. The +package [**sae**](https://CRAN.R-project.org/package=sae), see [@sae], +provides EBLUPs based on Fay-Herriot and Battese-Harter-Fuller models. +In turn, the multivariate EBLUP for Fay-Herriot models is implemented in +[**msae**](https://CRAN.R-project.org/package=msae), see [@msae]. +Several EBLUPs introduced in [@rao1994small] are implemented in package +[**saery**](https://CRAN.R-project.org/package=saery) introduced by +[@saery], likewise in +[**JoSAE**](https://CRAN.R-project.org/package=JoSAE), see [@josae], but +with additional heteroscedasticity analysis. The EBP is provided in the +package [**emdi**](https://CRAN.R-project.org/package=emdi) described in +[@kreutzmann2019r]. + +A new package in this area is our proposed package +[**qape**](https://CRAN.R-project.org/package=qape). It allows the +prediction of flexibly defined characteristics of the response variable +using the above three predictors, assuming an appropriate LMM. A novel +feature of the package +[**qape**](https://CRAN.R-project.org/package=qape), compared to those +already in place, is the ability of bootstrap estimation of the +prediction accuracy measures, both the RMSE and QAPE. Three types of +bootstrap procedures are provided: parametric, residual and double. + +There are three groups of functions in this package: predictors values +calculation, bootstrap estimation of RMSE and QAPE measures, and Monte +Carlo (MC) analysis of properties of predictors and prediction accuracy +estimators. 
The prediction is based on a LMM model defined by the user +and allows the user to predict the population characteristics of the response +variable, which can be defined by a linear combination (in the case of +EBLUP), by any R function (e.g. `sum`) or any function defined by the +user (in the case of the EBP and PLUG-IN predictors). The package allows +for full flexibility in defining: the model, the predicted +characteristic, and the transformation of the response variable. + +This paper is organized as follows. Firstly, the background of the LMM +is presented together with the theoretical foundations of the prediction +including prediction accuracy measures. Then, the package functionality +in the area of prediction is presented and illustrated. A short +application based on `radon` data, a cross-sectional dataset available +in [**HLMdiag**](https://CRAN.R-project.org/package=HLMdiag) package, to +predict three subpopulation characteristics is shown. Subsequently, the +theoretical background of the prediction accuracy measures estimation +based on bootstrap is presented. Implementations of bootstrap algorithms +in [**qape**](https://CRAN.R-project.org/package=qape) are briefly +introduced. Finally, the procedure of the model-based Monte Carlo +simulation study is discussed. The paper ends with a conclusion. + +## Prediction accuracy measures {#PAM} + +We consider the problem of prediction of any given function of the +population vector $\mathbf{Y}$ of the response variable: +$$\label{theta} +\theta = f_{\theta}(\mathbf{Y}) (\#eq:theta)$$ +under the LMM. It covers linear combinations of $\mathbf{Y}$ (such as +one future realization of the response variable or population and +subpopulation means and totals) but also other population and +subpopulation characteristics such as quantiles and variability measures. + +To assess the accuracy of the particular predictor $\hat \theta$, +firstly, the prediction error is defined as $U=\hat{\theta}-\theta$.
+Therefore, the well-known RMSE has the following formula: +$$\label{eq0} + RMSE(\hat{\theta})=\sqrt{E(\hat{\theta}-\theta)^{2}}=\sqrt{E({{U}^{2}})}. (\#eq:eq0)$$ +The alternative to the RMSE based on the mean could be the QAPE based on +quantiles. It represents the $p$th quantile of the absolute prediction +error $|U|$, see [@zadlo2013parametric] and [@zadlo2020bootstrap], and +it is given by: +$$\label{eq1} + QAPE_p(\hat{\theta}) = \inf \left\{ {x:P\left( {\left| {{\hat{\theta}-\theta}} \right| \le x} \right) \ge p} \right\} =\inf \left\{ {x:P\left( {\left| {{U}} \right| \le x} \right) \ge p} \right\} (\#eq:eq1)$$ +This measure informs that at least $p100\%$ of observed absolute +prediction errors are smaller than or equal to $QAPE_p(\hat{\theta})$, +while at least $(1-p)100\%$ of them are higher than or equal to +$QAPE_p(\hat{\theta})$. Quantiles reflect the relation between the +magnitude of the error and the probability of its realization. It means +that using the QAPE, it is possible to make a full description of the +distribution of prediction errors instead of using the average +(reflected by the RMSE). Furthermore, the MSE is the mean of positively +(usually very strongly) skewed squared prediction errors, where the mean +should not be used as a measure of the central tendency of positively +skewed distributions. + +The above described accuracy prediction measures RMSE and QAPE can be +estimated using the bootstrap techniques. Their estimators as well as +the bootstrap distributions of the prediction errors based on any +(assumed or misspecified) model are provided in +[**qape**](https://CRAN.R-project.org/package=qape) package, including +algorithms where the parallel computing is used. + +In the [**qape**](https://CRAN.R-project.org/package=qape) package, the +whole prediction process has its own specific procedure, which can be +presented in the following steps. + +::: {#Proc1 .procedure} +**Procedure 1**. 
*The process of prediction, accuracy measures +estimation and Monte Carlo simulation analyses in +[**qape**](https://CRAN.R-project.org/package=qape) * + +1. *Define the characteristics of the response variable to predict,* + +2. *provide the information on sample and population values,* + +3. *define the LMM,* + +4. *estimate parameters of the LMM,* + +5. *predict the random variable $\theta$ using the chosen class of + predictors,* + +6. *estimate the prediction accuracy measures RMSE and QAPE using one + of the developed bootstrap algorithms,* + +7. *conduct simulation analyses of properties of predictors and + accuracy measures estimators under any (also misspecified) LMM + model.* +::: + +## The prediction under LMM + +The main functions of the +[**qape**](https://CRAN.R-project.org/package=qape) package provide the +bootstrap estimation of prediction accuracy measures. However, it must +be preceded by the prediction process, including the choice of the LMM +and the predictor. + +### The model + +Let $\mathbf{Y}$ denote the vector of response variables +$Y_1, Y_2,..., Y_N$. Assuming, without a loss of generality, that only +the first $n$ realizations of $Y_i$ are observed, $\mathbf{Y}$ can be +decomposed as $\mathbf{Y}= +\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{Y}_r^T +\end{bmatrix}^T$ , where $\mathbf{Y}_s$ and $\mathbf{Y}_r$ are of +dimension $n \times 1$ and $(N - n) \times 1$, respectively. In all +notations, the subscript \"s\" is used for observed realizations of the +variable of interest and \"r\" for the unobserved ones. Two known +matrices of auxiliary variables are also considered, denoted by +$\mathbf{X}$ and $\mathbf{Z}$, which are associated with fixed and +random effects, respectively. The $\mathbf{X}$ matrix is of dimension +$N \times p$, and it consists of $p$ regression variables. 
It can be +decomposed like $\mathbf{Y}$ as follows: $\mathbf{X}= +\begin{bmatrix} + \mathbf{X}_s^T & \mathbf{X}_r^T +\end{bmatrix}^T$, where matrices $\mathbf{X}_s$ and $\mathbf{X}_r$, both +known, are of dimension $n \times p$ and $(N-n) \times p$, respectively. +Similarly, the $\mathbf{Z}$ matrix of dimension $N \times h$ can be +written as follows: $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_s^T & \mathbf{Z}_r^T +\end{bmatrix}^T$, where matrices $\mathbf{Z}_s$ and $\mathbf{Z}_r$, both +known, are of dimension $n \times h$ and $(N-n) \times h$, respectively. + +Then, let $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ denote the +LMM of the following form (e.g. [@rao2015small], p. 98): +$$\label{LMM} + \left\{ \begin{array}{c} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}\mathbf{v}+\mathbf{e} \\ + E(\mathbf{e})=\mathbf{0}, E(\mathbf{v})=\mathbf{0} \\ + Var(\mathbf{e})=\mathbf{R}(\pmb{\delta}), Var(\mathbf{v})=\mathbf{G}(\pmb{\delta}) + \end{array} \right. (\#eq:LMM)$$ +The vector of parameters in model (\@ref(eq:LMM)) is then +$\boldsymbol{\psi}=\begin{bmatrix} + \boldsymbol{\beta}^T & \pmb{\delta}^T +\end{bmatrix}^T$, where $\boldsymbol{\beta}$ is a vector of fixed +effects of dimension $p \times 1$ and $\pmb{\delta}$ is a vector of +variance components. The random part of the model is described by the +known matrix $\mathbf{Z}$, a vector $\mathbf{v}$ of random effects of +dimension $h \times 1$ and a vector $\mathbf{e}$ of random components of +dimension $N\times 1$, where $\mathbf{e}$ and $\mathbf{v}$ are assumed +to be independent. The vector of random components $\mathbf{e}$ will be +decomposed similarly to the vector $\mathbf{Y}$, i.e. +$\mathbf{e}=\begin{bmatrix} + \mathbf{e}_s^T & \mathbf{e}_r^T +\end{bmatrix}^T$.
the grouping variables taken into account in the random part +of the model. In this case, without a loss of the generality, the LMM +model can be written as follows: +$$\label{LMMa} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}_1\mathbf{v}_1+...+\mathbf{Z}_l\mathbf{v}_l+...+\mathbf{Z}_L\mathbf{v}_L+\mathbf{e}, (\#eq:LMMa)$$ +where $\mathbf{v}_1,\dots,\mathbf{v}_l,\dots,\mathbf{v}_L$ are +independent vectors of random effects assumed for different divisions of +the $\mathbf{Y}$ vector (under different grouping of the data) and +$\mathbf{Z}_1, \dots, \mathbf{Z}_l, \dots, \mathbf{Z}_L$ are known +matrices of auxiliary variables associated with random effects. Writing +in (\@ref(eq:LMMa)): $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_1 & \dots & \mathbf{0} & \dots & \mathbf{0} \\ + \vdots & \ddots & & & \vdots \\ + \mathbf{0} & \dots & \mathbf{Z}_l & \dots & \mathbf{0} \\ + \vdots & & & \ddots & \vdots \\ + \mathbf{0} & \dots & \mathbf{0} & \dots & \mathbf{Z}_L \\ +\end{bmatrix}$ and $\mathbf{v}= +\begin{bmatrix} + \mathbf{v}_1^T & \dots & \mathbf{v}_l^T & \dots & \mathbf{v}_L^T \\ +\end{bmatrix}^T$ the LMM model is obtained. Let + +$$\label{vl} +\mathbf{v}_l=\left[ \mathbf{v}_{l1}^T \dots \mathbf{v}_{lk}^T \dots \mathbf{v}_{lK_l}^T \right]^T (\#eq:vl)$$ +be of dimension $K_l J_l \times 1$, where $\mathbf{v}_{lk}$ is of +dimension $J_l \times 1$ for all $k=1,...,K_l$ and $K_l$ is the number +of random effects at the $l$th level of grouping. Hence, $\mathbf{Z}_l$ +is $N \times K_l J_l$. For example, if the random regression coefficient +model is considered with two random coefficients where both random +effects are subpopulation-specific, where $D$ is the number of +subpopulations, then $L=1$, $K_1=2$ and $J_1=D$. + +### Predictors + +In the [**qape**](https://CRAN.R-project.org/package=qape) package, in +the general case the predicted characteristic is given by any function +of response variables: +$$\label{ftheta} +\theta = f_{\theta}(\mathbf{Y}). 
(\#eq:ftheta)$$ +Under the $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ model it +could be predicted using one of three predictors: + +1. Empirical Best Linear Unbiased Predictor (EBLUP), + +2. Empirical Best Predictor (EBP) under nested error LMM, + +3. PLUG-IN predictor under the LMM. + +The first predictor (EBLUP) allows to predict the linear combination of +the response variables: +$$\label{l.theta} +\theta = f_{\theta}(\mathbf{Y}) = \boldsymbol{\gamma}^T \mathbf{Y}= \boldsymbol{\gamma}_s^T \mathbf{Y}_s + \boldsymbol{\gamma}_r^T \mathbf{Y}_r, (\#eq:l-theta)$$ +where $\boldsymbol{\gamma}$ is a vector of weights. In this case, the +predicted characteristic $\theta$ is basically the linear combination of +the response variable. For example, if one of the elements of +$\boldsymbol{\gamma}$ equals 1 and the rest of the elements equals 0, +then one realization of the response variable is predicted. If all +elements in $\boldsymbol{\gamma}$ vector equal 1, then $\theta$ becomes +the sum of all $Y_i$'s in the whole considered population dataset. The +two-stage EBLUP corresponds to the Best Linear Unbiased Predictor (BLUP) +introduced in [@henderson1950estimation] and [@royall1976linear] as: +$$\label{BLUP} + \hat{\theta}^{BLUP} (\pmb{\delta}) = {\boldsymbol{\gamma}}_s^T \mathbf{Y}_s + \hat{\theta}_r(\pmb{\delta}), (\#eq:BLUP)$$ +where the predictor of the linear combination +$\boldsymbol{\gamma}_r^T \mathbf{Y}_r$ of unobserved random variables is +given by +$\hat{\theta}_r(\pmb{\delta})={\boldsymbol{\gamma }}_r^T {{\mathbf{X}}_r}{\tilde{\boldsymbol{\beta}} }(\pmb{\delta}) +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{\tilde{v}}}(\pmb{\delta})$, +where $\tilde{\boldsymbol{\beta}}(\pmb{\delta})$ is the Best Linear +Unbiased Estimator of $\boldsymbol{\beta}$ and +$\tilde{\mathbf{v}}(\pmb{\delta})$ is the Best Linear Unbiased Predictor +of $\mathbf{v}$, both presented in (\@ref(eq:LMM)). As shown by +[@zadlo2017EBLUP] p. 
8094, if +$Cov(\mathbf{e}_r, \mathbf{e}_s)=\mathbf{0}$, then the predictor +(\@ref(eq:BLUP)) is the BLUP of $\theta$ defined as the linear +combination (\@ref(eq:l-theta)). Even if +$Cov(\mathbf{e}_r, \mathbf{e}_s) \neq \mathbf{0}$, the predictor +$\hat{\theta}_r(\pmb{\delta})$ is the Best Linear Unbiased Predictor of +the following linear combination of $\boldsymbol{\beta}$ and +$\mathbf{v}$: +${\boldsymbol{\gamma }}_r^T{{\mathbf{X}}_r}{ {\boldsymbol{\beta}} } +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{{v}}}$. +The EBLUP $\hat\theta^{EBLUP}$ is obtained by replacing the vector of +variance components $\pmb{\delta}$ in BLUP (\@ref(eq:BLUP)) with the +estimator $\hat{\pmb{\delta}}$. If (a) the expectation of the predictor +is finite, (b) $\hat{\pmb{\delta}}$ is any even, translation-invariant +estimator of $\pmb{\delta}$, (c) the distributions of both random +effects and random components are symmetric around $\mathbf{0}$ (not +necessarily normal), the EBLUP remains unbiased, as proved by +[@kackar1981unbiasedness]. + +To introduce the second predictor, called EBP, considered e.g. by +[@molina2010small], firstly, the Best Predictor (BP) $\hat{\theta}^{BP}$ +of characteristic $\theta(\mathbf{Y})$ has to be defined. It is computed +by minimizing the Mean Squared Error +$MSE(\hat\theta )=E(\hat\theta - \theta)^2$ and can be written as +$\hat\theta^{BP} = E(\theta|\mathbf{Y}_s)$. It means that the +conditional distribution of $\mathbf{Y}_r|\mathbf{Y}_s$ must be known to +compute its value while at least the parameters of this distribution, +denoted by $\boldsymbol{\psi}$ in (\@ref(eq:LMM)), are unknown. The EBP +$\hat\theta^{EBP}$ is obtained by replacing these parameters with +estimators $\hat{\boldsymbol{\psi}}$. Its value can be computed +according to the Monte Carlo procedure presented in the supplementary +document for this paper. + +The last predictor is the PLUG-IN predictor defined as (e.g. 
+[@chwila2019properties]): +$$\hat{\theta}^{PLUG-IN}=\theta(\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{\hat{Y}}_r^T + \end{bmatrix}^T),$$ +where $\mathbf{\hat{Y}}_r$ is the vector of fitted values of unobserved +random variables under the assumed model (any model specified by the +statistician). Under the LMM and if the linear combination of +$\mathbf{Y}$ is predicted, the PLUG-IN predictor is the EBLUP, but +generally, it is not optimal. However, it was shown in simulation +studies that it can have similar or even higher accuracy compared to +empirical (estimated) best predictors, where the best predictors +minimize the prediction mean squared errors (cf. e.g. +[@boubeta2016empirical], [@chwila2019properties], +[@hobza2016empirical]). Moreover, the PLUG-IN predictor is less +computationally demanding than the EBP. + +### Predictors in [**qape**](https://CRAN.R-project.org/package=qape) + +To deal with the LMM model, the +[**qape**](https://CRAN.R-project.org/package=qape) package uses the +`lmer()` function from the +[**lme4**](https://CRAN.R-project.org/package=lme4) package, see +[@lme4]. Assuming (\@ref(eq:LMM)) and based on $\mathbf{Y}_s$, the +vector of model parameters +$\boldsymbol{\psi} = [\boldsymbol{\beta}^T, \pmb{\delta}^T]^T$ is +estimated using the Restricted Maximum Likelihood Method (REML), known +to be robust to non-normality, see e.g. [@jiang1996reml], and +$\hat{\boldsymbol{\psi}}$ is obtained. + +In order to obtain the predictor of $\theta$, one of the three +[**qape**](https://CRAN.R-project.org/package=qape) functions can be +applied: `EBLUP()`, `ebpLMMne()` or `plugInLMM()`. Firstly, the +characteristic of response variables of interest has to be defined. It +is actually obvious for EBLUP, which can be used only to predict the +population/subpopulation linear combination (e.g. the sum) by using the +argument `gamma` equivalent to the population vector of weights +$\boldsymbol{\gamma}$ in (\@ref(eq:l-theta)).
For the other two predictors, +the EBP and the PLUG-IN, the input argument called `thetaFun` has to be +given (see $f_{\theta}(.)$ in (\@ref(eq:ftheta))). Function `thetaFun` +could define one characteristic or a vector of characteristics, for +example: + +``` r +> thetaFun1 <- function(x) median(x) +> thetaFun2 <- function(x) c(sum(x), mean(x), sd(x)) +``` + +Secondly, two groups of input arguments, common to all three predictors, +have to be provided: + +- group 1 - arguments defining the sample and the population + + - `YS` - values of the dependent variable in the sample + ($\mathbf{Y}_s$), + + - `reg` - the population matrix of auxiliary variables named in + `fixed.part`, `random.part` and `division`, + + - `con` - the population $0-1$ vector with $1$s for elements in + the sample and $0$s for elements which are not in the sample, + +- group 2 - arguments defining the model + + - `fixed.part` - fixed-effects terms declared as in `lme4::lmer` + function, + + - `random.part` - random-effects terms declared as in `lme4::lmer` + function, + + - `weights` - the population vector of weights. + +The weights make it possible to include heteroscedasticity of random +components in the LMM. + +In `EBLUP()` and `plugInLMM()` the random-effects terms of the LMM have +to be declared as the input argument `random.part`. The form of the +`ebpLMMne` predictor, in turn, requires defining in the `ebpLMMne()` +function the so-called `division` argument instead of `random.part`. +This input represents the variable dividing the population dataset into +subsets, which are taken into account in the nested error linear mixed +model with '`division`'-specific random components (presented in +supplementary document for this paper). + +In the process of prediction, it is often necessary to perform data +transformation before estimating the model parameters. An example is the +logarithmic scaling of the variable of interest.
The +[**qape**](https://CRAN.R-project.org/package=qape) package offers the +possibility for declaring the argument `backTrans` to conduct the data +back-transformation. Hence, a very flexible solution is used which +allows to use any transformation of the response variable such that the +back-transformation can be defined. This argument (available in R or +defined by the user function) should be the back-transformation function +of the already transformed dependent variable used to define the model, +e.g. for log-transformed `YS` used as the response variable: + +``` r +> backTrans <- function(x) exp(x) +``` + +The main output is the value of predictor `thetaP`. For each class of +predictors, there are two S3 methods registered for existing generic +functions `print` and `summary`. The full list of output arguments is +presented in detail in the `qape-manual` file, cf. [@qape]. + +### Radon data and the model + +In order to demonstrate the functionality of the package's main +functions, in the following examples the `radon` dataset available in +[**HLMdiag**](https://CRAN.R-project.org/package=HLMdiag) package +([@HLMdiag]) is analyzed. It contains the results of a survey measuring +radon concentrations in 919 owner-occupied homes in 85 counties of +Minnesota (see Figure \@ref(fig:map)). A study was conducted in +1987-1988 by the Minnesota Department of Health, showing that indoor +radon levels are higher in Minnesota compared to typical levels in the +U.S. In the data, the response variable `log.radon` (denoted in +(\@ref(eq:radon-model)) by $log(Y_{ic})$) is the radon measurement in +logarithms of picoCurie per liter. 
The independent variables, on the +other hand, are: `uranium` ($x_{1ic}$) the average county-level soil +uranium content, `basement` ($x_{2ic}$) the 0-1 variable indicating the +level of the home at which the radon measurement was taken - 0 for +basement, 1 for the first floor, and `county` (denoted by subscript $c$ +in (\@ref(eq:radon-model))) is county ID. + +```{r map, echo=FALSE , fig.cap="The maps of characteristics of radon concentration in counties in picoCurie per liter. The gray colour means that the value is NA (Not Available)", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("mapaAll.png")) +``` + +In all considered examples, the prediction for the county no. 26 +(`county == 26`) is conducted and it is assumed that the observations in +this county from the first floor (`basement == 1`) are not available +(see Figure \@ref(fig:boxplot)). + +```{r boxplot, echo=FALSE , fig.cap="The distributions of radon concentration in picoCurie per liter in counties. The red line indicates county no. 26", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("boxAll.png")) +``` + +The `radon` dataset is widely discussed in the literature. In the paper +[@nero1994statistically], the Authors used an ordinary regression model +to predict county geometric means of radon concentration using surficial +soil radium data from the National Uranium Resource Evaluation. In turn, +the paper [@price1996bayesian] focuses on the prediction of the +geometric mean of radon for each county, but using a Bayesian approach. 
+For the `radon` data we use the following model +$$\label{radon.model} + log(Y_{ic}) = \beta_1 x_{1ic} + (\beta_2 + v_{1c}) x_{2ic} + \beta_0 + v_{2c} + e_{ic}, (\#eq:radon-model)$$ +where $i=1,2,\dots,N$, $c=1,2,\dots, C$, $N = 919$ observations, +$C = 85$ counties, $\beta_1$, $\beta_2$ and $\beta_0$ are unknown fixed +effects, $v_{1c}$ and $v_{2c}$ are random effects, $e_{ic}$ are random +components, $v_{1c}$, and $e_{ic}$ are mutually independent, $v_{2c}$ +and $e_{ic}$ are mutually independent too, $Cor(v_{1c}, v_{2c}) = \rho$, +$v_{1c} \sim (0, \sigma^2_{v_1})$, $v_{2c} \sim (0, \sigma^2_{v_2})$ and +$e_{ic} \sim (0, \sigma^2_e)$. As can easily be seen, the considered +model is the random coefficient model with two correlated +`county`-specific random effects. Its syntax written using the package +[**lme4**](https://CRAN.R-project.org/package=lme4) notation is as +follows: + +``` r +radon.model <- lmer(log.radon ~ basement + uranium + (basement | county), data = radon) +``` + +This and similar LMMs are considered, analyzed, and used for the +considered dataset in many publications, with a good overview presented +in [@gelman_data_2006]. In [@gelman2006bayesian], based on their +preceding research [@price1996bayesian], [@gelman1999analysis], +[@peck_should_2005], a very similar model but with additional +multivariate normality assumptions is studied, verified and chosen as +fitting well to the data within a Bayesian framework. The same model as +in [@gelman2006bayesian] with its special cases is considered in +[@cantoni2021review] but within the frequentist approach. Based on 25 +measures of explained variation and model selection, the Authors +conclude that the same model as considered in our paper (with additional +normality assumption, however, which is not used in all cases considered +in that paper), \"seems the best\" [@cantoni2021review p. 10] for the +`radon` data. 
Further tests of the model are presented by
+[@loy2013diagnostics], [@loy2015you] and [@loy2017model] (see also
+[@cook2007interactive] for the introduction of the methodology) showing
+among others: the normality and homoscedasticity of random components,
+the normality of the distribution of the random slope but -- what is
+important for our further considerations -- the lack of the normality of
+the random intercept. Since the problem of choosing and verifying a
+model for the considered dataset is widely discussed in the literature,
+we will focus on the issues that are new in this case, namely the
+problem of prediction and estimation of the prediction accuracy as well
+as the Monte Carlo analysis of predictors' properties.
+
+### Example 1
+
+This example shows the prediction procedure in the package
+[**qape**](https://CRAN.R-project.org/package=qape). In the first step,
+it is needed to define all the input arguments that will then be passed
+to the prediction functions.
+
+``` r
+> Ypop <- radon$log.radon # the population vector of the dependent variable
+> # It is assumed that observations from the first floor
+> # in county no. 26 are not available:
+> con <- rep(1, nrow(radon))
+> con[radon$county == 26 & radon$basement == 1] <- 0
+> YS <- Ypop[con == 1] # sample vector of the dependent variable
+> reg <- dplyr::select(radon, -log.radon) # the population matrix of auxiliary variables
+> fixed.part <- 'basement + uranium' # the fixed part of the considered model
+> random.part <- '(basement|county)' # the random part of the considered model
+> # The vector of weights to define
+> # the predicted linear combination - the mean for county == 26:
+> gamma <-
++ (1 / sum((radon$county == 26))) * ifelse((radon$county == 26), 1, 0)
+> estMSE <- TRUE # to include the naive MSE estimator of the EBLUP in the output
+```
+
+Then the functions corresponding to each predictor can be used.
First,
+the EBLUP prediction in the package
+[**qape**](https://CRAN.R-project.org/package=qape) is presented. As the
+EBLUP is limited to the linear combination of random variables, the
+predicted characteristic is simply the arithmetic mean. To be precise,
+it is the mean of logarithms of measurements (instead of the mean of
+measurements), because the EBLUP can be used only under the linear
+(linearized) models. As in the LMM the homoscedasticity of random
+components is assumed, the input argument `weights = NULL` is set up.
+
+``` r
+> myeblup <- EBLUP(YS, fixed.part, random.part, reg, con, gamma, weights = NULL, estMSE)
+> # the value of the predictor of the arithmetic mean
+> # of logarithms of radon measurements:
+> myeblup$thetaP
+[1] 1.306916
+> myeblup$neMSE # the value of the naive MSE estimator
+[1] 0.002292732
+```
+
+Hence, the predicted value of the arithmetic mean of logarithms of radon
+measurements equals $1.306916$ log picoCurie per liter. The estimated
+root of prediction MSE equals $\sqrt{0.002292732} \approx 0.048$ log
+picoCurie per liter, but -- what is important -- it is the value of the
+naive RMSE estimator [as defined by @rao2015small p. 106], which means
+that it ignores the decrease of accuracy due to the estimation of model
+parameters.
+
+The second part of this example shows the prediction of the arithmetic
+mean, geometric mean and median of radon measurements (not logarithm of
+radon measurements) in county no. 26 with the use of the PLUG-IN
+predictor. It requires the setting of two input arguments: `thetaFun`
+and `backTrans`.
+
+``` r
+> thetaFun <- function(x) {
++ c(mean(x[radon$county == 26]), psych::geometric.mean(x[radon$county == 26]),
++ median(x[radon$county == 26]))
++ }
+> backTransExp <- function(x) exp(x) # back-transformation
+> myplugin <- plugInLMM(YS, fixed.part, random.part, reg, con, weights = NULL,
++ backTrans = backTransExp, thetaFun)
+> # values of the predictor of arithmetic mean, geometric mean
+> # and median of radon measurements:
+> myplugin$thetaP
+[1] 3.694761 4.553745 3.900000
+```
+
+In this case we can conclude that the predicted values of the
+arithmetic mean, geometric mean and median in county no. 26 equal:
+$3.694761$, $4.553745$ and $3.9$ picoCurie per liter, respectively. The
+problem of prediction accuracy estimation will be discussed in the next
+sections of the paper.
+
+The [**qape**](https://CRAN.R-project.org/package=qape) package allows
+to use the Empirical Best Predictor (EBP) (see the supplementary
+document for this paper) as well. It provides predicted values of any
+function of the variable of interest, as the PLUG-IN predictor. However,
+this requires stronger assumptions to be met. The EBP procedure
+available in [**qape**](https://CRAN.R-project.org/package=qape) package
+is prepared under the assumption of the normality of the variable of
+interest after any transformation. However, in the case of the
+considered model for logarithms of radon measurements, the assumption is
+not met as we mentioned before based on the results presented in the
+literature.
It can also be verified using `normCholTest` function +(available in [**qape**](https://CRAN.R-project.org/package=qape) +package) as follows: + +``` r +> normCholTest(radon.model, shapiro.test)$p.value +[1] 2.589407e-08 +``` + +Moreover, due to the fact of very time-consuming iterative procedure +used to compute the EBP for the general case, in the +[**qape**](https://CRAN.R-project.org/package=qape) package the function +`ebpLMMne` uses a very fast procedure working only for nested error +Linear Mixed Models (see [@molina2010small]). + +The prediction of any function of the random variables based on +cross-sectional data has been considered. Its special case, not +presented above but widely discussed in the econometric literature, is +the prediction of one random variable, in this case a radon measurement +for one non-observed owner-occupied home. Furthermore, the +[**qape**](https://CRAN.R-project.org/package=qape) package is also +designed for prediction based on longitudinal data for current or future +periods as shown in examples for the `EBLUP`, `plugInLMM` and `ebpLMMne` +functions in the `qape-manual` file, cf. [@qape]. + +## Bootstrap procedures + +The [**qape**](https://CRAN.R-project.org/package=qape) package provides +three main types of bootstrap algorithms: the parametric bootstrap, the +residual bootstrap and the double-bootstrap. + +The parametric bootstrap procedure is implemented according to +[@gonzales2007] and [@gonzales2008] and could be described in the +following steps: + +1. based on $n$ observations of the dependent and independent variables + ($\mathbf{Y}_s$, $\mathbf{X}_s$ and $\mathbf{Z}_s$) estimate + $\boldsymbol{\psi}$ to obtain the vector of estimates + $\boldsymbol{\hat{\psi}}$, + +2. 
generate $B$ realizations $y_{i}^{*(b)}$ of $Y_{i}$, under the + $LMM(\mathbf{X}, \mathbf{Z}, \hat{\boldsymbol{\psi}})$ and + multivariate normality of random effects and random components + obtaining\ + $\mathbf{y}^{*(b)}=\begin{bmatrix} + y_{1}^{*(b)} & ... & y_{i}^{*(b)} &... & y_{N}^{*(b)} + \end{bmatrix}^T$, where $i=1, 2, ... ,N$ and $b=1, 2, ... ,B$, + +3. decompose the vector $\mathbf{y}^{*(b)}$ as follows $\begin{bmatrix} + \mathbf{y}_s^{*(b)T} & \mathbf{y}_r^{*(b)T} + \end{bmatrix}^T$, + +4. in the $b$th iteration ($b=1,2,...,B$) + + 1. compute the bootstrap realization + $\theta^{*(b)}=\theta^{*(b)}(\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}})$ + of random variable $\theta$, + + 2. obtain the vector of estimates $\boldsymbol{\hat{\psi}}^{*(b)}$ + using $\mathbf{y}_s^{*(b)}$ and compute the bootstrap + realization of predictor $\hat{\theta}$ denoted by + $\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})$ + based on + $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\hat{\psi}}^{*(b)})$, + + 3. compute bootstrap realizations of prediction error $U^*$ denoted + by $u^*$ and for the $b$th iteration given by: + $$\label{u*b} + u^{*(b)}=\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})-\theta^{*(b)} + (\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}}) =\hat{\theta}^{*(b)}-\theta^{*(b)}, (\#eq:u*b)$$ + +5. compute the parametric bootstrap estimators of prediction accuracy + measures: RMSE and QAPE replacing prediction errors $U$ in + (\@ref(eq:eq0)) and (\@ref(eq:eq1)) by their bootstrap realizations. + +Another possible method to estimate the prediction accuracy measures is +the residual bootstrap. In what follows, we use the notation +$srswr(\mathbf{A}, m)$ to indicate the outcome of taking a simple random +sample with replacement of size $m$ of rows of matrix $\mathbf{A}$. If +$\mathbf{A}$ is a vector, it simplifies to a simple random sample with +replacement of size $m$ of elements of $\mathbf{A}$. 
+ +To obtain the algorithm of the residual bootstrap, it is enough to +replace step 2 of the parametric bootstrap procedure presented above +with the following procedure of the population data generation based on +(\@ref(eq:LMMa)): + +- generate $B$ population vectors of the variable of interest, denoted + by $\mathbf{y}^{*(b)}$ as + $$\label{LMMboot} + \mathbf{y}^{*(b)}=\mathbf{X}\hat{\boldsymbol{\beta}} + \mathbf{Z}_1\mathbf{v}^{*(b)}_1+...+\mathbf{Z}_l\mathbf{v}^{*(b)}_l+...+\mathbf{Z}_L\mathbf{v}^{*(b)}_L+\mathbf{e}^{*(b)}, (\#eq:LMMboot)$$ + where $\hat{\boldsymbol{\beta}}$ is an estimator (e.g. REML) of + ${\boldsymbol{\beta}}$, $\mathbf{e}^{*(b)}$ is a vector of dimension + $N \times 1$ defined as + $srswr(col_{1 \leq i \leq n } \hat{{e}}_{i}, N)$, where + $\hat{{e}}_{i}$ ($i=1,2,...,n$) are residuals, $\mathbf{v}^{*(b)}_l$ + (for $1,2,...,L$) is the vector of dimension $K_l J_l \times 1$ + built from the columns of the matrix: $srswr \left( + \left[ \begin{array}{ccccc} + \hat{\mathbf{v}}_{l1} & + \dots & + \hat{\mathbf{v}}_{lk} & + \dots & + \hat{\mathbf{v}}_{lK_l} + \end{array} + \right], J_l + \right)$ of dimension $J_l \times K_l$, where + $\hat{\mathbf{v}}_{lk}$ are estimates of elements of random effects + vector (\@ref(eq:vl)). + +The next 3--5 steps in this procedure are analogous to steps in the +parametric bootstrap procedure. + +In the above-described step, it can be seen that if more than one vector +of random effect is assumed at the $l$th level of grouping, then the +elements are not sampled with replacement independently. In this case, +rows of the matrix formed by these vectors are sampled with replacement. + +The residual bootstrap algorithm can also be performed with so-called +\"correction procedure\". This procedure, which can improve the +properties of the residual bootstrap estimators due to the +underdispersion of the uncorrected residual bootstrap distributions, is +presented in the supplementary document for this paper. 
+ +## Bootstrap in [**qape**](https://CRAN.R-project.org/package=qape) + +Two bootstrap procedures are implemented in separate functions: +`bootPar()` (the parametric bootstrap) and `bootRes()` (the residual +bootstrap). According to the general Procedure [1](#Proc1), the step +preceding the bootstrap procedure in both functions is the definition of +the predictor object. It must be one of the following: `EBLUP`, +`ebpLMMne` or `plugInLMM`. This object has to be passed to `bootPar()` +or `bootRes()` as the input parameter `predictor`. The other input +parameters are intuitive: `B` - the number of bootstrap iterations and +`p` - order of quantiles in the estimated QAPEs. + +The additional input parameter in `bootRes()` is a logical condition +called `correction`, which makes it possible to include an additional +correction term for both random effects and random components, presented +in the supplementary document for this paper, to avoid the problem of +underdispersion of residual bootstrap distributions. + +The main output values in both functions are basically the measures: +`estRMSE` and `estQAPE` computed based on (\@ref(eq:eq0)) and +(\@ref(eq:eq1)), respectively, where prediction errors are replaced by +their bootstrap realizations. There is also the output `error` being the +vector of bootstrap realizations of prediction errors, which is useful +e.g. in in-depth analysis of the prediction accuracy and for graphical +presentation of results. To estimate these accuracy measures, we use +below the residual bootstrap with the correction procedure. + +As previously stated, our package utilizes the `lmer()` function from +the [**lme4**](https://CRAN.R-project.org/package=lme4) package for +estimating model parameters. However, this function has been known to +generate convergence warnings in certain situations, listed for example +by [@lme4] p. 25, when the estimated variances of random effects are +close to zero. 
Such scenarios may occur when models are estimated for +smaller or medium-sized datasets, when complex variance-covariance +structures are assumed, or when the grouping variable considered for +random effects has only a few levels. Although we have not observed such +issues estimating model parameters based on the original dataset +required to compute values of the predictors in previous sections, +bootstrapping or Monte Carlo simulations are more complex cases. This is +because, based on the estimates of model parameters, the values of the +dependent variables are generated $B$ times, and then model parameters +are estimated in each out of $B$ iterations. Therefore, in at least some +iterations, dependent variable values may be randomly generated giving +realizations, where the variance of the random effect is relatively +close to zero. As a result, estimates of model parameters can be +obtained; however, convergence issues implying warnings may occur. In +such cases, there are at least two possible solutions. The first option +is to discard iterations with warnings, which would imply that the +dependent variable would not follow the assumed model as required, but +instead only its conditional version with relatively high values of +variances of random effects. It will imply overdispersed bootstrap +distribution of random effects, which will affect the bias of the +bootstrap estimators of accuracy measures. The second option is to +consider all generated realizations, despite convergence warnings, as +long as the parameters can be estimated for all iterations. We opted for +the latter solution, as argued in [@lme4] p. 25, who noted that \"being +able to fit a singular model is an advantage: when the best fitting +model lies on the boundary of a constrained space\". + +### Example 2 + +The analyses presented in Example 1 are continued. We extend the +previous results to include the issue of estimating the prediction +accuracy of the considered predictors. 
The use of functions for this +estimation primarily requires an object of class predictor, here +\"myplugin\". + +``` r +> class(myplugin) +[1] "plugInLMM" +``` + +The short chunk of the R code presents the residual bootstrap estimators +of the RMSE (`estRMSE`) and the QAPE (`estQAPE`) of the PLUG-IN +predictors (`plugin`) of previously analyzed three characteristics of +radon measurements in county no. 26: the arithmetic mean, geometric mean +and median. In this and subsequent examples we make the computations for +relatively high number of iterations allowing, in our opinion, to get +reliable results. These results are also used to prepare Figure +\@ref(fig:hist). However, the computations are time-consuming. The +supplementary R file contains the same chunks of the code but the number +of iterations applied is smaller in order to execute the code swiftly. + +``` r +> # accuracy measures estimates based on +> # the residual bootstrap with the correction: +> B <- 500 # number of bootstrap iterations +> p <- c(0.75, 0.9) # orders of Quantiles of Absolute Prediction Error +> set.seed(1056) +> residBoot <- bootRes(myplugin, B, p, correction = TRUE) +> # values of estimated RMSEs of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, respectively: +> residBoot$estRMSE +[1] 0.1848028 0.2003681 0.2824359 +> # values of estimated QAPEs +> # (of order 0.75 in the first row, and of order 0.9 in the second row) +> # of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, +> # in the 1st, 2nd and 3rd column, respectively: +> residBoot$estQAPE + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +``` + +Let us concentrate on interpretations of estimators of accuracy measures +for the predictor of the geometric mean, i.e. the second value of +`residBoot$estRMSE`, and values in the second column of +`residBoot$estQAPE`. 
It is estimated that the average difference between +predicted values of the geometric mean and their unknown realizations +equals $0.2003681$ picoCurie per liter. Furthermore, it is estimated +that at least $75\%$ of absolute prediction errors of the predictor of +the geometric mean are smaller or equal to $0.2135476$ picoCurie per +liter and at least $25\%$ of absolute prediction errors of the predictor +are higher or equal to $0.2135476$ picoCurie per liter. Finally, it is +estimated that at least $90\%$ of absolute prediction errors of the +predictor of the geometric mean are smaller or equal to $0.3397411$ +picoCurie per liter and at least $10\%$ of absolute prediction errors of +the predictor are higher or equal to $0.3397411$ picoCurie per liter. +The distributions of bootstrap absolute prediction errors with values of +estimated RMSEs and QAPEs for the considered three prediction problems +are presented in Figure \@ref(fig:hist). + +```{r hist, echo=FALSE , fig.cap="The histograms of bootstrap absolute prediction errors for myplugin (for PLUG-IN predictors of the arithmetic mean, geometric mean and median) for B=500", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("histAll.png")) +``` + +Since the assumption of normality is not met, the parametric bootstrap +should not be used in this case. For this reason, we do not present the +results for this method below, although -- but for illustrative purposes +only -- they are presented in the supplementary R file. Moreover, these +analyses can also be conducted using `bootParFuture()` and +`bootResFuture()` functions where parallel computing algorithms are +applied. The input arguments and the output of these functions are the +same as in `bootPar()` and `bootRes()`. Examples based on these +functions are also included in the supplementary R file. 
+
+## Bootstrap under the misspecified model in [**qape**](https://CRAN.R-project.org/package=qape)
+
+The [**qape**](https://CRAN.R-project.org/package=qape) package also
+allows to use predictors under a model different from the assumed one
+(e.g. a simpler or more robust model), but estimate its accuracy under
+the assumed model. In this case, the parametric and residual bootstrap
+procedures are implemented in `bootParMis()` and `bootResMis()`
+functions. These functions allow to estimate the accuracy of two
+predictors under the model correctly specified for the first of them. Of
+course, it is expected that the estimated accuracy of the first
+predictor will be better than of the second one, but the key issue can
+be the difference between estimates of accuracy measures. A small
+difference, even to the second predictor's disadvantage, may be treated
+by the user as an argument for using the second predictor due to its
+properties, such as robustness or simplicity.
+
+The considered functions allow to estimate the accuracy of two
+predictors, which belong to the class `plugInLMM`, under the model used
+to define the first of them. The remaining arguments are the same as in
+`bootPar()` and `bootRes()` functions: `B` - the number of bootstrap
+iterations, and `p` - orders of QAPE estimates to be taken into account.
+
+The output results of `bootParMis()` and `bootResMis()` include --
+similarly to `bootPar()` and `bootRes()` functions -- estimates of the
+RMSEs and QAPEs of both predictors (denoted here by: `estRMSElmm`,
+`estRMSElmmMis`, `estQAPElmm` and `estQAPElmmMis`), and bootstrap
+realizations of their prediction errors (`errorLMM` and `errorLMMmis`).
+
+### Example 3
+
+In this example, we study the same accuracy measures as in Example 2,
+but the aim is to compare the predictor `myplugin` and other predictor
+defined under the misspecified LMM. First, the misspecified model has to
+be defined, and a relevant predictor has to be computed.
+ +``` r +> fixed.part.mis <- '1' +> random.part.mis <- '(1|county)' +> myplugin.mis <- plugInLMM(YS, fixed.part.mis, random.part.mis, reg, con, ++ weights = NULL, backTrans = backTransExp, thetaFun) +``` + +Having two objects: `myplugin` and `myplugin.mis`, one can proceed to a +comparison by estimating bootstrap prediction accuracy performed using +the residual bootstrap with correction procedure. In this case, we +estimate the prediction accuracy of these two predictors under the model +used to define the first of them. + +``` r +> set.seed(1056) +> residBootMis <- bootResMis(myplugin, myplugin.mis, B, p, correction = TRUE) +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmm +[1] 0.1848028 0.2003681 0.2824359 +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmmMis +[1] 0.1919184 0.3192304 0.2762137 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmm + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmmMis + [,1] [,2] [,3] +75% 0.2267062 0.3802836 0.3255197 +90% 0.2813787 0.4970726 0.4489399 +``` + +The results, presented above, were obtained for the same number of +bootstrap iterations as in Example 2 ($B = 500$). 
If we compare, under +the model defined in `plugin`, estimated RMSEs of `plugin` and +`plugin.mis` predictors of the geometric mean given by $0.2003681$ and +$0.3192304$ picoCurie per liter, respectively, we can state that the +estimated accuracy (measured by RMSE estimators) of the first predictor +is better comparing with the second one. If we are not interested in the +average accuracy measures but in the right tail of the distribution of +prediction errors, we can use estimates of QAPE of order 0.9 to compare +the accuracy. The result for the `plugin.mis` of the geometric mean +equals to $0.4970726$ picoCurie per liter, and it is higher comparing +with $0.3397411$ picoCurie per liter obtained for `plugin` for the same +prediction problem. Hence, in this case, the accuracy comparison based +both on the RMSE and QAPE leads to the same finding. + +In the previous paragraph, we have focused on the results for the case +of prediction of the geometric mean. If the comparison is made for the +case of prediction of the arithmetic mean (the first column of output +results) or the median (the third column of output results), we will +come to the same conclusion regarding the estimated accuracy of `plugin` +and `plugin.mis` as in the case of prediction of the geometric mean. + +Similarly to the residual bootstrap, the parametric bootstrap procedure +`paramBootMis` available in +[**qape**](https://CRAN.R-project.org/package=qape) package can be +performed. However, in the considered case the normality assumption is +not met (as discussed above) and the procedure is not recommended. The +appropriate chunk of the R code is presented in the supplementary R +file, but it is solely intended for illustrative purposes. + +## Monte Carlo simulation analyses + +In the previous section, our aim was to estimate the prediction accuracy +under correctly specified or misspecified model. 
In this section, we do +not estimate the accuracy, but we approximate the true prediction +accuracy under the specified model in the Monte Carlo simulation study. +The crucial difference is that in this case, the model parameters used +are obtained based on the whole population dataset, not the sample. If +the number of iterations is large enough, we can treat the computed +values of the measures as their true values, which are unknown in +practice. + +The last step of the analysis in +[**qape**](https://CRAN.R-project.org/package=qape) package presented in +Procedure [1](#Proc1) is the Monte Carlo (MC) simulation analysis of: + +- properties of predictors + +- and properties of parametric, residual and double bootstrap + estimators of accuracy measures. + +The whole Monte Carlo procedure is as follows. + +::: {#Proc2 .procedure} +**Procedure 2**. *Model-based Monte Carlo simulation analyses in +[**qape**](https://CRAN.R-project.org/package=qape) * + +1. *define the population vector of the dependent variable and the + population matrix of auxiliary variables,* + +2. *provide the information on the division of the population into the + sampled and non-sampled part,* + +3. *define $\theta$ - the characteristics of the response variable to + be predicted,* + +4. *define the predictors $\hat{\theta}$ and accuracy measures + estimators which properties are to be assessed,* + +5. *define the model to be used to generate realizations of the values + of the dependent variable and estimate its parameters based on + population data,* + +6. *For k=1, 2, \..., K* + + 1. *generate the population vector of the response variable based + on the assumed model,* + + 2. *based on population data, compute the characteristics $\theta$, + denoted by $\theta_k$,* + + 3. *based on sample data, estimate the parameters of the LMM,* + + 4. *based on sample data, compute values of predictors + $\hat{\theta}$, denoted by $\hat{\theta}_k$,* + + 5. 
*based on sample data, estimate the accuracy of $\hat{\theta}$
+        using bootstrap methods,*
+
+7. *End For*
+
+8. *compute accuracy measures of predictors using $\hat{\theta}_k$ and
+    $\theta_k$ (for $k=1,2, ..., K$),*
+
+9. *compute accuracy measures of estimators of prediction accuracy
+    measures.*
+:::
+
+## Monte Carlo analyses in [**qape**](https://CRAN.R-project.org/package=qape)
+
+In order to perform a Monte Carlo (MC) analysis on the properties of
+predictors, it is necessary to have access to the entire population data
+for both dependent and independent variables. The function `mcLMMmis()`
+can be used with the following arguments. Firstly, the population values
+of the dependent variable (after a necessary transformation) should be
+declared as `Ypop`. By using the `Ypop` values, we can estimate the
+model parameters based on the entire population data (assuming that they
+are known). This allows us to generate values of the dependent variable
+in the simulation study that can mimic its distribution in the entire
+population, not just in the sample. This approach ensures that our
+simulation study can be an accurate representation of the random process
+in the entire population, resembling the real-world scenario. Secondly,
+three predictors: `predictorLMMmis`, `predictorLMM`, `predictorLMM2`,
+which belong to the class `plugInLMM`, are to be defined. The first one
+is used only to define the (possibly misspecified) model used to
+generate population values of the response variables. Accuracy of
+`predictorLMM` and `predictorLMM2` is assessed in the simulation study.
+The next two arguments include the number of MC iterations `K` and
+orders `p` of QAPEs used to assess the prediction accuracy. Finally, it
+should be noted that it is possible to modify covariance matrices of
+random components and random effects based on the model defined in
+`predictorLMMmis`, which are used to generate values of the
+dependent variable.
It is possible by declaring values of `ratioR` and +`ratioG` arguments, which the diagonal elements of covariance matrices +of random components and random effects, respectively, are divided by. + +The output of this function covers the following statistics of both +predictors computed in the simulation study: relative biases (`rBlmm` +and `rBlmm2`), relative RMSEs (`rRMSElmm` and `rRMSElmm2`) and QAPEs +(`QAPElmm` and `QAPElmm2`). Simulation-based prediction errors of both +predictors (`errorLMM` and `errorLMM2`) are also taken into account. + +### Example 4 + +In the example, an MC simulation is carried out assuming the `myplugin` +predictor. The goal is to approximate the true accuracy of the +prediction assuming model (\@ref(eq:radon-model)). Hence, in the package +[**qape**](https://CRAN.R-project.org/package=qape), all input predictor +objects in the function `mcLMMmis` have to be defined as `myplugin`.   + +``` r +> # input arguments: +predictorLMMmis <- myplugin # to define the model +predictorLMM <- myplugin # which properties are assessed in the simulation study +predictorLMM2 <- myplugin # which properties are assessed in the sim. study +``` + +Except that no modification of covariance matrices has to be used. + +``` r +# diag. elements of the covariance matrix of random components are divided by: +ratioR <- 1 +# diag. elements of the covariance matrix of random effects are divided by: +ratioG <- 1 +``` + +We specify the number of Monte Carlo iterations. + +``` r +K <- 500 # the number of MC iterations +``` + +The analysis is conducted in the object `MC`. + +``` r +> set.seed(1086) +> MC <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2, ++ K, p, ratioR, ratioG) +> # relative bias of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26 (in %): +> MC$rBlmm +[1] -1.73208393 -0.04053178 -5.22355236 +``` + +Results of the relative biases are obtained. 
It is seen, that under the +assumed model the values of the considered predictor of the geometric +mean (the second value of `MC$rBlmm`) are smaller than possible +realizations of the geometric mean on average by $0.04053178\%$. In +turn, the relative RMSEs are as follows. + +``` r +> # relative RMSE of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26 (in %): +> MC$rRMSElmm +[1] 3.429465 4.665810 7.146678 +``` + +In the considered case, the average difference between predicted values +of the geometric mean and its possible realizations (the second value of +`MC$rRMSElmm`) equals $4.665810\%$. It should be noted that this value +can be treated as the true value of the relative RMSE (if the number of +iterations is large enough), not the estimated value obtained in +Examples 2 and 3. + +Finally, QAPEs of orders 0.75 and 0.9 are considered. + +``` r +> # QAPE of order 0.75 and 0.9 of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26: +> MC$QAPElmm + [,1] [,2] [,3] +75% 0.1491262 0.1989504 0.2919221 +90% 0.2895684 0.2959457 0.4728064 +``` + +Let us interpret the results presented in the second column of +`MC$QAPElmm`. At least $75\%$ ($90\%$) of absolute prediction errors of +the predictor of the geometric mean are smaller or equal to $0.1989504$ +($0.2959457$) picoCurie per liter and at least $25\%$ ($10\%$) of +absolute prediction errors of the predictor are higher or equal to +$0.1989504$ ($0.2959457$) picoCurie per liter. Similar to the values of +the rRMSEs in the previous code chunk, the values can be considered to +be true QAPE values, not the estimates presented in Examples 2 and 3. + +In Example 4, the accuracy of one predictor under the model used to +define this predictor was presented. A more complex version of the +simulation study, where the properties of two predictors are studied +under the model defined by the third predictor, is presented in the +supplementary R file. 
What is more, the +[**qape**](https://CRAN.R-project.org/package=qape) package also allows +one to use the `mcBootMis()` function to conduct MC analyses of properties of +accuracy measure estimators (estimators of MSEs and QAPEs) of two +predictors (which belong to the class `plugInLMM`) declared as +arguments. The model used in the simulation study is declared in the +first predictor, but the properties of accuracy measure estimators of +both predictors are studied. Output results of `mcBootMis()` cover +simulation results on properties of different accuracy measure +estimators, including the relative biases and relative RMSEs of the +parametric bootstrap MSE estimators of both predictors. The same +simulation-based statistics but for parametric bootstrap QAPE estimators +are also included. Other bootstrap methods, including the residual +bootstrap with and without the correction procedure, are also taken into +account. The full list of output arguments of the `mcBootMis()` function is +presented in the `qape-manual` file, cf. [@qape]. + +## Conclusions + +The package enables R users to make predictions and assess the accuracy +under linear mixed models based on different methods in a fast and +intuitive manner -- not only based on the RMSE but also based on +Quantiles of Absolute Prediction Errors. It also covers functions which +allow users to conduct Monte Carlo simulation analyses of properties of the +methods of interest to users. Its main advantage, compared to other +packages, is the considerable flexibility in terms of defining the model +(as in the [**lme4**](https://CRAN.R-project.org/package=lme4) package) +and the predicted characteristic, but also the transformation of the +response variable. 
+ +In our opinion, the package is useful for scientists, practitioners and +decision-makers in all areas of research where accurate estimates and +forecasts for different types of data (including cross-sectional and +longitudinal data) and for different characteristics play the crucial +role. We believe that it will be of special interest to survey +statisticians interested in the prediction for subpopulations with small +or even zero sample sizes, called small areas. + +[alicja.wolny-dominiak@uekat.pl](alicja.wolny-dominiak@uekat.pl){.uri}\ +[web.ue.katowice.pl/woali/](web.ue.katowice.pl/woali/){.uri} + +[tomasz.zadlo@uekat.pl](tomasz.zadlo@uekat.pl){.uri}\ +[web.ue.katowice.pl/zadlo/](web.ue.katowice.pl/zadlo/){.uri} +::::: diff --git a/_articles/RJ-2024-004/RJwrapper.tex b/_articles/RJ-2024-004/RJwrapper.tex new file mode 100644 index 0000000000..d2640e1621 --- /dev/null +++ b/_articles/RJ-2024-004/RJwrapper.tex @@ -0,0 +1,31 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} +\usepackage{theorem} +\usepackage{enumitem} +\usepackage{xcolor} + +\theorembodyfont{\rm} +\newtheorem{procedure}{Procedure} + +%% load any required packages here + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{67} + +%% replace RJtemplate with your article +\begin{article} + \input{wolny-zadlo} +\end{article} + +\end{document} diff --git a/_articles/RJ-2024-004/RJwrapper_appendix.tex b/_articles/RJ-2024-004/RJwrapper_appendix.tex new file mode 100644 index 0000000000..2af6f97311 --- /dev/null +++ b/_articles/RJ-2024-004/RJwrapper_appendix.tex @@ -0,0 +1,30 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} 
+\usepackage{theorem} + +\theorembodyfont{\rm} +\newtheorem{procedure}{Procedure} + +\newcommand{\bbeta}{\boldsymbol{\beta}} +\newcommand{\ebbeta}{\hat{\bbeta}} +%% load any required packages here + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{XX} +\volnumber{YY} +\year{20ZZ} +\month{AAAA} + +%% replace RJtemplate with your article +\begin{article} + \input{appendix} +\end{article} + +\end{document} \ No newline at end of file diff --git a/_articles/RJ-2024-004/appendix.pdf b/_articles/RJ-2024-004/appendix.pdf new file mode 100644 index 0000000000..9b56dd9aeb Binary files /dev/null and b/_articles/RJ-2024-004/appendix.pdf differ diff --git a/_articles/RJ-2024-004/appendix.tex b/_articles/RJ-2024-004/appendix.tex new file mode 100644 index 0000000000..b6d75c62aa --- /dev/null +++ b/_articles/RJ-2024-004/appendix.tex @@ -0,0 +1,110 @@ +\title{Supplementary Document for \\ Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package} +\author{by Alicja Wolny--Dominiak and Tomasz \.{Z}\c{a}d{\l}o} + +\maketitle % Produces the title. + +%\bigskip +%\abstract{ + %An abstract of less than 150 words. + + + %Introductory section which may include references in parentheses + %\citep{R}, or cite a reference such as \citet{R} in the text. 
+ + %\section{Introduction} + %\begin{figure}[htbp] + % \centering + % \includegraphics{Rlogo} + % \caption{The logo of R.} + % \label{figure:rlogo} + %\end{figure} + + %\section{Introduction} + + + % Please keep the abstract below 300 words + + \abstract{ + The paper presents a new R package \CRANpkg{qape} for prediction, accuracy estimation of various predictors and Monte Carlo simulation studies of properties of both predictors and estimators of accuracy measures.} + +\section*{The Monte Carlo procedure to compute EBP} +The Monte Carlo procedure to compute EBP according to \cite{molina2010small} used in \code{ebpLMMne()} is presented. + +\begin{enumerate} + \item $\boldsymbol{\psi}$ is estimated based on sample data and estimator $\boldsymbol{\hat{\psi}}$ is obtained. + \item Using the distribution function of $\mathbf{Y}_r|\mathbf{Y}_s$, whose functional form is assumed to be known, and where $\boldsymbol{\psi}$ is replaced by $\boldsymbol{\hat{\psi}}$, $L$ vectors $\mathbf{Y}_r$ are generated of unobserved values of the dependent variable, denoted by $\mathbf{Y}_r^{(l)}$ (where $l=1,2,...,L$). + \item $L$ population vectors are built based on one subvector of the dependent variables observed in the sample and $L$ subvectors of unobserved values of the dependent variable generated in the previous step. The result is: $\mathbf{Y}^{(l)} = \left[ \mathbf{Y}_s^T \mathbf{Y}_r^{(l)T}\right]^T$, where $l=1,2,...,L$. + \item The EBP value is computed as follows: $\hat\theta_{EBP}={L^{- 1}}\sum\limits_{l = 1}^L \theta (\mathbf{Y}^{(l)})$. If the LMM is not assumed for the original variable of interest but for its transformation $T(.)$, the back-transformation is used additionally: $\hat\theta_{EBP}={L^{- 1}}\sum\limits_{l = 1}^L \theta (T^{-1}(\mathbf{Y}^{(l)}))$. 
+\end{enumerate} + +It is worth noting that if the distribution of $\mathbf{Y}$ is multivariate normal, then the distribution of $\mathbf{Y}_r|\mathbf{Y}_s$ (used in step (ii) above) is also multivariate normal, which means the generation process of $L$ population vectors in the algorithm presented above is very time-consuming in real-life surveys. Therefore, the EBP is considered under the special case of the LMM, which makes it possible to accelerate the algorithm by generating $\mathbf{Y}_r^{(l)}$, $l=1, 2, ...,L$, not from the multivariate normal distribution but using the univariate normal distribution. The model, called the nested error LMM, is given by: +\begin{equation} \label{neLMM} + \mathbf{Y}_k=\mathbf{X}_k\boldsymbol{\beta} + v_k \mathbf{1}_{N_k} +\mathbf{e}_k, +\end{equation} +where $k=1,2,...,K$ and $\mathbf{1}_{N_k}$ is a vector of ones of size $N_k \times 1$, $v_{k}$ is a random effect, such that $v_{k}$ are independent for $k=1, 2, ..., K$, $\mathbf{e}_{k}$ ($N_k \times 1$) is a vector of random components. Let us additionally assume that $v_k \sim N(0,\sigma^2_v)$ and $\mathbf{e}_{k} \sim N({\bf{0}},\sigma_e^2{{\bf{I}}_{N_k}})$. Let the number of elements of $\mathbf{Y}_k$ observed in the sample be denoted by $n_k$. Under (\ref{neLMM}), step (ii) of the above procedure is as follows (cf. \cite{molina2010small} p. 375): +\begin{itemize} + \item for $k$ where $n_k>0$ we generate $\mathbf{Y}_r^{(l)}=\begin{bmatrix} + \mathbf{Y}_{r1}^{(l)T} & ...& \mathbf{Y}_{rk}^{(l)T} & ... 
& \mathbf{Y}_{rK}^{(l)T} + \end{bmatrix}^T$, $l=1, 2, ...,L$, based on the following model: + \begin{equation}\label{codnorm_k3} + {{\bf{Y}}_{rk}} = {{\boldsymbol{\mu }}_{rk|sk}} + {u_k}{{\bf{1}}_{{N_k} - {n_k}}} + {{\boldsymbol{\varepsilon }}_{rk}}, + \end{equation} + where ${{\boldsymbol{\mu }}_{rk|sk}} = {{\bf{X}}_{rk}}{\ebbeta } + \hat{\sigma}_v^2{{\bf{1}}_{{N_k} - {n_k}}}{\bf{1}}_{{n_k}}^T{\left( {\hat{\sigma}_v^2{{\bf{1}}_{{n_k}}}{\bf{1}}_{{n_k}}^T + \hat{\sigma}_e^2{{\bf{I}}_{{n_k}}}} \right)^{ - 1}}({{\bf{Y}}_{sk}} - {{\bf{X}}_{sk}}{{\ebbeta }})$, ${u_k}$ and ${{\boldsymbol{\varepsilon }}_{rk}}$ are independent, ${{\boldsymbol{\varepsilon }}_{rk}}\sim N({\bf{0}},\hat{\sigma}_e^2{{\bf{I}}_{{N_k} - {n_k}}})$, ${u_k}\sim N(0,\hat{\sigma}_v^2(1-{\omega_k}))$, ${\omega_k} = \hat{\sigma}_v^2{(\hat{\sigma}_v^2 + \hat{\sigma}_e^2 n_k^{ - 1})^{ - 1}}$, + \item for $k$ where $n_k=0$ we generate $\mathbf{Y}_r^{(l)}$, $l=1, 2, ...,L$, based on (\ref{neLMM}), where unknown parameters are replaced with estimates. +\end{itemize} + + +\section*{The correction procedure in residual bootstrap} +This appendix presents the correction procedure according to \cite{carpenter2003novel}, \cite{chambers2013random} and \cite{thai2013comparison}, which can be used in the residual bootstrap to avoid the problem of underdispersion of the classic residual bootstrap distribution. + +Let us consider the $l$th vector of random effects in Equation (5) given by Equation (6), both presented in the paper. Let $\boldsymbol{G}_l$ be the variance-covariance matrix of size $K_l \times K_l$ defined as $\boldsymbol{G}_l=$ $Var\left(\left[ v_{l1j} \dots v_{lkj} \dots v_{lK_lj} \right]^T \right)$, where $v_{lkj}$ is the $j$th element of $\mathbf{v}_{lk}$. Let the estimated (e.g. 
using restricted maximum likelihood method) matrix $\boldsymbol{G}_l$ be denoted by $\hat{\boldsymbol{G}}_{l}$ and the empirical covariance matrix of size $K_l \times K_l$ be defined as follows ${\boldsymbol{G}}_{(emp)l}=J_l^{-1} +\left[ \begin{array}{c} + \hat{\mathbf{v}}_{l1}^T \\ + \dots \\ + \hat{\mathbf{v}}_{lk}^T \\ + \dots \\ + \hat{\mathbf{v}}_{lK_l}^T +\end{array} +\right] +\left[ \begin{array}{c} + \hat{\mathbf{v}}_{l1}^T \\ + \dots \\ + \hat{\mathbf{v}}_{lk}^T \\ + \dots \\ + \hat{\mathbf{v}}_{lK_l}^T +\end{array} +\right]^T +$, where $\hat{\mathbf{v}}_{lk}$ are the estimated best linear unbiased predictors of ${\mathbf{v}}_{lk}$.\\ + +Let us write the estimated and the empirical covariance matrices using the Cholesky decomposition, in terms of a lower triangular matrix, as $\hat{\boldsymbol{G}}_l = \mathbf{L}_{(est)l} \mathbf{L}^T_{(est)l} $ and $\boldsymbol{G}_{(emp)l} = \mathbf{L}_{(emp)l} \mathbf{L}_{(emp)l}^T$. Let $\mathbf{A}_l=(\mathbf{L}_{(est)l} \mathbf{L}_{(emp)l}^{-1})^T$. Let us define the corrected estimates of $\mathbf{v}_l$ as (\cite{carpenter2003novel}, \cite{thai2013comparison}): $\hat{\mathbf{v}}_{(cor)l} =\hat{\mathbf{v}_l} \mathbf{A}_l$, where $\hat{\mathbf{v}}$ is the empirical best linear unbiased predictor of $\mathbf{v}$. Let us additionally assume that $Var(\mathbf{e})=\mathbf{R}=\sigma^2_e diag_{1 \leq \ i \leq N} (d_i)$, where $d_i$ values are known weights. The corrected residuals are as follows (\cite{chambers2013random}): $\hat{{e}}_{(cor)i}=\hat{\sigma}_e \sqrt{d_i} \hat{e}_i (n^{-1}\sum_{k=1}^{n} \hat{e}_i )^{-0.5}$, where $i=1,2,...,n$, $\hat{\sigma}^2_e$ is the estimate (e.g. REML estimate) of ${\sigma}^2_e$, $\hat{e}_i$ are residuals computed under the model given by Equation (5) in the paper. + +Replacing $\hat{\mathbf{v}_l}$ and $\hat{{e}}_{i}$ by specified above $\hat{\mathbf{v}}_{(cor)l}$ and $\hat{{e}}_{(cor)i}$, respectively, the corrected version of the residual bootstrap procedure is obtained. 
+ +\bibliography{wolny-zadlo} + +\address{Alicja Wolny--Dominiak\\ + Department of Statistical and Mathematical Methods in Economics \\ + University of Economics in Katowice\\ +50, 1 Maja Street\\ + 40--287 Katowice\\ + Poland\\} +\email{alicja.wolny-dominiak@uekat.pl} \\ +\url{web.ue.katowice.pl/woali/} + +\address{Tomasz \.{Z}\c{a}d{\l}o\\ +Department of Statistics, Econometrics and Mathematics \\ +University of Economics in Katowice\\ +50, 1 Maja Street\\ +40--287 Katowice\\ +Poland\\} +\email{tomasz.zadlo@uekat.pl} \\ +\url{web.ue.katowice.pl/zadlo/} + +%\email{} \\ + +\end{article} + +\end{document} + + + diff --git a/_articles/RJ-2024-004/boxAll.png b/_articles/RJ-2024-004/boxAll.png new file mode 100644 index 0000000000..eada34ddd4 Binary files /dev/null and b/_articles/RJ-2024-004/boxAll.png differ diff --git a/_articles/RJ-2024-004/histAll.png b/_articles/RJ-2024-004/histAll.png new file mode 100644 index 0000000000..3150ea3684 Binary files /dev/null and b/_articles/RJ-2024-004/histAll.png differ diff --git a/_articles/RJ-2024-004/mapa1.png b/_articles/RJ-2024-004/mapa1.png new file mode 100644 index 0000000000..6c7d52d61d Binary files /dev/null and b/_articles/RJ-2024-004/mapa1.png differ diff --git a/_articles/RJ-2024-004/mapa2.png b/_articles/RJ-2024-004/mapa2.png new file mode 100644 index 0000000000..c5d6b2bb05 Binary files /dev/null and b/_articles/RJ-2024-004/mapa2.png differ diff --git a/_articles/RJ-2024-004/mapaAll.png b/_articles/RJ-2024-004/mapaAll.png new file mode 100644 index 0000000000..2c2e9fc32e Binary files /dev/null and b/_articles/RJ-2024-004/mapaAll.png differ diff --git a/_articles/RJ-2024-004/wolny-zadlo.R b/_articles/RJ-2024-004/wolny-zadlo.R new file mode 100644 index 0000000000..4bdc612f1a --- /dev/null +++ b/_articles/RJ-2024-004/wolny-zadlo.R @@ -0,0 +1,289 @@ +### Prediction, Bootstrapping and Monte Carlo Analyses +### Based on Linear Mixed Models with QAPE 2.0 Package +### Alicja Wolny-Dominiak, Tomasz Zadlo + + +# PACKAGES 
AND DATASET -------------------------------------------------------- + + +install.packages("qape") +install.packages("dplyr") +install.packages("lme4") +install.packages("HLMdiag") +install.packages("psych") + +library(qape) +library(dplyr) +library(lme4) +library(HLMdiag) # radon data +library(psych) # geometric mean + + +# RADON DATA AND THE MODEL ---------------------------------------------------- + + +# We use the following model for radon data +# with two correlated random effects: + +Ypop <- radon$log.radon +radon.model <- + lmer(log.radon ~ basement + uranium + (basement | + county), data = radon) + +# the lack of normality of random effects and random components: +normCholTest(radon.model, shapiro.test) + + +# EXAMPLE 1 ------------------------------------------------------------------- + + +## Input arguments ------------------------------------------------------------ + +# It is assumed that observations from county 26 +# from the first floor are unavailable. +con <- rep(1, nrow(radon)) +con[radon$county == 26 & radon$basement == 1] <- 0 + +fixed.part <- 'basement + uranium' +random.part <- '(basement|county)' +reg <- select(radon, -log.radon) # population matrix of auxiliary variables +p <- c(0.75, 0.9) # orders of Quantiles of Absolute Prediction Error +estMSE <- TRUE # to compute the naive MSE estimator of the EBLUP + +# for prediction of the mean in county 26 using the EBLUP +gamma <- + (1 / sum((radon$county == 26))) * ifelse((radon$county == 26), 1, 0) + +# for prediction of the arithmetic mean, geometric mean and median +# in county 26 using the plug-in predictor +thetaFun <- function(x) { + c(mean(x[radon$county == 26]), psych::geometric.mean(x[radon$county == 26]), + median(x[radon$county == 26])) + } + +backTransExp <- function(x) exp(x) + +# observations of the variable of interest assumed to be available +YS <- Ypop[con == 1] + + +## Predictors ----------------------------------------------------------------- + + +### EBLUP predictor 
----------------------------------------------------------- + +myeblup <- EBLUP(YS, fixed.part, random.part, reg, con, gamma, + weights = NULL, estMSE) +class(myeblup) +# the value of the predictor of the arithmetic mean of +# logarithms of radon measurements: +myeblup +myeblup$thetaP +myeblup$neMSE # the value of the naive MSE estimator of the EBLUP +print(myeblup) +summary(myeblup) + +### PLUG-IN predictor --------------------------------------------------------- + +myplugin <- plugInLMM(YS, fixed.part, random.part, reg, con, weights = NULL, + backTrans = backTransExp, thetaFun) +class(myplugin) +# values of the predictor of the arithmetic mean, geometric mean +# and median of radon measurements: +myplugin +myplugin$thetaP +print(myplugin) +summary(myplugin) + + +# EXAMPLE 2 ------------------------------------------------------------------- + + +B <- 5 # number of bootstrap iterations + +## Residual bootstrap --------------------------------------------------------- + +# accuracy measures estimates based on the residual bootstrap +set.seed(1056) +residBoot <- bootRes(myplugin, B, p, correction = TRUE) +# values of estimated RMSEs of the predictor of three characteristics: +# the arithmetic mean, geometric mean and median of radon measurements: +residBoot$estRMSE +# values of estimated QAPEs (of order 0.75 in the first row, +# and of order 0.9 in the second row) of the predictor of three characteristics: +# the arithmetic mean, geometric mean and median of radon measurements: +residBoot$estQAPE + + +# EXAMPLE 2 (ADDITIONAL RESULTS) ---------------------------------------------- + + +## Parametric bootstrap ------------------------------------------------------- + +# parametric bootstrap accuracy measures estimates: +# (We present the code for illustrative purposes only. The parametric bootstrap +# should not be used because the normality assumption is not met.) 
+set.seed(1056) +paramBoot <- bootPar(myplugin, B, p) +paramBoot$estRMSE +paramBoot$estQAPE + +## Parallel computing --------------------------------------------------------- + +### Residual bootstrap -------------------------------------------------------- + +# accuracy measures estimates based on +# the residual bootstrap with the correction procedure +# (parallel computing is applied): +set.seed(1056) +residBootFuture <- bootResFuture(myplugin, B, p, correction = TRUE) +residBootFuture$estRMSE +residBootFuture$estQAPE + +### Parametric bootstrap ------------------------------------------------------ + +# parametric bootstrap accuracy measures estimates +# (parallel computing is applied): +# (We present the code for illustrative purposes only. The parametric bootstrap +# should not be used because the normality assumption is not met.) +set.seed(1056) +paramBootFuture <- bootParFuture(myplugin, B, p) +paramBootFuture$estRMSE +paramBootFuture$estQAPE + + +# EXAMPLE 3 ------------------------------------------------------------------- + + +## Input arguments ------------------------------------------------------------ + +# Let us define a misspecified predictor: +fixed.part.mis <- '1' +random.part.mis <- '(1|county)' +myplugin.mis <- plugInLMM(YS, fixed.part.mis, random.part.mis, reg, con, + weights = NULL, backTrans = backTransExp, thetaFun) + +## Residual bootstrap --------------------------------------------------------- + +set.seed(1056) +residBootMis <- bootResMis(myplugin, myplugin.mis, B, p, correction = TRUE) +# residual bootstrap with the correction RMSE estimators +# of 'plugin' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +residBootMis$estRMSElmm +# residual bootstrap with the correction RMSE estimators +# of 'plugin.mis' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +residBootMis$estRMSElmmMis +# residual bootstrap with the correction QAPE estimators of order 0.75 
and 0.9 +# of 'plugin' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +residBootMis$estQAPElmm +# residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +# of 'plugin.mis' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +residBootMis$estQAPElmmMis + + +# EXAMPLE 3 (ADDITIONAL RESULTS) ---------------------------------------------- + + +## Parametric bootstrap ------------------------------------------------------- + +# (We present the code for illustrative purposes only. The parametric bootstrap +# should not be used because the normality assumption is not met.) +set.seed(1056) +paramBootMis <- bootParMis(myplugin, myplugin.mis, B, p) +# parametric bootstrap RMSE estimators of 'plugin' +# of 'plugin' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +paramBootMis$estRMSElmm +# parametric bootstrap RMSE estimators of 'plugin.mis' +# of 'plugin.mis' of: the arithmetic mean, geometric mean and median +# of radon measurements in county 26: +paramBootMis$estRMSElmmMis +# parametric bootstrap QAPE estimators of order 0.75 and 0.9 of 'plugin' of +# the arithmetic mean, geometric mean and median of +# radon measurements in county 26: +paramBootMis$estQAPElmm +# parametric bootstrap QAPE estimators of order 0.75 and 0.9 of 'plugin.mis' of +# the arithmetic mean, geometric mean and median of +# radon measurements in county 26: +paramBootMis$estQAPElmmMis + + +# EXAMPLE 4 ------------------------------------------------------------------- + + +## Input arguments ------------------------------------------------------------ + +predictorLMMmis <- myplugin # to define the model +predictorLMM <- myplugin # which properties are assessed in the simulation study +predictorLMM2 <- myplugin # which properties are assessed in the sim. study +K <- 5 # the number of MC iterations +# diag. 
elements of the covariance matrix of random components are divided by: +ratioR <- 1 +# diag. elements of the covariance matrix of random effects are divided by: +ratioG <- 1 + +## Monte Carlo analysis ------------------------------------------------------- + +set.seed(1086) +MC <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2, + K, p, ratioR, ratioG) + +# relative bias of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC$rBlmm +# relative RMSE of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC$rRMSElmm +# QAPE of order 0.75 and 0.9 of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26: +MC$QAPElmm + + +# EXAMPLE 4 (ADDITIONAL RESULTS) ---------------------------------------------- + + +## Input arguments ------------------------------------------------------------ + +# Let us define another predictor: +fixed.part.mis2 <- 'uranium' +random.part.mis2 <- '(1|county)' +myplugin.mis2 <- plugInLMM(YS, fixed.part.mis2, random.part.mis2, reg, con, + weights = NULL, backTrans = backTransExp, thetaFun) + +predictorLMMmis <- myplugin # to define the model +predictorLMM <- myplugin.mis # which properties are assessed in the MC study +predictorLMM2 <- myplugin.mis2 # which properties are assessed in the MC study +K <- 5 # the number of MC iterations +# diag. elements of the covariance matrix of random components are divided by: +ratioR <- 1 +# diag. 
elements of the covariance matrix of random effects are divided by: +ratioG <- 1 + +## Monte Carlo analysis ------------------------------------------------------- + +set.seed(1086) +MC2 <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2, + K, p, ratioR, ratioG) + +# relative bias of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC2$rBlmm +# relative bias of 'predictorLMM2' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC2$rBlmm2 +# relative RMSE of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC2$rRMSElmm +# relative RMSE of 'predictorLMM2' +# of the arithmetic mean, geometric mean and median in county 26 (in %): +MC2$rRMSElmm2 +# QAPE of order 0.75 and 0.9 of 'predictorLMM' +# of the arithmetic mean, geometric mean and median in county 26: +MC2$QAPElmm +# QAPE of order 0.75 and 0.9 of 'predictorLMM2' +# of the arithmetic mean, geometric mean and median in county 26: +MC2$QAPElmm2 \ No newline at end of file diff --git a/_articles/RJ-2024-004/wolny-zadlo.bib b/_articles/RJ-2024-004/wolny-zadlo.bib new file mode 100644 index 0000000000..d2b723014e --- /dev/null +++ b/_articles/RJ-2024-004/wolny-zadlo.bib @@ -0,0 +1,623 @@ +@article{boubeta2016empirical, + title={Empirical best prediction under area-level Poisson mixed models}, + author={Boubeta, Miguel and Lombard{\'\i}a, Mar{\'\i}a Jos{\'e} and Morales, Domingo}, + journal={Test}, + volume={25}, + number={3}, + pages={548--569}, + year={2016}, + publisher={Springer} +} + +@article{chambers2013random, + title={A random effect block bootstrap for clustered data}, + author={Chambers, Raymond and Chandra, Hukum}, + journal={Journal of Computational and Graphical Statistics}, + volume={22}, + number={2}, + pages={452--470}, + year={2013}, + publisher={Taylor \& Francis} +} + +@article{christiaensen2012, + title={Small area estimation-based prediction methods to track poverty: validation and 
applications}, + author={Christiaensen, Luc and Lanjouw, Peter and Luoto, Jill and Stifel, David}, + journal={The Journal of Economic Inequality}, + volume={10}, + number={2}, + pages={267--297}, + year={2012}, + publisher={Springer} +} + +@article{carpenter2003novel, + title={A novel bootstrap procedure for assessing the relationship between class size and achievement}, + author={Carpenter, James R and Goldstein, Harvey and Rasbash, Jon}, + journal={Journal of the Royal Statistical Society: Series C (Applied Statistics)}, + volume={52}, + number={4}, + pages={431--443}, + year={2003}, + publisher={Wiley Online Library} +} + +@article{chwila2019properties, + title={On properties of empirical best predictors}, + author={Chwila, Adam and {\.Z}{\k{a}}d{\l}o, Tomasz}, + journal={Communications in Statistics-Simulation and Computation}, + pages={1--34}, + year={2019}, + publisher={Taylor \& Francis} +} + +@article{davidson2007improving, + title={Improving the reliability of bootstrap tests with the fast double bootstrap}, + author={Davidson, Russell and MacKinnon, James G}, + journal={Computational Statistics \& Data Analysis}, + volume={51}, + number={7}, + pages={3259--3281}, + year={2007}, + publisher={Elsevier} +} + +@inproceedings{erciulescu2014parametric, + title={Parametric bootstrap procedures for small area prediction variance}, + author={Erciulescu, Andreea L and Fuller, Wayne A}, + booktitle={Proceedings of the Survey Research Methods Section}, + year={2014}, + organization={American Statistical Association Washington, DC} +} + + +@article{gonzales2007, + title={Estimation of the mean squared error of predictors of small area linear parameters under a logistic mixed model}, + author={Gonz{\'a}lez-Manteiga, Wenceslao and Lombard\'{\i}a, Marí{\'a} Jos{\'e} and Molina, Isabel and Morales, Domingo and Santamar\'{\i}a, Laureano}, + journal={Computational Statistics \& Data Analysis}, + volume={51}, + pages={2720--2733}, + year={2007}, + publisher={Elsevier} +} + 
+@article{gonzales2008, + title={Bootstrap mean squared error of small-area EBLUP}, + author={Gonz{\'a}lez-Manteiga, Wenceslao and Lombard\'{\i}a, Marí{\'a} Jos{\'e} and Molina, Isabel and Morales, Domingo and Santamar\'{\i}a, Laureano}, + journal={Journal of Statistical Computation and Simulation}, + volume={78}, + pages={443--462}, + year={2008} +} + +@article{henderson1950estimation, + title={Estimation of genetic parameters}, + author={Henderson, Charles R}, + journal={Biometrics}, + volume={6}, + number={2}, + pages={186--187}, + year={1950} +} + +@article{hall2006parametric, + title={On parametric bootstrap methods for small area prediction}, + author={Hall, Peter and Maiti, Tapabrata}, + journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume={68}, + number={2}, + pages={221--238}, + year={2006}, + publisher={Wiley Online Library} +} + +@article{hobza2016empirical, + title={Empirical best prediction under unit-level logit mixed models}, + author={Hobza, Tom{\'a}{\v{s}} and Morales, Domingo}, + journal={Journal of official statistics}, + volume={32}, + number={3}, + pages={661--692}, + year={2016}, + publisher={Sciendo} +} +@article{jiang1996reml, + title={REML estimation: asymptotic behavior and related topics}, + author={Jiang, Jiming}, + journal={The Annals of Statistics}, + volume={24}, + number={1}, + pages={255--286}, + year={1996}, + publisher={Institute of Mathematical Statistics} +} + +@article{kackar1981unbiasedness, + title={Unbiasedness of two-stage estimation and prediction procedures for mixed linear models}, + author={Kackar, Raghu N and Harville, David A}, + journal={Communications in statistics-theory and methods}, + volume={10}, + number={13}, + pages={1249--1261}, + year={1981}, + publisher={Taylor \& Francis} +} + +@article{kreutzmann2019r, + title={The R package emdi for estimating and mapping regionally disaggregated indicators}, + author={Kreutzmann, Ann-Kristin and Pannier, S{\"o}ren and 
Rojas-Perilla, Natalia and Schmid, Timo and Templ, Matthias and Tzavidis, Nikos}, + journal={Journal of Statistical Software}, + volume={91}, + year={2019} +} + +@Article{KS2016converg, + author = {Waldemar W. Koczkodaj and Jacek Szybowski}, + title = {The Limit of Inconsistency Reduction in Pairwise Comparisons}, + journal = {Applied Mathematics and Computer Science}, + year = {2016}, + volume = {26}, + number = {3}, + pages = {721--729}, + bibsource = {dblp computer science bibliography, http://dblp.org}, + biburl = {http://dblp.dagstuhl.de/rec/bib/journals/amcs/KoczkodajS16}, + doi = {10.1515/amcs-2016-0050}, +} + +@article{molina2010small, + title={Small area estimation of poverty indicators}, + author={Molina, Isabel and Rao, JNK2010}, + journal={Canadian Journal of Statistics}, + volume={38}, + number={3}, + pages={369--385}, + year={2010}, + publisher={Wiley Online Library} +} + + +} +@article{pfeffermann2013new, + title={New important developments in small area estimation}, + author={Pfeffermann, Danny and others}, + journal={Statistical Science}, + volume={28}, + number={1}, + pages={40--68}, + year={2013}, + publisher={Institute of Mathematical Statistics} +} + + + +@article{rao1994small, + title={Small-area estimation by combining time-series and cross-sectional data}, + author={Rao, Jon NK and Yu, Mingyu}, + journal={Canadian Journal of Statistics}, + volume={22}, + number={4}, + pages={511--528}, + year={1994}, + publisher={Wiley Online Library} +} + +@article{royall1976linear, + title={The linear least-squares prediction approach to two-stage sampling}, + author={Royall, Richard M}, + journal={Journal of the American Statistical Association}, + volume={71}, + number={355}, + pages={657--664}, + year={1976}, + publisher={Taylor \& Francis Group} +} + +@article{thai2013comparison, + title={A comparison of bootstrap approaches for estimating uncertainty of parameters in linear mixed-effects models}, + author={Thai, Hoai-Thu and Mentr{\'e}, France and 
Holford, Nicholas HG and Veyrat-Follet, Christine and Comets, Emmanuelle}, + journal={Pharmaceutical statistics}, + volume={12}, + number={3}, + pages={129--140}, + year={2013}, + publisher={Wiley Online Library} +} + +@article{zadlo2020bootstrap, + title={On bootstrap estimators of some prediction accuracy measures of loss reserves in a non-life insurance company}, + author={Wolny-Dominiak, Alicja and {\.Z}{\k{a}}d{\l}o, Tomasz}, + journal={Communications in Statistics-Simulation and Computation}, + pages={1--16}, + year={2020}, + publisher={Taylor \& Francis} +} + +@inproceedings{zadlo2013parametric, + title={On parametric bootstrap and alternatives of MSE}, + author={{\.Z}{\k{a}}d{\l}o, T}, + booktitle={Proceedings of 31st International Conference Mathematical Methods in Economics}, + pages={1081--1086}, + year={2013} +} + +@article{zadlo2017EBLUP, + title={On prediction of population and subpopulation characteristics for future periods}, + author={{\.Z}{\k{a}}d{\l}o, Tomasz}, + journal={Communications in Statistics-Simulation and Computation}, + volume={461}, + number={10}, + pages={8086-8104}, + year={2017}, + publisher={Taylor \& Francis} +} + +@article{frees1999, + title={A longitudinal data analysis interpretation of credibility models}, + author={Frees, Edward W and Young, Virginia R and Luo, Yu}, + journal={Insurance: Mathematics and Economics}, + volume={24}, + number={3}, + pages={229--247}, + year={1999}, + publisher={Elsevier} +} + +@book{buhlmann2005, + title={A course in credibility theory and its applications}, + author={B{\"u}hlmann, Hans and Gisler, Alois}, + year={2005}, + publisher={Springer} +} + +@Manual{qape, + title = {qape: Quantile of Absolute Prediction Errors}, + author = {Alicja Wolny-Dominiak and Tomasz {\.Z}{\k{a}}d{\l}o}, + year = {2023}, + note = {R package version 2.0}, + url = {https://CRAN.R-project.org/package=qape} + } + +@article{fay1979estimates, + title={Estimates of income for small places: an application of James-Stein 
procedures to census data}, + author={Fay III, Robert E and Herriot, Roger A}, + journal={Journal of the American Statistical Association}, + volume={74}, + number={366a}, + pages={269--277}, + year={1979}, + publisher={Taylor \& Francis} +} + + @Manual{josae, + title = {JoSAE: Unit-Level and Area-Level Small Area Estimation}, + author = {Johannes Breidenbach}, + year = {2018}, + note = {R package version 0.3.0}, + url = {https://CRAN.R-project.org/package=JoSAE} + } + +@article{gonzalez1978small, + title={Small-area estimation with application to unemployment and housing estimates}, + author={Gonzalez, Maria Elena and Hoza, Christine}, + journal={Journal of the American Statistical Association}, + volume={73}, + number={361}, + pages={7--15}, + year={1978}, + publisher={Taylor \& Francis} +} + +%%%WK +@article{SAE1988, + title={An error-components model for prediction of county crop areas using survey and satellite data}, + author={Battese, George E and Harter, Rachel M and Fuller, Wayne A}, + journal={Journal of the American Statistical Association}, + volume={83}, + number={401}, + pages={28--36}, + year={1988}, + publisher={Taylor \& Francis} +} + +@article{1M, + title={1,000,000 cases of COVID-19 outside of China: The date predicted by a simple heuristic}, + author={Koczkodaj, WW and Mansournia, MA and Pedrycz, W and Wolny-Dominiak, A and Zabrodskii, PF and Strza{\l}ka, D and Armstrong, T and Zolfaghari, AH and D{\k{e}}bski, Maciej and Mazurek, J}, + journal={Global Epidemiology}, + volume={2}, + pages={100023}, + year={2020}, + publisher={Elsevier} +} + +@book{rao2015small, + title={Small area estimation}, + author={Rao, John NK and Molina, Isabel}, + year={2015}, + publisher={John Wiley \& Sons} +} + +@article{KBJ2018, + title={Using machine learning and small area estimation to predict building-level municipal solid waste generation in cities}, + author={Kontokosta, Constantine E and Hong, Boyeong and Johnson, Nicholas E and Starobin, Daniel}, + 
journal={Computers, Environment and Urban Systems}, + volume={70}, + pages={151--162}, + year={2018}, + publisher={Elsevier} +} + +@article{donohue2011conditional, + title={Conditional Akaike information under generalized linear and proportional hazards mixed models}, + author={Donohue, MC and Overholser, R and Xu, R and Vaida, F}, + journal={Biometrika}, + volume={98}, + number={3}, + pages={685--700}, + year={2011}, + publisher={Oxford University Press} +} + + +@article{SAS2020, + title={Mapping the geodemographics of digital inequality in Great Britain: An integration of machine learning into small area estimation}, + author={Singleton, Alex and Alexiou, Alexandros and Savani, Rahul}, + journal={Computers, Environment and Urban Systems}, + volume={82}, + pages={101486}, + year={2020}, + publisher={Elsevier} +} +%https://www.sciencedirect.com/science/article/pii/S0198971519307963 + +@article{BGNVG2021, + title={Small Area Estimation in Health Sector by Support Vector Machine (SVM)}, + author={Basavarajaiah, DM and CA, Gopal Krishna Mithra and Narasimhamurthy, B and Veeregowda, BM and Gouri, Mahadevappa D}, + journal={Annals of the Romanian Society for Cell Biology}, + pages={4459--4474}, + year={2021} +} +%https://www.annalsofrscb.ro/index.php/journal/article/view/1467 + + +%% R packages +@Article{lme4, + title = {Fitting Linear Mixed-Effects Models Using {lme4}}, + author = {Douglas Bates and Martin M{\"a}chler and Ben Bolker and Steve Walker}, + journal = {Journal of Statistical Software}, + year = {2015}, + volume = {67}, + number = {1}, + pages = {1--48}, + doi = {10.18637/jss.v067.i01} + } + + @Article{emdi, + title = {The {R} Package {emdi} for Estimating and Mapping Regionally Disaggregated Indicators}, + author = {Ann-Kristin Kreutzmann and S\"oren Pannier and Natalia Rojas-Perilla and Timo Schmid and Matthias Templ and Nikos Tzavidis}, + journal = {Journal of Statistical Software}, + year = {2019}, + volume = {91}, + number = {7}, + pages = {1--33}, + 
doi = {10.18637/jss.v091.i07}, + } + + + +@Manual{msae, + title = {msae: Multivariate Fay Herriot Models for Small Area Estimation}, + author = {Novia Permatasari and Azka Ubaidillah}, + year = {2021}, + note = {R package version 0.1.4}, + url = {https://CRAN.R-project.org/package=msae}, + } + +@Article{sae, + author = {Isabel Molina and Yolanda Marhuenda}, + title = {{sae}: An {R} Package for Small Area Estimation}, + journal = {The R Journal}, + year = {2015}, + volume = {7}, + number = {1}, + pages = {81--98}, + month = {jun}, + url = {https://journal.r-project.org/archive/2015/RJ-2015-007/RJ-2015-007.pdf}, + } + + @Manual{saery, + title = {saery: Small Area Estimation for Rao and Yu Model}, + author = {Maria Dolores Esteban Lefler and Domingo Morales Gonzalez and Agustin Perez Martin}, + year = {2014}, + note = {R package version 1.0}, + url = {https://CRAN.R-project.org/package=saery}, + } + + %AI+R + +@inproceedings{iilasso, + title={Independently interpretable lasso: A new regularizer for sparse regression with uncorrelated variables}, + author={Takada, Masaaki and Suzuki, Taiji and Fujisawa, Hironori}, + booktitle={International Conference on Artificial Intelligence and Statistics}, + pages={454--463}, + year={2018}, + organization={PMLR} +} + %https://cran.r-project.org/web/packages/iilasso/index.html + +@inproceedings{SCCI, + title={Testing conditional independence on discrete data using stochastic complexity}, + author={Marx, Alexander and Vreeken, Jilles}, + booktitle={The 22nd International Conference on Artificial Intelligence and Statistics}, + pages={496--505}, + year={2019}, + organization={PMLR} +} + %https://cran.r-project.org/web/packages/SCCI/SCCI.pdf + +@inproceedings{graphkernels, + title={Efficient graphlet kernels for large graph comparison}, + author={Shervashidze, Nino and Vishwanathan, SVN and Petri, Tobias and Mehlhorn, Kurt and Borgwardt, Karsten}, + booktitle={Artificial intelligence and Statistics}, + pages={488--495}, + year={2009}, 
+ organization={PMLR} +} +%https://cran.r-project.org/web/packages/graphkernels/graphkernels.pdf + +@inproceedings{GFA, + title={Bayesian group factor analysis}, + author={Virtanen, Seppo and Klami, Arto and Khan, Suleiman and Kaski, Samuel}, + booktitle={Artificial Intelligence and Statistics}, + pages={1269--1277}, + year={2012}, + organization={PMLR} +} +%https://cran.r-project.org/web/packages/GFA/ + + + + +%https://cran.r-project.org/web/packages/plsdof/plsdof.pdf + + +%%%AWD do danych radon +@article{nero1994statistically, + title={Statistically based methodologies for mapping of radon'actual'concentrations: the case of Minnesota}, + author={Nero, AV and Leiden, SM and Nolan, DA and Price, PN and Rein, S and Revzan, KL and Woolenberg, HR and Gadgil, AJ}, + journal={Radiation Protection Dosimetry}, + volume={56}, + number={1-4}, + pages={215--219}, + year={1994}, + publisher={Oxford University Press} +} + +@article{price1996bayesian, + title={Bayesian prediction of mean indoor radon concentrations for Minnesota counties}, + author={Price, Phillip N and Nero, Anthony V and Gelman, Andrew}, + journal={Health Physics}, + volume={71}, + number={6}, + pages={922--936}, + year={1996}, + publisher={LWW} +} + +@article{gelman2006bayesian, + title={Bayesian measures of explained variance and pooling in multilevel (hierarchical) models}, + author={Gelman, Andrew and Pardoe, Iain}, + journal={Technometrics}, + volume={48}, + number={2}, + pages={241--251}, + year={2006}, + publisher={Taylor \& Francis} +} + +@phdthesis{loy2013diagnostics, + title={Diagnostics for mixed/hierarchical linear models}, + author={Loy, Adam}, +school = "Iowa State University", + year={2013} +} + +@article{loy2015you, + title={Are you normal? 
the problem of confounded residual structures in hierarchical linear models}, + author={Loy, Adam and Hofmann, Heike}, + journal={Journal of Computational and Graphical Statistics}, + volume={24}, + number={4}, + pages={1191--1209}, + year={2015}, + publisher={Taylor \& Francis} +} + +@article{loy2017model, + title={Model choice and diagnostics for linear mixed-effects models using statistics on street corners}, + author={Loy, Adam and Hofmann, Heike and Cook, Dianne}, + journal={Journal of Computational and Graphical Statistics}, + volume={26}, + number={3}, + pages={478--492}, + year={2017}, + publisher={Taylor \& Francis} +} + +@article{cantoni2021review, + title={Review and comparison of measures of explained variation and model selection in linear mixed-effects models}, + author={Cantoni, Eva and Jacot, Nad{\`e}ge and Ghisletta, Paolo}, + journal={Econometrics and Statistics}, + year={2021}, + publisher={Elsevier} +} + +@article{gelman1999analysis, + title={Analysis of local decisions using hierarchical modeling, applied to home radon measurement and remediation}, + author={Lin, Chiayu and Gelman, Andrew and Price, Phillip N and Krantz, David H}, + journal={Statistical Science}, + volume={14}, + number={3}, + pages={305--337}, + year={1999}, + publisher={Institute of Mathematical Statistics} +} + +@incollection{peck_should_2005, + address = {Belmont, CA}, + edition = {4th edition}, + title = {Should You Measure the Radon Concentration in Your Home?}, + isbn = {978-0-534-37282-8}, + language = {English}, + booktitle = {Statistics: {A} {Guide} to the {Unknown}}, + publisher = {Duxbury Press}, + author = {Price, Phillip N. and Gelman, Andrew}, + collaborator = {Peck, Roxy and Casella, George and Cobb, George W. 
and Hoerl, Roger and Nolan, Deborah}, + month = mar, + year = {2005}, + pages = {149--170} +} + +@book{cook2007interactive, + title={Interactive and dynamic graphics for data analysis: with R and GGobi}, + author={Cook, Dianne and Swayne, Deborah F and Buja, Andreas}, + volume={1}, + year={2007}, + publisher={Springer} +} + +@book{gelman_data_2006, + address = {Cambridge ; New York}, + edition = {1st edition}, + title = {Data {Analysis} {Using} {Regression} and {Multilevel}/{Hierarchical} {Models}}, + isbn = {978-0-521-68689-1}, + language = {English}, + publisher = {Cambridge University Press}, + author = {Gelman, Andrew and Hill, Jennifer}, + month = dec, + year = {2006} +} + + @Article{HLMdiag, + title = {{HLMdiag}: A Suite of Diagnostics for Hierarchical Linear Models in {R}}, + author = {Adam Loy and Heike Hofmann}, + journal = {Journal of Statistical Software}, + year = {2014}, + volume = {56}, + number = {5}, + pages = {1--28}, + url = {https://www.jstatsoft.org/article/view/v056i05}, + } + +@article{apte1999predicting, + title={Predicting New Hampshire indoor radon concentrations from geologic information and other covariates}, + author={Apte, MG and Price, PN and Nero, AV and Revzan, KL}, + journal={Environmental Geology}, + volume={37}, + number={3}, + pages={181--194}, + year={1999}, + publisher={Springer} +} + + + diff --git a/_articles/RJ-2024-004/wolny-zadlo.tex b/_articles/RJ-2024-004/wolny-zadlo.tex new file mode 100644 index 0000000000..07d23d4ee0 --- /dev/null +++ b/_articles/RJ-2024-004/wolny-zadlo.tex @@ -0,0 +1,605 @@ +\title{Prediction, Bootstrapping and Monte Carlo Analyses Based on Linear Mixed Models with QAPE 2.0 Package} +\author{by Alicja Wolny--Dominiak and Tomasz \.{Z}\c{a}d{\l}o} + +\maketitle % Produces the title. + +%\bigskip +%\abstract{ + %An abstract of less than 150 words. + + + %Introductory section which may include references in parentheses + %\citep{R}, or cite a reference such as \citet{R} in the text. 
+ + %\section{Introduction} + %\begin{figure}[htbp] + % \centering + % \includegraphics{Rlogo} + % \caption{The logo of R.} + % \label{figure:rlogo} + %\end{figure} + + %\section{Introduction} + + + % Please keep the abstract below 300 words + + \abstract{ + The paper presents a new R package \CRANpkg{qape} for prediction, accuracy estimation of various predictors and Monte Carlo simulation studies of properties of both predictors and estimators of accuracy measures. It allows to predict any population and subpopulation characteristics of the response variable based on the Linear Mixed Model (LMM). The response variable can be transformed, e.g. to logarithm and the data can be in the cross-sectional or longitudinal framework. Three bootstrap algorithms are developed: parametric, residual and double, allowing to estimate the prediction accuracy. Analyses can also include Monte Carlo simulation studies of properties of the methods used. Unlike other packages, in the prediction process the user can flexibly define the predictor, the model, the transformation function of the response variable, the predicted characteristics and the method of accuracy estimation. +% +% \bigskip +% \noindent \textbf{Keywords:} linear mixed model, prediction accuracy, bootstrap procedures, model misspecification, Monte Carlo analyses +} + +\section{Introduction} +\label{intro} +One of the tasks in application of mixed models in the real-life problems is the prediction of random effects. Then, the predicted values give the possibility for further prediction, e.g. characteristics of interest such as sum, mean or quantiles or the future value of the response variable for cross-sectional or longitudinal data. + +Three main predictors of these characteristics are proposed in the literature: Empirical Best Linear Unbiased Predictors - EBLUPs (see e.g. \cite{henderson1950estimation} and \cite{royall1976linear}), PLUG-IN predictors (see e.g. 
\cite{boubeta2016empirical}, \cite{chwila2019properties}, \cite{hobza2016empirical}) and Empirical Best Predictors - EBPs (see e.g. \cite{molina2010small}). Each assumes the LMM to model the response variable. + +The numerous successful applications of these three predictors for cross-sectional and longitudinal data can be found in the model approach in survey sampling, including the small area estimation. In paper \cite{fay1979estimates} the Authors introduce the prediction of the mean income for small places based on the special case of the LMM model called Fay-Herriot model and the EBLUP. The analysis of poverty is extended in many works, e.g. in \cite{molina2010small} and \cite{christiaensen2012}. In turn, in \cite{SAE1988} the Authors analyse the total crop areas based on survey and satellite data using EBLUPs. The proposed LMM model is known as the Battese-Harter-Fuller model. The predictors are also exploited in the subject of experience rating in non-life insurance, see \cite{frees1999} and \cite{buhlmann2005}, where the longitudinal data are under consideration. The insurance premium for the next period for every policy in the insurance portfolio is predicted. + +A major challenge in this type of prediction is the estimation of the prediction accuracy measure. Most often it is the Root Mean Squared Error (RMSE), which is given in analytical form or can be e.g. estimated using bootstrap. A feature of the distribution of the squared prediction error is usually a very strong positive asymmetry. Because the mean is not recommended as the appropriate measure of the central tendency in such distributions, the alternative prediction accuracy measure called the Quantile of Absolute Prediction Errors (QAPE), proposed by \cite{zadlo2013parametric} and \cite{zadlo2020bootstrap}, can be applied. + +There is a variety of R packages to calculate the considered predictors together with the accuracy measure of prediction, usually the RMSE. 
The package \CRANpkg{sae}, see \cite{sae}, provides EBLUPs based on Fay-Herriot and Battese-Harter-Fuller models. In turn, the multivariate EBLUP for Fay-Herriot models is implemented in \CRANpkg{msae}, see \cite{msae}. Several EBLUPs introduced in \cite{rao1994small} are implemented in package \CRANpkg{saery} introduced by \cite{saery}, likewise in \CRANpkg{JoSAE}, see \cite{josae}, but with additional heteroscedasticity analysis. The EBP is provided in the package \CRANpkg{emdi} described in \cite{kreutzmann2019r}. + + A new package in this area is our proposed package \CRANpkg{qape}. It allows the prediction of flexibly defined characteristics of the response variable using the above three predictors, assuming an appropriate LMM. A novel feature of the package \CRANpkg{qape}, compared to those already in place, is the ability of bootstrap estimation of the prediction accuracy measures, both the RMSE and QAPE. Three types of bootstrap procedures are provided: parametric, residual and double. + +%The functions provided in the package \CRANpkg{qape} can be seen as giving an extension of some usability of all these packages. +There are three groups of functions in this package: predictors values calculation, bootstrap estimation of RMSE and QAPE measures, and Monte Carlo (MC) analysis of properties of predictors and prediction accuracy estimators. The prediction is based on a LMM model defined by the user and allows to predict the population characteristics of the response variable, which can be defined by a linear combination (in the case of EBLUP), by any R function (e.g. \code{sum}) or any function defined by the user (in the case of the EBP and PLUG-IN predictors). The package allows for full flexibility in defining: the model, the predicted characteristic, and the transformation of the response variable. + +This paper is organized as follows. 
Firstly, the background of the LMM is presented together with the theoretical foundations of the prediction including prediction accuracy measures. Then, the package functionality in the area of prediction is presented and illustrated. A short application based on \code{radon} data, a cross-sectional dataset available in \CRANpkg{HLMdiag} package, to predict three subpopulation characteristics is shown. Subsequently, the theoretical background of the prediction accuracy measures estimation based on bootstrap is presented. Implementations of bootstrap algorithms in \CRANpkg{qape} are briefly introduced. Finally, the procedure of the model-based Monte Carlo simulation study is discussed. The paper ends with a conclusion. + + +\section{Prediction accuracy measures} \label{PAM} + +We consider the problem of prediction of any given function of the population vector $\mathbf{Y}$ of the response variable: +\begin{equation}\label{theta} +\theta = f_{\theta}(\mathbf{Y}) +\end{equation} +under the LMM. It covers linear combinations of $\mathbf{Y}$ (such as one future realization of the response variable or population and subpopulation means and totals) but also other population and subpopulation characteristics such as quantiles and variability measures. + + +To assess the accuracy of the particular predictor $\hat \theta$, firstly, the prediction error is defined as $U=\hat{\theta}-\theta$. Therefore, the well-known RMSE has the following formula: +\begin{equation}\label{eq0} + RMSE(\hat{\theta})=\sqrt{E(\hat{\theta}-\theta)^{2}}=\sqrt{E({{U}^{2}})}. +\end{equation} +The alternative to the RMSE based on the mean could be the QAPE based on quantiles. 
It represents the $p$th quantile of the absolute prediction error $|U|$, see \cite{zadlo2013parametric} and \cite{zadlo2020bootstrap}, and it is given by: +\begin{equation}\label{eq1} + QAPE_p(\hat{\theta}) = \inf \left\{ {x:P\left( {\left| {{\hat{\theta}-\theta}} \right| \le x} \right) \ge p} \right\} =\inf \left\{ {x:P\left( {\left| {{U}} \right| \le x} \right) \ge p} \right\} +\end{equation} +This measure informs that at least $p100\%$ of observed absolute prediction errors are smaller than or equal to $QAPE_p(\hat{\theta})$, while at least $(1-p)100\%$ of them are higher than or equal to $QAPE_p(\hat{\theta})$. Quantiles reflect the relation between the magnitude of the error and the probability of its realization. It means that using the QAPE, it is possible to make a full description of the distribution of prediction errors instead of using the average (reflected by the RMSE). Furthermore, the MSE is the mean of positively (usually very strongly) skewed squared prediction errors, where the mean should not be used as a measure of the central tendency of positively skewed distributions. + +The above described accuracy prediction measures RMSE and QAPE can be estimated using the bootstrap techniques. Their estimators as well as the bootstrap distributions of the prediction errors based on any (assumed or misspecified) model are provided in \CRANpkg{qape} package, including algorithms where the parallel computing is used. + +In the \CRANpkg{qape} package, the whole prediction process has its own specific procedure, which can be presented in the following steps. 
+ +\begin{procedure} The process of prediction, accuracy measures estimation and Monte Carlo simulation analyses in \CRANpkg{qape} +\label{Proc1} +\begin{enumerate} + \item Define the characteristics of the response variable to predict, + \item provide the information on sample and population values, + \item define the LMM, + \item estimate parameters of the LMM, + \item predict the random variable $\theta$ using the chosen class of predictors, + \item estimate the prediction accuracy measures RMSE and QAPE using one of the developed bootstrap algorithms, + \item conduct simulation analyses of properties of predictors and accuracy measures estimators under any (also misspecified) LMM model. +\end{enumerate} +\end{procedure} + +\section{The prediction under LMM} + +The main functions of the \CRANpkg{qape} package provide the bootstrap estimation of prediction accuracy measures. However, it must be preceded by the prediction process, including the choice of the LMM and the predictor. + +\subsection{The model} +Let $\mathbf{Y}$ denote the vector of response variables $Y_1, Y_2,..., Y_N$. Assuming, without a loss of generality, that only the first $n$ realizations of $Y_i$ are observed, $\mathbf{Y}$ can be decomposed as $\mathbf{Y}= +\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{Y}_r^T +\end{bmatrix}^T$ , +where $\mathbf{Y}_s$ and $\mathbf{Y}_r$ are of dimension $n \times 1$ and $(N - n) \times 1$, respectively. In all notations, the subscript "s" is used for observed realizations of the variable of interest and "r" for the unobserved ones. Two known matrices of auxiliary variables are also considered, denoted by $\mathbf{X}$ and $\mathbf{Z}$, which are associated with fixed and random effects, respectively. The $\mathbf{X}$ matrix is of dimension $N \times p$, and it consists of $p$ regression variables. 
It can be decomposed like $\mathbf{Y}$ as follows: $\mathbf{X}= +\begin{bmatrix} + \mathbf{X}_s^T & \mathbf{X}_r^T +\end{bmatrix}^T$, +where matrices $\mathbf{X}_s$ and $\mathbf{X}_r$, both known, are of dimension $n \times p$ and $(N-n) \times p$, respectively. Similarly, the $\mathbf{Z}$ matrix of dimension $N \times h$ can be written as follows: $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_s^T & \mathbf{Z}_r^T +\end{bmatrix}^T$, +where matrices $\mathbf{Z}_s$ and $\mathbf{Z}_r$, both known, are of dimension $n \times h$ and $(N-n) \times h$, respectively. + +Then, let $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ denotes the LMM of the following form (e.g. \cite{rao2015small}, p. 98): +\begin{equation} + \label{LMM} + \left\{ \begin{array}{c} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}\mathbf{v}+\mathbf{e} \\ + E(\mathbf{e})=\mathbf{0}, E(\mathbf{v})=\mathbf{0} \\ + Var(\mathbf{e})=\mathbf{R}(\pmb{\delta}), Var(\mathbf{v})=\mathbf{G}(\pmb{\delta}) + \end{array} \right. +\end{equation} +The vector of parameters in model (\ref{LMM}) is then $\boldsymbol{\psi}=\begin{bmatrix} + \boldsymbol{\beta}^T & \pmb{\delta}^T +\end{bmatrix}^T$, +where $\boldsymbol{\beta}$ is a vector of fixed effects of dimension $p \times 1$ and $\pmb{\delta}$ is a vector of variance components. The random part of the model is described by the known matrix $\mathbf{Z}$, a vector $\mathbf{v}$ of random effects of dimension $h \times 1$ and a vector $\mathbf{e}$ of random components of dimension {$N\times 1$}, where $\mathbf{e}$ and $\mathbf{v}$ are assumed to be independent. The vector of random components $\mathbf{e}$ will be decomposed similarly to the vector $\mathbf{Y}$, i.e. $\mathbf{e}=\begin{bmatrix} + \mathbf{e}_s^T & \mathbf{e}_r^T +\end{bmatrix}^T$. + +In the residual bootstrap implemented in \CRANpkg{qape}, there is a need to re-write the LMM model to take account of the specific structure of data, i.e. 
the grouping variables taken into account in the random part of the model. In this case, without a loss of the generality, the LMM model can be written as follows: +\begin{equation}\label{LMMa} + \mathbf{Y}=\mathbf{X}\boldsymbol{\beta} + \mathbf{Z}_1\mathbf{v}_1+...+\mathbf{Z}_l\mathbf{v}_l+...+\mathbf{Z}_L\mathbf{v}_L+\mathbf{e}, +\end{equation} +where $\mathbf{v}_1,\dots,\mathbf{v}_l,\dots,\mathbf{v}_L$ are independent vectors of random effects assumed for different divisions of the $\mathbf{Y}$ vector (under different grouping of the data) and $\mathbf{Z}_1, \dots, \mathbf{Z}_l, \dots, \mathbf{Z}_L$ are known matrices of auxiliary variables associated with random effects. Writing in (\ref{LMMa}): $\mathbf{Z}= +\begin{bmatrix} + \mathbf{Z}_1 & \dots & \mathbf{0} & \dots & \mathbf{0} \\ + \vdots & \ddots & & & \vdots \\ + \mathbf{0} & \dots & \mathbf{Z}_l & \dots & \mathbf{0} \\ + \vdots & & & \ddots & \vdots \\ + \mathbf{0} & \dots & \mathbf{0} & \dots & \mathbf{Z}_L \\ +\end{bmatrix}$ and +$\mathbf{v}= +\begin{bmatrix} + \mathbf{v}_1^T & \dots & \mathbf{v}_l^T & \dots & \mathbf{v}_L^T \\ +\end{bmatrix}^T$ +the LMM model is obtained. Let + + +\begin{equation} \label{vl} +\mathbf{v}_l=\left[ \mathbf{v}_{l1}^T \dots \mathbf{v}_{lk}^T \dots \mathbf{v}_{lK_l}^T \right]^T +\end{equation} +be of dimension $K_l J_l \times 1$, where $\mathbf{v}_{lk}$ is of dimension $J_l \times 1$ for all $k=1,...,K_l$ and $K_l$ is the number of random effects at the $l$th level of grouping. Hence, $\mathbf{Z}_l$ is $N \times K_l J_l$. For example, if the random regression coefficient model is considered with two random coefficients where both random effects are subpopulation-specific, where $D$ is the number of subpopulations, then $L=1$, $K_1=2$ and $J_1=D$. + +\subsection{Predictors} +In the \CRANpkg{qape} package, in the general case the predicted characteristic is given by any function of response variables: +\begin{equation} \label{ftheta} +\theta = f_{\theta}(\mathbf{Y}). 
+\end{equation} +Under the $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\psi})$ model it could be predicted using one of three predictors: +\begin{enumerate} + \item Empirical Best Linear Unbiased Predictor (EBLUP), + \item Empirical Best Predictor (EBP) under nested error LMM, + \item PLUG-IN predictor under the LMM. +\end{enumerate} + +The first predictor (EBLUP) allows to predict the linear combination of the response variables: +\begin{equation} \label{l.theta} +\theta = f_{\theta}(\mathbf{Y}) = \boldsymbol{\gamma}^T \mathbf{Y}= \boldsymbol{\gamma}_s^T \mathbf{Y}_s + \boldsymbol{\gamma}_r^T \mathbf{Y}_r, +\end{equation} +where $\boldsymbol{\gamma}$ is a vector of weights. In this case, the predicted characteristic $\theta$ is basically the linear combination of the response variable. For example, if one of the elements of $\boldsymbol{\gamma}$ equals 1 and the rest of the elements equals 0, then one realization of the response variable is predicted. If all elements in $\boldsymbol{\gamma}$ vector equal 1, then $\theta$ becomes the sum of all $Y_i$'s in the whole considered population dataset. The two-stage EBLUP corresponds to the Best Linear Unbiased Predictor (BLUP) introduced in \cite{henderson1950estimation} and \cite{royall1976linear} as: +\begin{equation} \label{BLUP} + \hat{\theta}^{BLUP} (\pmb{\delta}) = {\boldsymbol{\gamma}}_s^T \mathbf{Y}_s + \hat{\theta}_r(\pmb{\delta}), +\end{equation} +where the predictor of the linear combination $\boldsymbol{\gamma}_r^T \mathbf{Y}_r$ of unobserved random variables is given by $\hat{\theta}_r(\pmb{\delta})={\boldsymbol{\gamma }}_r^T {{\mathbf{X}}_r}{\tilde{\boldsymbol{\beta}} }(\pmb{\delta}) +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{\tilde{v}}}(\pmb{\delta})$, where $\tilde{\boldsymbol{\beta}}(\pmb{\delta})$ is the Best Linear Unbiased Estimator of $\boldsymbol{\beta}$ and $\tilde{\mathbf{v}}(\pmb{\delta})$ is the Best Linear Unbiased Predictor of $\mathbf{v}$, both presented in (\ref{LMM}). 
As shown by \cite{zadlo2017EBLUP} p. 8094, if $Cov(\mathbf{e}_r, \mathbf{e}_s)=\mathbf{0}$, then the predictor (\ref{BLUP}) is the BLUP of $\theta$ defined as the linear combination (\ref{l.theta}). Even if $Cov(\mathbf{e}_r, \mathbf{e}_s) \neq \mathbf{0}$, the predictor $\hat{\theta}_r(\pmb{\delta})$ is the Best Linear Unbiased Predictor of the following linear combination of $\boldsymbol{\beta}$ and $\mathbf{v}$: ${\boldsymbol{\gamma }}_r^T{{\mathbf{X}}_r}{ {\boldsymbol{\beta}} } +\boldsymbol{\gamma }_r^T{\mathbf{Z}}_r{\mathbf{{v}}}$. The EBLUP $\hat\theta^{EBLUP}$ is obtained by replacing the vector of variance components $\pmb{\delta}$ in BLUP (\ref{BLUP}) with the estimator $\hat{\pmb{\delta}}$. If (a) the expectation of the predictor is finite, (b) $\hat{\pmb{\delta}}$ is any even, translation-invariant estimator of $\pmb{\delta}$, (c) the distributions of both random effects and random components are symmetric around $\mathbf{0}$ (not necessarily normal), the EBLUP remains unbiased, as proved by \cite{kackar1981unbiasedness}. + +To introduce the second predictor, called EBP, considered e.g. by \cite{molina2010small}, firstly, the Best Predictor (BP) $\hat{\theta}^{BP}$ of characteristic $\theta(\mathbf{Y})$ has to be defined. It is computed by minimizing the Mean Squared Error $MSE(\hat\theta )=E(\hat\theta - \theta)^2$ and can be written as $\hat\theta^{BP} = E(\theta|\mathbf{Y}_s)$. It means that the conditional distribution of $\mathbf{Y}_r|\mathbf{Y}_s$ must be known to compute its value while at least the parameters of this distribution, denoted by $\boldsymbol{\psi}$ in (\ref{LMM}), are unknown. The EBP $\hat\theta^{EBP}$ is obtained by replacing these parameters with estimators $\hat{\boldsymbol{\psi}}$. Its value can be computed according to the Monte Carlo procedure presented in the supplementary document for this paper. + +The last predictor is the PLUG-IN predictor defined as (e.g. 
\cite{chwila2019properties}): +\begin{equation} + \hat{\theta}^{PLUG-IN}=\theta(\begin{bmatrix} + \mathbf{Y}_s^T & \mathbf{\hat{Y}}_r^T + \end{bmatrix}^T), +\end{equation} +where $\mathbf{\hat{Y}}_r$ is the vector of fitted values of unobserved random variables under the assumed model (any model specified by the statistician). Under the LMM and if the linear combination of $\mathbf{Y}$ is predicted, the PLUG-IN predictor is the EBLUP, but generally, it is not optimal. However, it was shown in simulation studies that it can have similar or even higher accuracy compared to empirical (estimated) best predictors, where the best predictors minimize the prediction mean squared errors (cf. e.g. \cite{boubeta2016empirical}, \cite{chwila2019properties}, \cite{hobza2016empirical}). Moreover, the PLUG-IN predictor is less computationally demanding than the EBP. + +\subsection{Predictors in \CRANpkg{qape}} + +To deal with the LMM model, the \CRANpkg{qape} package uses the \code{lmer()} function from the \CRANpkg{lme4} package, see \cite{lme4}. Assuming (\ref{LMM}) and based on $\mathbf{Y}_s$, the vector of model parameters $\boldsymbol{\psi} = [\boldsymbol{\beta}^T, \pmb{\delta}^T]^T$ is estimated using the Restricted Maximum Likelihood Method (REML), known to be robust on non-normality, see e.g \cite{jiang1996reml}, and $\hat{\boldsymbol{\psi}}$ is obtained. + +In order to obtain the predictor of $\theta$, one of the three \CRANpkg{qape} functions can be applied: \code{EBLUP()}, \code{ebpLMMne()} or \code{plugInLMM()}. Firstly, the characteristic of response variables of interest has to be defined. It is actually obvious for EBLUP, which can be used only to predict the population/subpopulation linear combination (e.g. the sum) by using the argument \code{gamma} equivalent to the population vector of weights $\boldsymbol{\gamma}$ in (\ref{l.theta}). 
For the other two predictors, the EBP and the PLUG-IN, the input argument called \code{thetaFun} has to be given (see $f_{\theta}(.)$ in (\ref{ftheta})). Function \code{thetaFun} could define one characteristic or a vector of characteristics, for example: + +\begin{example} +> thetaFun1 <- function(x) median(x) +> thetaFun2 <- function(x) c(sum(x), mean(x), sd(x)) +\end{example} + + +Secondly, two groups of input arguments, common to all three predictors, have to be provided: +\begin{itemize} + \item {group 1 - arguments defining the sample and the population} + \begin{itemize} + \item \code{YS} - values of the dependent variable in the sample ($\mathbf{Y}_s$), + \item \code{reg} - the population matrix of auxiliary variables named in \code{fixed.part}, \code{random.part} and \code{division}, + \item \code{con} - the population $0-1$ vector with $1$s for elements in the sample and $0$s for elements which are not in the sample, + \end{itemize} + \item {group 2 - arguments defining the model} + \begin{itemize} + \item \code{fixed.part} - fixed-effects terms declared as in \code{lme4::lmer} function, + \item \code{random.part} - random-effects terms declared as in \code{lme4::lmer} function, + \item \code{weights} - the population vector of weights. + \end{itemize} +\end{itemize} +The weights make it possible to include heteroscedasticity of random components in the LMM. + +In \code{EBLUP()} and \code{plugInLMM()} the random-effects terms of the LMM have to be declared as the input argument \code{random.part}. The form of the \code{ebpLMMne} predictor, in turn, requires defining in the \code{ebpLMMne()} function the so-called \code{division} argument instead of \code{random.part}. This input represents the variable dividing the population dataset into subsets, which are taken into account in the nested error linear mixed model with '\code{division}'-specific random components (presented in supplementary document for this paper). 
+ +In the process of prediction, it is often necessary to perform data transformation before estimating the model parameters. An example is the logarithmic scaling of the variable of interest. The \CRANpkg{qape} package offers the possibility for declaring the argument \code{backTrans} to conduct the data back-transformation. Hence, a very flexible solution is used which allows to use any transformation of the response variable such that the back-transformation can be defined. This argument (available in R or defined by the user function) should be the back-transformation function of the already transformed dependent variable used to define the model, e.g. for log-transformed \code{YS} used as the response variable: +\begin{example} +> backTrans <- function(x) exp(x) +\end{example} + +The main output is the value of predictor \code{thetaP}. For each class of predictors, there are two S3 methods registered for existing generic functions \code{print} and \code{summary}. The full list of output arguments is presented in detail in the \code{qape-manual} file, cf. \cite{qape}. + +\subsection{Radon data and the model} + +In order to demonstrate the functionality of the package's main functions, in the following examples the \code{radon} dataset available in \CRANpkg{HLMdiag} package (\cite{HLMdiag}) is analyzed. It contains the results of a survey measuring radon concentrations in 919 owner-occupied homes in 85 counties of Minnesota (see Figure \ref{map}). A study was conducted in 1987-1988 by the Minnesota Department of Health, showing that indoor radon levels are higher in Minnesota compared to typical levels in the U.S. In the data, the response variable \code{log.radon} (denoted in (\ref{radon.model}) by $log(Y_{ic})$) is the radon measurement in logarithms of picoCurie per liter. 
The independent variables, on the other hand, are: \code{uranium} ($x_{1ic}$) the average county-level soil uranium content, \code{basement} ($x_{2ic}$) the 0-1 variable indicating the level of the home at which the radon measurement was taken - 0 for basement, 1 for the first floor, and \code{county} (denoted by subscript $c$ in (\ref{radon.model})) is county ID. + +\begin{figure}[h] +\centering +\includegraphics[scale=0.5]{mapaAll.png} +\caption{The maps of characteristics of radon concentration in counties in picoCurie per liter. The gray colour means that the value is NA (Not Available)}\label{map} +\end{figure} +In all considered examples, the prediction for the county no. 26 (\code{county == 26}) is conducted and it is assumed that the observations in this county from the first floor (\code{basement == 1}) are not available (see Figure \ref{boxplot}). + +\begin{figure}[h] +\centering +\includegraphics[scale=0.55]{boxAll.png} +\caption{The distributions of radon concentration in picoCurie per liter in counties. The red line indicates county no. 26}\label{boxplot} +\end{figure} +The \code{radon} dataset is widely discussed in the literature. In the paper \cite{nero1994statistically}, the Authors used an ordinary regression model to predict county geometric means of radon concentration using surficial soil radium data from the National Uranium Resource Evaluation. In turn, the paper \cite{price1996bayesian} focuses on the prediction of the geometric mean of radon for each county, but using a Bayesian approach. 
For the \code{radon} data we use the following model +\begin{equation}\label{radon.model} + log(Y_{ic}) = \beta_1 x_{1ic} + (\beta_2 + v_{1c}) x_{2ic} + \beta_0 + v_{2c} + e_{ic}, +\end{equation} +where $i=1,2,\dots,N$, $c=1,2,\dots, C$, $N = 919$ observations, $C = 85$ counties, $\beta_1$, $\beta_2$ and $\beta_0$ are unknown fixed effects, $v_{1c}$ and $v_{2c}$ are random effects, $e_{ic}$ are random components, $v_{1c}$, and $e_{ic}$ are mutually independent, $v_{2c}$ and $e_{ic}$ are mutually independent too, $Cor(v_{1c}, v_{2c}) = \rho$, $v_{1c} \sim (0, \sigma^2_{v_1})$, $v_{2c} \sim (0, \sigma^2_{v_2})$ and $e_{ic} \sim (0, \sigma^2_e)$. As can easily be seen, the considered model is the random coefficient model with two correlated \code{county}-specific random effects. Its syntax written using the package \CRANpkg{lme4} notation is as follows: +\begin{verbatim} +radon.model <- lmer(log.radon ~ basement + uranium + (basement | county), data = radon) +\end{verbatim} +This and similar LMMs are considered, analyzed, and used for the considered dataset in many publications, with a good overview presented in \cite{gelman_data_2006}. In \cite{gelman2006bayesian}, based on their preceding research \cite{price1996bayesian}, \cite{gelman1999analysis}, \cite{peck_should_2005}, a very similar model but with additional multivariate normality assumptions is studied, verified and chosen as fitting well to the data within a Bayesian framework. The same model as in \cite{gelman2006bayesian} with its special cases is considered in \cite{cantoni2021review} but within the frequentist approach. Based on 25 measures of explained variation and model selection, the Authors conclude that the same model as considered in our paper (with additional normality assumption, however, which is not used in all cases considered in that paper), "seems the best" \cite[p. 10]{cantoni2021review} for the \code{radon} data. 
Further tests of the model are presented by \cite{loy2013diagnostics}, \cite{loy2015you} and \cite{loy2017model} (see also \cite{cook2007interactive} for the introduction of the methodology) showing among others: the normality and homoscedasticity of random components, the normality of the distribution of the random slope but – what is important for our further considerations – the lack of the normality of the random intercept. Since the problem of choosing and verifying a model for the considered dataset is widely discussed in the literature, we will focus on the issues that are new in this case, namely the problem of prediction and estimation of the prediction accuracy as well as the Monte Carlo analysis of predictors' properties. + +\subsection{Example 1} + +This example shows the prediction procedure in the package \CRANpkg{qape}. In the first step, it is needed to define all the input arguments that will then be passed to the prediction functions. +\begin{verbatim} +> Ypop <- radon$log.radon # the population vector of the dependent variable +> # It is assumed that observations from the first floor +> # in county no. 26 are not available: +> con <- rep(1, nrow(radon)) +> con[radon$county == 26 & radon$basement == 1] <- 0 +> YS <- Ypop[con == 1] # sample vector of the dependent variable +> reg <- dplyr::select(radon, -log.radon) # the population matrix of auxiliary variables +> fixed.part <- 'basement + uranium' # the fixed part of the considered model +> random.part <- '(basement|county)' # the random part of the considered model +> # The vector of weights to define +> # the predicted linear combination - the mean for county == 26: +> gamma <- ++ (1 / sum((radon$county == 26))) * ifelse((radon$county == 26), 1, 0) +> estMSE <- TRUE # to include the naive MSE estimator of the EBLUP in the output +\end{verbatim} + +Then the functions corresponding to each predictor can be used. First, the EBLUP prediction in the package \CRANpkg{qape} is presented. 
As the EBLUP is limited to the linear combination of random variables, the predicted characteristic is simply the arithmetic mean. To be precise, it is the mean of logarithms of measurements (instead of the mean of measurements), because the EBLUP can be used only under the linear (linearized) models. As in the LMM the homoscedasticity of random components is assumed, the input argument \code{weights = NULL} is set up. + +\begin{verbatim} +> myeblup <- EBLUP(YS, fixed.part, random.part, reg, con, gamma, weights = NULL, estMSE) +> # the value of the predictor of the arithmetic mean +> # of logarithms of radon measurements: +> myeblup$thetaP +[1] 1.306916 +> myeblup$neMSE # the value of the naive MSE estimator +[1] 0.002292732 +\end{verbatim} +Hence, the predicted value of the arithmetic mean of logarithms of radon measurements equals $1.306916$ log picoCurie per liter. The estimated root of prediction MSE equals $\sqrt{0.002292732} \approx 0.048$ log picoCurie per liter, but -- what is important -- it is the value of the naive RMSE estimator \citep[as defined by][p. 106]{rao2015small}, which means that it ignores the decrease of accuracy due to the estimation of model parameters. + + The second part of this example shows the prediction of the arithmetic mean, geometric mean and median of radon measurements (not logarithm of radon measurements) in county no. 26 with the use of the PLUG-IN predictor. It requires the setting of two input arguments: \code{thetaFun} and \code{backTrans}. 
+
+\begin{verbatim}
+> thetaFun <- function(x) {
++ c(mean(x[radon$county == 26]), psych::geometric.mean(x[radon$county == 26]),
++ median(x[radon$county == 26]))
++ }
+> backTransExp <- function(x) exp(x) # back-transformation
+> myplugin <- plugInLMM(YS, fixed.part, random.part, reg, con, weights = NULL,
++ backTrans = backTransExp, thetaFun)
+> # values of the predictor of arithmetic mean, geometric mean
+> # and median of radon measurements:
+> myplugin$thetaP
+[1] 3.694761 4.553745 3.900000
+\end{verbatim}
+%Using the PLUG-IN predictor we are able to predict any characteristic of the variable of interest, taking into account any transformation, because the back-transformation must be defined.
+In this case we can conclude that the predicted values of the arithmetic mean, geometric mean and median in county no. 26 equal: $3.694761$, $4.553745$ and $3.9$ picoCurie per liter, respectively. The problem of prediction accuracy estimation will be discussed in the next sections of the paper.
+
+The \CRANpkg{qape} package allows the use of the Empirical Best Predictor (EBP) (see the supplementary document for this paper) as well. It provides predicted values of any function of the variable of interest, as the PLUG-IN predictor. However, this requires stronger assumptions to be met. The EBP procedure available in \CRANpkg{qape} package is prepared under the assumption of the normality of the variable of interest after any transformation. However, in the case of the considered model for logarithms of radon measurements, the assumption is not met as we mentioned before based on the results presented in the literature. 
It can also be verified using the \code{normCholTest} function (available in the \CRANpkg{qape} package) as follows:
+\begin{verbatim}
+> normCholTest(radon.model, shapiro.test)$p.value
+[1] 2.589407e-08
+\end{verbatim}
+Moreover, due to the very time-consuming iterative procedure used to compute the EBP for the general case, in the \CRANpkg{qape} package the function \code{ebpLMMne} uses a very fast procedure working only for nested error Linear Mixed Models (see \cite{molina2010small}).
+
+The prediction of any function of the random variables based on cross-sectional data has been considered. Its special case, not presented above but widely discussed in the econometric literature, is the prediction of one random variable, in this case a radon measurement for one non-observed owner-occupied home. Furthermore, the \CRANpkg{qape} package is also designed for prediction based on longitudinal data for current or future periods as shown in examples for the \code{EBLUP}, \code{plugInLMM} and \code{ebpLMMne} functions in the \code{qape-manual} file, cf. \cite{qape}.
+
+\section{Bootstrap procedures}
+
+The \CRANpkg{qape} package provides three main types of bootstrap algorithms: the parametric bootstrap, the residual bootstrap and the double-bootstrap.
+
+The parametric bootstrap procedure is implemented according to \cite{gonzales2007} and \cite{gonzales2008} and could be described in the following steps:
+
+\begin{enumerate}
+ \item based on $n$ observations of the dependent and independent variables ($\mathbf{Y}_s$, $\mathbf{X}_s$ and $\mathbf{Z}_s$) estimate $\boldsymbol{\psi}$ to obtain the vector of estimates $\boldsymbol{\hat{\psi}}$,
+ \item generate $B$ realizations $y_{i}^{*(b)}$ of $Y_{i}$, under the $LMM(\mathbf{X}, \mathbf{Z}, \hat{\boldsymbol{\psi}})$ and multivariate normality of random effects and random components obtaining\\
+ $\mathbf{y}^{*(b)}=\begin{bmatrix}
+ y_{1}^{*(b)} & ... & y_{i}^{*(b)} &... 
& y_{N}^{*(b)} + \end{bmatrix}^T$, where $i=1, 2, ... ,N$ and $b=1, 2, ... ,B$, + \item decompose the vector $\mathbf{y}^{*(b)}$ as follows + $\begin{bmatrix} + \mathbf{y}_s^{*(b)T} & \mathbf{y}_r^{*(b)T} + \end{bmatrix}^T$, + \item in the $b$th iteration ($b=1,2,...,B$) + \begin{enumerate} + \item compute the bootstrap realization $\theta^{*(b)}=\theta^{*(b)}(\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}})$ of random variable $\theta$, + \item obtain the vector of estimates $\boldsymbol{\hat{\psi}}^{*(b)}$ using $\mathbf{y}_s^{*(b)}$ and compute the bootstrap realization of predictor $\hat{\theta}$ denoted by $\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})$ based on $LMM(\mathbf{X}, \mathbf{Z}, \boldsymbol{\hat{\psi}}^{*(b)})$, + \item compute bootstrap realizations of prediction error $U^*$ denoted by $u^*$ and for the $b$th iteration given by: + \begin{equation}\label{u*b} + u^{*(b)}=\hat{\theta}^{*(b)}(\mathbf{y}_s^{*(b)},\boldsymbol{\hat{\psi}}^{*(b)})-\theta^{*(b)} + (\mathbf{y}^{*(b)},\boldsymbol{\hat{\psi}}) =\hat{\theta}^{*(b)}-\theta^{*(b)}, + \end{equation} + \end{enumerate} + \item compute the parametric bootstrap estimators of prediction accuracy measures: RMSE and QAPE replacing prediction errors $U$ in (\ref{eq0}) and (\ref{eq1}) by their bootstrap realizations. +\end{enumerate} + +Another possible method to estimate the prediction accuracy measures is the residual bootstrap. In what follows, we use the notation $srswr(\mathbf{A}, m)$ to indicate the outcome of taking a simple random sample with replacement of size $m$ of rows of matrix $\mathbf{A}$. If $\mathbf{A}$ is a vector, it simplifies to a simple random sample with replacement of size $m$ of elements of $\mathbf{A}$. 
+ +To obtain the algorithm of the residual bootstrap, it is enough to replace step 2 of the parametric bootstrap procedure presented above with the following procedure of the population data generation based on (\ref{LMMa}): +\begin{itemize} + \item generate $B$ population vectors of the variable of interest, denoted by $\mathbf{y}^{*(b)}$ as + \begin{equation}\label{LMMboot} + \mathbf{y}^{*(b)}=\mathbf{X}\hat{\boldsymbol{\beta}} + \mathbf{Z}_1\mathbf{v}^{*(b)}_1+...+\mathbf{Z}_l\mathbf{v}^{*(b)}_l+...+\mathbf{Z}_L\mathbf{v}^{*(b)}_L+\mathbf{e}^{*(b)}, + \end{equation} + where $\hat{\boldsymbol{\beta}}$ is an estimator (e.g. REML) of ${\boldsymbol{\beta}}$, + $\mathbf{e}^{*(b)}$ is a vector of dimension $N \times 1$ defined as $srswr(col_{1 \leq i \leq n } \hat{{e}}_{i}, N)$, where $\hat{{e}}_{i}$ ($i=1,2,...,n$) are residuals, $\mathbf{v}^{*(b)}_l$ (for $1,2,...,L$) is the vector of dimension $K_l J_l \times 1$ built from the columns of the matrix: + $srswr \left( + \left[ \begin{array}{ccccc} + \hat{\mathbf{v}}_{l1} & + \dots & + \hat{\mathbf{v}}_{lk} & + \dots & + \hat{\mathbf{v}}_{lK_l} + \end{array} + \right], J_l + \right)$ +of dimension $J_l \times K_l$, where $\hat{\mathbf{v}}_{lk}$ are estimates of elements of random effects vector (\ref{vl}). + +\end{itemize} +The next 3–5 steps in this procedure are analogous to steps in the parametric bootstrap procedure. + +In the above-described step, it can be seen that if more than one vector of random effect is assumed at the $l$th level of grouping, then the elements are not sampled with replacement independently. In this case, rows of the matrix formed by these vectors are sampled with replacement. + +The residual bootstrap algorithm can also be performed with so-called "correction procedure". 
This procedure, which can improve the properties of the residual bootstrap estimators due to the underdispersion of the uncorrected residual bootstrap distributions, is presented in the supplementary document for this paper. + +\section{Bootstrap in \CRANpkg{qape}} + +Two bootstrap procedures are implemented in separate functions: \code{bootPar()} (the parametric bootstrap) and \code{bootRes()} (the residual bootstrap). According to the general Procedure \ref{Proc1}, the step preceding the bootstrap procedure in both functions is the definition of the predictor object. It must be one of the following: \code{EBLUP}, \code{ebpLMMne} or \code{plugInLMM}. This object has to be passed to \code{bootPar()} or \code{bootRes()} as the input parameter \code{predictor}. The other input parameters are intuitive: \code{B} - the number of bootstrap iterations and \code{p} - order of quantiles in the estimated QAPEs. + +The additional input parameter in \code{bootRes()} is a logical condition called \code{correction}, which makes it possible to include an additional correction term for both random effects and random components, presented in the supplementary document for this paper, to avoid the problem of underdispersion of residual bootstrap distributions. + +The main output values in both functions are basically the measures: \code{estRMSE} and \code{estQAPE} computed based on (\ref{eq0}) and (\ref{eq1}), respectively, where prediction errors are replaced by their bootstrap realizations. There is also the output \code{error} being the vector of bootstrap realizations of prediction errors, which is useful e.g. in in-depth analysis of the prediction accuracy and for graphical presentation of results. To estimate these accuracy measures, we use below the residual bootstrap with the correction procedure. + +%As it was mentioned earlier, the estimation of model parameters in our package is made using \code{lmer()} function from the \CRANpkg{lme4} package. 
There are known situations, listed for example by \cite{lme4} p. 25 where convergence warnings are generated by this function due to the fact the the estimated variances of random effects are close to zero. This issues may occur where model parameters are estimated for small or medium-sized data sets, where complex variance-covariance are assumed or the number of levels in groups taken into account for random effects is small. We have not observed such issues in case of estimation of model parameters based on the original dataset required to compute values of the predictors in previous sections. However, in the case of bootstrapping or Monte Carlo situations the situation is more complex. It is due to the fact the based on the estimates of model parameters the values of the dependent variables are generated $B$ times and than model parameters are estimated in each iteration. Hence, at least in some iterations, the values of the dependent variable can be generated based on the Linear Mixed Model, where the variance of random effects is relatively close to zero. Then, estimates of model parameters will be obtained however, convergance warnings can occur. In such a case, there are at least two possible solutions. Firstly, we can discard iterations with warnings but it will imply that our dependent variable will not follow the assumed model, as we would like to, but its conditional version only with relatively high values of variances of random effects. Secondly, we can take into account all generated realizations despite convergance warnings if only the parameters can be estimated for all iterations. We use the second solution following the argument presented in \cite{lme4} p. 25 who write that "being able to fit a singular model is an advantage: when the best fitting model lies on the boundary of a constrained space". 
+ + +%%%%%%%%%%%%%%% +As previously stated, our package utilizes the \code{lmer()} function from the \CRANpkg{lme4} package for estimating model parameters. However, this function has been known to generate convergence warnings in certain situations, listed for example by \cite{lme4} p. 25, when the estimated variances of random effects are close to zero. Such scenarios may occur when models are estimated for smaller or medium-sized datasets, when complex variance-covariance structures are assumed, or when the grouping variable considered for random effects has only a few levels. Although we have not observed such issues estimating model parameters based on the original dataset required to compute values of the predictors in previous sections, bootstrapping or Monte Carlo simulations are more complex cases. This is because, based on the estimates of model parameters, the values of the dependent variables are generated $B$ times, and then model parameters are estimated in each out of $B$ iterations. Therefore, in at least some iterations, dependent variable values may be randomly generated giving realizations, where the variance of the random effect is relatively close to zero. As a result, estimates of model parameters can be obtained; however, convergence issues implying warnings may occur. In such cases, there are at least two possible solutions. The first option is to discard iterations with warnings, which would imply that the dependent variable would not follow the assumed model as required, but instead only its conditional version with relatively high values of variances of random effects. It will imply overdispersed bootstrap distribution of random effects, which will affect the bias of the bootstrap estimators of accuracy measures. The second option is to consider all generated realizations, despite convergence warnings, as long as the parameters can be estimated for all iterations. We opted for the latter solution, as argued in \cite{lme4} p. 
25, who noted that "being able to fit a singular model is an advantage: when the best fitting model lies on the boundary of a constrained space". +%%%%%%%%%%%%% + + + +\subsection{Example 2} + +%%% 2024-02-22 + + +The analyses presented in Example 1 are continued. We extend the previous results to include the issue of estimating the prediction accuracy of the considered predictors. The use of functions for this estimation primarily requires an object of class predictor, here "myplugin". +\begin{verbatim} +> class(myplugin) +[1] "plugInLMM" +\end{verbatim} +The short chunk of the R code presents the residual bootstrap estimators of the RMSE (\code{estRMSE}) and the QAPE (\code{estQAPE}) of the PLUG-IN predictors (\code{plugin}) of previously analyzed three characteristics of radon measurements in county no. 26: the arithmetic mean, geometric mean and median. In this and subsequent examples we make the computations for relatively high number of iterations allowing, in our opinion, to get reliable results. These results are also used to prepare Figure \ref{hist}. However, the computations are time-consuming. The supplementary R file contains the same chunks of the code but the number of iterations applied is smaller in order to execute the code swiftly. 
+\begin{verbatim} +> # accuracy measures estimates based on +> # the residual bootstrap with the correction: +> B <- 500 # number of bootstrap iterations +> p <- c(0.75, 0.9) # orders of Quantiles of Absolute Prediction Error +> set.seed(1056) +> residBoot <- bootRes(myplugin, B, p, correction = TRUE) +> # values of estimated RMSEs of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, respectively: +> residBoot$estRMSE +[1] 0.1848028 0.2003681 0.2824359 +> # values of estimated QAPEs +> # (of order 0.75 in the first row, and of order 0.9 in the second row) +> # of the predictor of three characteristics: +> # the arithmetic mean, geometric mean and median of radon measurements, +> # in the 1st, 2nd and 3rd column, respectively: +> residBoot$estQAPE + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +\end{verbatim} + +Let us concentrate on interpretations of estimators of accuracy measures for the predictor of the geometric mean, i.e. the second value of \code{residBoot\$estRMSE}, and values in the second column of \code{residBoot\$estQAPE}. It is estimated that the average difference between predicted values of the geometric mean and their unknown realizations equals $0.2003681$ picoCurie per liter. Furthermore, it is estimated that at least $75\%$ of absolute prediction errors of the predictor of the geometric mean are smaller or equal to $0.2135476$ picoCurie per liter and at least $25\%$ of absolute prediction errors of the predictor are higher or equal to $0.2135476$ picoCurie per liter. Finally, it is estimated that at least $90\%$ of absolute prediction errors of the predictor of the geometric mean are smaller or equal to $0.3397411$ picoCurie per liter and at least $10\%$ of absolute prediction errors of the predictor are higher or equal to $0.3397411$ picoCurie per liter. 
The distributions of bootstrap absolute prediction errors with values of estimated RMSEs and QAPEs for the considered three prediction problems are presented in Figure \ref{hist}. + + +\begin{figure}[h] +\centering +\includegraphics[scale=0.59]{histAll.png} +\caption{The histograms of bootstrap absolute prediction errors for \code{myplugin} (for PLUG-IN predictors of the arithmetic mean, geometric mean and median) for $B=500$}\label{hist} +\end{figure} + + + +Since the assumption of normality is not met, the parametric bootstrap should not be used in this case. For this reason, we do not present the results for this method below, although -- but for illustrative purposes only -- they are presented in the supplementary R file. Moreover, these analyses can also be conducted using \code{bootParFuture()} and \code{bootResFuture()} functions where parallel computing algorithms are applied. The input arguments and the output of these functions are the same as in \code{bootPar()} and \code{bootRes()}. Examples based on these functions are also included in the supplementary R file. + +\section{Bootstrap under the misspecified model in \CRANpkg{qape}} +The \CRANpkg{qape} package also allows to use predictors under a model different from the assumed one (e.g. a simpler or more robust model), but estimate its accuracy under the assumed model. In this case, the parametric and residual bootstrap procedures are implemented in \code{bootParMis()} and \code{bootResMis()} functions. These functions allow to estimate the accuracy of two predictors under the model correctly specified for the first of them. Of course, it is expected that the estimated accuracy of the first predictor will be better than of the second one, but the key issue can be the difference between estimates of accuracy measures. 
A small difference, even to the second predictor's disadvantage, may be treated by the user as an argument for using the second predictor due to its properties, such as robustness or simplicity.
+
+The considered functions allow the estimation of the accuracy of two predictors, which belong to the class \code{plugInLMM}, under the model used to define the first of them. The remaining arguments are the same as in \code{bootPar()} and \code{bootRes()} functions: \code{B} - the number of bootstrap iterations, and \code{p} - orders of QAPE estimates to be taken into account.
+
+The output results of \code{bootParMis()} and \code{bootResMis()} include -- similarly to \code{bootPar()} and \code{bootRes()} functions -- estimates of the RMSEs and QAPEs of both predictors (denoted here by: \code{estRMSElmm}, \code{estRMSElmmMis}, \code{estQAPElmm} and \code{estQAPElmmMis}), and bootstrap realizations of their prediction errors (\code{errorLMM} and \code{errorLMMmis}).
+
+\subsection{Example 3}
+
+In this example, we study the same accuracy measures as in Example 2, but the aim is to compare the predictor \code{myplugin} and another predictor defined under the misspecified LMM. First, the misspecified model has to be defined, and a relevant predictor has to be computed.
+\begin{verbatim}
+> fixed.part.mis <- '1'
+> random.part.mis <- '(1|county)'
+> myplugin.mis <- plugInLMM(YS, fixed.part.mis, random.part.mis, reg, con,
++ weights = NULL, backTrans = backTransExp, thetaFun)
+\end{verbatim}
+Having two objects: \code{myplugin} and \code{myplugin.mis}, one can proceed to a comparison by estimating bootstrap prediction accuracy performed using the residual bootstrap with correction procedure. In this case, we estimate the prediction accuracy of these two predictors under the model used to define the first of them. 
+\begin{verbatim} +> set.seed(1056) +> residBootMis <- bootResMis(myplugin, myplugin.mis, B, p, correction = TRUE) +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmm +[1] 0.1848028 0.2003681 0.2824359 +> # residual bootstrap with the correction RMSE estimators +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estRMSElmmMis +[1] 0.1919184 0.3192304 0.2762137 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmm + [,1] [,2] [,3] +75% 0.1533405 0.2135476 0.2908988 +90% 0.2813886 0.3397411 0.4374534 +> # residual bootstrap with the correction QAPE estimators of order 0.75 and 0.9 +> # of 'plugin.mis' of: arithmetic mean, geometric mean and median +> # of radon measurements in county 26: +> residBootMis$estQAPElmmMis + [,1] [,2] [,3] +75% 0.2267062 0.3802836 0.3255197 +90% 0.2813787 0.4970726 0.4489399 +\end{verbatim} + +The results, presented above, were obtained for the same number of bootstrap iterations as in Example 2 ($B = 500$). If we compare, under the model defined in \code{plugin}, estimated RMSEs of \code{plugin} and \code{plugin.mis} predictors of the geometric mean given by $0.2003681$ and $0.3192304$ picoCurie per liter, respectively, we can state that the estimated accuracy (measured by RMSE estimators) of the first predictor is better comparing with the second one. If we are not interested in the average accuracy measures but in the right tail of the distribution of prediction errors, we can use estimates of QAPE of order 0.9 to compare the accuracy. 
The result for the \code{plugin.mis} of the geometric mean equals to $0.4970726$ picoCurie per liter, and it is higher comparing with $0.3397411$ picoCurie per liter obtained for \code{plugin} for the same prediction problem. Hence, in this case, the accuracy comparison based both on the RMSE and QAPE leads to the same finding. + +In the previous paragraph, we have focused on the results for the case of prediction of the geometric mean. If the comparison is made for the case of prediction of the arithmetic mean (the first column of output results) or the median (the third column of output results), we will come to the same conclusion regarding the estimated accuracy of \code{plugin} and \code{plugin.mis} as in the case of prediction of the geometric mean. + +Similarly to the residual bootstrap, the parametric bootstrap procedure \code{paramBootMis} available in \CRANpkg{qape} package can be performed. However, in the considered case the normality assumption is not met (as discussed above) and the procedure is not recommended. The appropriate chunk of the R code is presented in the supplementary R file, but it is solely intended for illustrative purposes. + +\section{Monte Carlo simulation analyses} + +In the previous section, our aim was to estimate the prediction accuracy under correctly specified or misspecified model. In this section, we do not estimate the accuracy, but we approximate the true prediction accuracy under the specified model in the Monte Carlo simulation study. The crucial difference is that in this case, the model parameters used are obtained based on the whole population dataset, not the sample. If the number of iterations is large enough, we can treat the computed values of the measures as their true values, which are unknown in practice. 
+
+The last step of the analysis in the \CRANpkg{qape} package presented in Procedure \ref{Proc1} is the Monte Carlo (MC) simulation analysis of:
+\begin{itemize}
+ \item properties of predictors
+ \item and properties of parametric, residual and double bootstrap estimators of accuracy measures.
+\end{itemize}
+The whole Monte Carlo procedure is as follows.
+
+\begin{procedure} Model-based Monte Carlo simulation analyses in \CRANpkg{qape}
+\label{Proc2}
+ \begin{enumerate}[label*=\arabic*.]
+ \item define the population vector of the dependent variable and the population matrix of auxiliary variables,
+ \item provide the information on the division of the population into the sampled and non-sampled part,
+ \item define $\theta$ - the characteristics of the response variable to be predicted,
+ \item define the predictors $\hat{\theta}$ and accuracy measure estimators whose properties are to be assessed,
+ \item define the model to be used to generate realizations of the values of the dependent variable and estimate its parameters based on population data,
+ \item For {k=1, 2, ..., K}
+\begin{enumerate}[label*=\arabic*.]
+ \item generate the population vector of the response variable based on the assumed model,
+ \item based on population data, compute the characteristics $\theta$, denoted by $\theta_k$,
+ \item based on sample data, estimate the parameters of the LMM,
+ \item based on sample data, compute values of predictors $\hat{\theta}$, denoted by $\hat{\theta}_k$,
+ \item based on sample data, estimate the accuracy of $\hat{\theta}$ using bootstrap methods,
+\end{enumerate}
+ \item End For
+ \item compute accuracy measures of predictors using $\hat{\theta}_k$ and $\theta_k$ (for $k=1,2, ..., K$),
+ \item compute accuracy measures of estimators of prediction accuracy measures. 
+\end{enumerate}
+\end{procedure}
+
+\section{Monte Carlo analyses in \CRANpkg{qape}}
+
+In order to perform a Monte Carlo (MC) analysis on the properties of predictors, it is necessary to have access to the entire population data for both dependent and independent variables. The function \code{mcLMMmis()} can be used with the following arguments. Firstly, the population values of the dependent variable (after a necessary transformation) should be declared as \code{Ypop}. By using the \code{Ypop} values, we can estimate the model parameters based on the entire population data (assuming that they are known). This allows us to generate values of the dependent variable in the simulation study that can mimic its distribution in the entire population, not just in the sample. This approach ensures that our simulation study can be an accurate representation of the random process in the entire population, resembling the real-world scenario. Secondly, three predictors: \code{predictorLMMmis}, \code{predictorLMM}, \code{predictorLMM2}, which belong to the class \code{plugInLMM}, are to be defined. The first one is used only to define the (possibly misspecified) model used to generate population values of the response variables. Accuracy of \code{predictorLMM} and \code{predictorLMM2} is assessed in the simulation study. The next two arguments include the number of MC iterations \code{K} and orders \code{p} of QAPEs used to assess the prediction accuracy. Finally, it should be noted that it is possible to modify covariance matrices of random components and random effects based on the model defined in \code{predictorLMMmis}, which are used to generate values of the dependent variable. This is possible by declaring values of the \code{ratioR} and \code{ratioG} arguments, which the diagonal elements of covariance matrices of random components and random effects, respectively, are divided by. 
+ +The output of this function covers the following statistics of both predictors computed in the simulation study: relative biases (\code{rBlmm} and \code{rBlmm2}), relative RMSEs (\code{rRMSElmm} and \code{rRMSElmm2}) and QAPEs (\code{QAPElmm} and \code{QAPElmm2}). Simulation-based prediction errors of both predictors (\code{errorLMM} and \code{errorLMM2}) are also taken into account. + +\subsection{Example 4} + +In the example, an MC simulation is carried out assuming the \code{myplugin} predictor. The goal is to approximate the true accuracy of the prediction assuming model (\ref{radon.model}). Hence, in the package \CRANpkg{qape}, all input predictor objects in the function \code{mcLMMmis} have to be defined as \code{myplugin}. +\ +\begin{verbatim} +> # input arguments: +predictorLMMmis <- myplugin # to define the model +predictorLMM <- myplugin # which properties are assessed in the simulation study +predictorLMM2 <- myplugin # which properties are assessed in the sim. study +\end{verbatim} +Except that no modification of covariance matrices has to be used. +\begin{verbatim} +# diag. elements of the covariance matrix of random components are divided by: +ratioR <- 1 +# diag. elements of the covariance matrix of random effects are divided by: +ratioG <- 1 +\end{verbatim} +We specify the number of Monte Carlo iterations. +\begin{verbatim} +K <- 500 # the number of MC iterations +\end{verbatim} +The analysis is conducted in the object \code{MC}. + +\begin{verbatim} +> set.seed(1086) +> MC <- mcLMMmis(Ypop, predictorLMMmis, predictorLMM, predictorLMM2, ++ K, p, ratioR, ratioG) +> # relative bias of 'predictorLMM' +> # of the arithmetic mean, geometric mean and median in county 26 (in %): +> MC$rBlmm +[1] -1.73208393 -0.04053178 -5.22355236 +\end{verbatim} +Results of the relative biases are obtained. 
It can be seen that, under the assumed model, the values of the considered predictor of the geometric mean (the second value of \code{MC\$rBlmm}) are smaller than possible realizations of the geometric mean on average by $0.04053178\%$. In turn, the relative RMSEs are as follows.
+
+\begin{verbatim}
+> # relative RMSE of 'predictorLMM'
+> # of the arithmetic mean, geometric mean and median in county 26 (in %):
+> MC$rRMSElmm
+[1] 3.429465 4.665810 7.146678
+\end{verbatim}
+In the considered case, the average difference between predicted values of the geometric mean and its possible realizations (the second value of \code{MC\$rRMSElmm}) equals $4.665810\%$. It should be noted that this value can be treated as the true value of the relative RMSE (if the number of iterations is large enough), not the estimated value obtained in Examples 2 and 3.
+
+Finally, QAPEs of orders 0.75 and 0.9 are considered.
+\begin{verbatim}
+> # QAPE of order 0.75 and 0.9 of 'predictorLMM'
+> # of the arithmetic mean, geometric mean and median in county 26:
+> MC$QAPElmm
+ [,1] [,2] [,3]
+75% 0.1491262 0.1989504 0.2919221
+90% 0.2895684 0.2959457 0.4728064
+\end{verbatim}
+
+Let us interpret the results presented in the second column of \code{MC\$QAPElmm}. At least $75\%$ ($90\%$) of absolute prediction errors of the predictor of the geometric mean are smaller or equal to $0.1989504$ ($0.2959457$) picoCurie per liter and at least $25\%$ ($10\%$) of absolute prediction errors of the predictor are higher or equal to $0.1989504$ ($0.2959457$) picoCurie per liter. Similar to the values of the rRMSEs in the previous code chunk, the values can be considered to be true QAPE values, not the estimates presented in Examples 2 and 3.
+
+In Example 4, the accuracy of one predictor under the model used to define this predictor was presented. 
A more complex version of the simulation study, where the properties of two predictors are studied under the model defined by the third predictor, is presented in the supplementary R file. What is more, the \CRANpkg{qape} package also allows the use of the \code{mcBootMis()} function to conduct MC analyses of properties of accuracy measure estimators (estimators of MSEs and QAPEs) of two predictors (which belong to the class \code{plugInLMM}) declared as arguments. The model used in the simulation study is declared in the first predictor, but the properties of accuracy measures estimators of both predictors are studied. Output results of \code{mcBootMis()} cover simulation results on properties of different accuracy measures estimators, including the relative biases and relative RMSEs of the parametric bootstrap MSE estimators of both predictors. The same simulation-based statistics but for parametric bootstrap QAPE estimators are also included. Other bootstrap methods, including the residual bootstrap with and without the correction procedure, are also taken into account. The full list of output arguments of the \code{mcBootMis()} function is presented in the \code{qape-manual} file, cf. \cite{qape}.
+
+\section{Conclusions}
+
+The package enables R users to make predictions and assess the accuracy under linear mixed models based on different methods in a fast and intuitive manner -- not only based on the RMSE but also based on Quantiles of Absolute Prediction Errors. It also covers functions which allow users to conduct Monte Carlo simulation analyses of properties of the methods of their interest. Its main advantage, compared to other packages, is the considerable flexibility in terms of defining the model (as in the \CRANpkg{lme4} package) and the predicted characteristic, but also the transformation of the response variable. 
+ +In our opinion, the package is useful for scientists, practitioners and decision-makers in all areas of research where accurate estimates and forecasts for different types of data (including cross-sectional and longitudinal data) and for different characteristics play the crucial role. We believe that it will be of special interest to survey statisticians interested in the prediction for subpopulations with small or even zero sample sizes, called small areas. + + +\bibliography{wolny-zadlo} + + +\address{Alicja Wolny--Dominiak\\ + Department of Statistical and Mathematical Methods in Economics \\ + University of Economics in Katowice\\ + 50, 1 Maja Street\\ + 40--287 Katowice\\ + Poland\\} +\email{alicja.wolny-dominiak@uekat.pl} \\ +\url{web.ue.katowice.pl/woali/} + +\address{Tomasz \.{Z}\c{a}d{\l}o \\ +Department of Statistics, Econometrics and Mathematics \\ +University of Economics in Katowice\\ +50, 1 Maja Street\\ +40--287 Katowice\\ +Poland\\} +\email{tomasz.zadlo@uekat.pl} \\ +\url{web.ue.katowice.pl/zadlo/} + +%\email{} \\ + +\end{article} + +\end{document} \ No newline at end of file diff --git a/_articles/RJ-2024-005/RJ-2024-005.Rmd b/_articles/RJ-2024-005/RJ-2024-005.Rmd new file mode 100644 index 0000000000..04c50736c0 --- /dev/null +++ b/_articles/RJ-2024-005/RJ-2024-005.Rmd @@ -0,0 +1,936 @@ +--- +title: 'text2sdg: An R Package to Monitor Sustainable Development Goals from Text' +abstract: | + Monitoring progress on the United Nations Sustainable Development + Goals (SDGs) is important for both academic and non-academic + organizations. Existing approaches to monitoring SDGs have focused on + specific data types; namely, publications listed in proprietary + research databases. We present the text2sdg package for the R + language, a user-friendly, open-source package that detects SDGs in + text data using different individual query systems, an ensemble of + query systems, or custom-made ones. 
The text2sdg package thereby + facilitates the monitoring of SDGs for a wide array of text sources + and provides a much-needed basis for validating and improving extant + methods to detect SDGs from text. +author: +- name: Dominik S. Meier + affiliation: University of Basel + address: + - Steinengraben 22 4051 Basel + - Switzerland + - '(ORCID: 0000-0002-3999-1388)' + - | + [dominik.meier@unibas.ch](dominik.meier@unibas.ch){.uri} +- name: Rui Mata + affiliation: University of Basel + address: + - Missionsstrasse 60-62 4055 Basel + - Switzerland + - '(ORCID: 0000-0002-1679-906X)' + - | + [rui.mata@unibas.ch](rui.mata@unibas.ch){.uri} +- name: Dirk U. Wulff + affiliation: University of Basel + address: + - Missionsstrasse 60-62 4055 Basel + - Switzerland + - '(ORCID: 0000-0002-4008-8022)' + - | + [dirk.wulff@unibas.ch](dirk.wulff@unibas.ch){.uri} +date: '2025-01-10' +date_received: '2022-09-13' +journal: + firstpage: 83 + lastpage: 95 +volume: 16 +issue: 1 +slug: RJ-2024-005 +citation_url: https://rjournal.github.io/ +packages: + cran: + - text2sdg + - corpustools + - readr + - ggplot2 + - deeplr + - SDGdetector + bioc: [] +preview: preview.png +bibliography: text2sdg.bib +CTV: ~ +legacy_pdf: yes +legacy_converted: yes +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js + md_extension: -tex_math_single_backslash +draft: no + +--- + + +:::::: article +## Introduction + +The United Nations Sustainable Development Goals (SDGs) have become an +important guideline for both governmental and non-governmental +organizations to monitor and plan their contributions to social, +economic, and environmental transformations. The 17 SDGs cover large +areas of application, from ending poverty and improving health, to +fostering economic growth and preserving natural resources. 
As the +latest UN report [@SGD_report2022] attests, the availability of +high-quality data is still lacking in many of these areas and progress +is needed in identifying data sources that can help monitor work on +these goals. Monitoring of SDGs has typically been based on economic and +health data, which are often difficult and costly to gather (e.g., +; ). One attractive +alternative that has emerged from recent scientometric efforts is to +detect SDGs from text, such as academic publications. Digitized text +represents an attractive resource for monitoring SDGs across a large +number of domains because it is becoming widely available in various +types of documents, such as news articles, websites, corporate reports, +and social media posts. In light of this promise, we developed +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), a freely +available, open-source tool to enable the SDG-labeling of digitized text +and facilitate methodological development in this area. In what follows, +we first present some background on existing labeling systems developed +to identify SDGs from text, and then provide an overview of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package, +showcase its use in a representative case study, and discuss the promise +and limitations of the approach. + +## An overview of SDG labeling systems + +The [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package +provides a user-friendly way to use any existing or custom-made labeling +system developed to monitor the 17 SDGs in text sources. The package +implements six different labeling systems utilizing different keywords +and keyword combination rules, as well as an ensemble model based on the +six systems that was trained on labeled data. 
In the following, we will +first introduce the six existing labeling systems, namely the Elsevier, +Aurora, Auckland, SIRIS, SDGO, and SDSN systems, before discussing how +these systems are combined within the ensemble approach. See table +[1](#tab:systems_overview) for overview of these labeling systems. We +address custom-made labeling systems in a dedicated section below. + +### Individual labeling systems + +The most prominent SDG labeling system has been developed by *Elsevier*. +The Elsevier labeling system was integrated into the Times Higher +Education Impact Rankings in 2019, which at the time compared 1,118 +universities in their efforts to address the SDGs as measured by the +frequency of SDG-related terms in their academic output. The Elsevier +queries consist of a list of expert-vetted keywords that are combined +using logical AND operators, implying that multiple keywords must be met +to label a document as containing a certain SDG. The development of the +queries started with an original list of keywords for each SDG that were +iteratively fine tuned to maximize the number of identified papers +closely reflecting the different SDGs. This involved cropping or +combining keywords to reduce the number of irrelevant hits. A detailed +report on the initial development of the Elsevier query system is +provided by @jayabalasingham2019identifying. Since the first version, +the Elsevier labeling system has been iteratively improved, with the +latest versions including additional information specific to academic +publications and the Scopus database, such as identifiers of journal +names or research areas. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the latest version without such additional identifiers to broaden the +package's applicability beyond the Scopus database +[@jayabalasingham2019identifying]. 
+ +The Aurora Universities Network's \"Societal Impact and Relevance of +Research\" working group started to develop a labeling system in 2017 to +increase the visibility of research into the SDGs. Aurora's queries were +developed with the goal of identifying SDG-related academic publications +included in the Scopus database. Consequently, the syntax of Aurora +queries is similar to the Scopus query language and the Elsevier system. +However, in contrast to the Elsevier system, the queries combine +keywords in a more complex fashion, recruiting Boolean (AND, OR) and +proximity operators (e.g., w/3, implying within 3 words). As a result, +Aurora's keywords are more specific, possibly leading to a smaller +number of false positives. The initial version of the Aurora system only +included terms that appear in the SDG policy text of the targets and +indicators defined by the United Nations. Subsequent versions expanded +on this by including additional keywords that reflect academic +terminology. [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) +implements version 5.0 of the Aurora labeling system +[@vanderfeesten_maurice_2020_3817445]. This version represents an +improvement on previous versions based on a survey study +[@vanderfeesten_maurice_2020_3813230] and modifications inspired in +other efforts, namely those from Elsevier (above) and SIRIS (introduced +below). + +The Auckland labeling system [@wang2023mapping] was developed by the +University of Auckland to better understand how their research output +contributes to the SDGs. To construct the queries, they used text-mining +techniques to extract global and local SDG keywords from publication +metadata. These keywords were then sorted according to the number of +publications that include the terms and according to the keywords' term +frequency--inverse document frequency. The top-ranked keywords were then +manually reviewed to only retain keywords that are relevant. 
The +selected keywords were then combined with those of SDSN and Elsevier as +well as UN SDG Indicators to form the final SDG keyword list. These +queries formed the basis for the Auckland queries, which make use of +Boolean (AND, OR) operators and wildcards (e.g., \"\*\"). + +The SIRIS labeling system [@duran_silva_nicolau_2019_3567769] was +created by SIRIS Academic as part of the +[\"science4sdgs\"](http://science4sdgs.sirisacademic.com/) project to +better understand how science, innovation efforts, and technology +related to the SDGs. The SIRIS queries were constructed in a five-step +procedure. First, an initial list of keywords was extracted from the +United Nations official list of goals, targets and indicators. Second, +the list was manually enriched on a basis of a review of SDG relevant +literature. Third, a word2vec model that was trained on a text corpus +created from the enriched keyword list was used to identify keywords +that were semantically related to the initial list. Fourth, using the +DBpedia API, keywords were added that, according to the Wikipedia +corpus, had a categorical relationship with the initial list. Fifth, and +finally, the keyword list was manually revised. The queries of the SIRIS +labeling system primarily consist of individual keywords that +occasionally are combined with a logical AND. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the only currently available version of the SIRIS labeling system +[@duran_silva_nicolau_2019_3567769] . + +The Open Source SDG (OSDG) project combines data from multiple sources +to detect SDGs in text. Instead of developing yet another query system, +OSDG's aim was to re-use and integrate existing knowledge by combining +multiple SDG \"ontologies\" (i.e., query systems). 
OSDG has also made +use of Microsoft Academic Graph to improve their results but because our +query-based system cannot implement this procedure, we adopt the simpler +ontology initially proposed by OSDG, which we refer to as \"SDGO\" in +the package. The labeling system was based on central keywords in the +SDG United Nations description (e.g.\"sanitation\" was classified into +\"SDG6\") and then manually expanded with additional relevant keywords +identified from a corpus of already labeled documents. The resulting +keyword list only makes use of the OR operator. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the only currently available version of these queries [@Bautista2019]. + +Finally, the Sustainable Development Solutions Network [SDSN, @sdsn] +labeling system contains SDG-specific keywords compiled in a +collaborative effort by several universities from the Sustainable +Development Solutions Network (SDSN) Australia, New Zealand & Pacific +Network. This query system was developed to detect SDGs in large sets of +university-related text data, such as course listings or research +publications. The authors used United Nations documents, Google +searches, and personal communications as sources for the keywords. This +query system combines keywords with OR operators and does not make use +of AND operators. + +All in all, as can be seen in Table [1](#tab:systems_overview), the +latter systems differ from the former four in the complexity of their +queries: the Elsevier, Aurora, Auckland, and SIRIS systems make use of +keyword-combination queries and other criteria, such as proximity +operators, whereas SDGO and SDSN only make use of keywords. 
+ +::: {#tab:systems_overview} + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Labeling system SDGs covered Query operators Unique keywords per SDG (mean & SD) Example query (SDG-01) + ----------------- ---------------- -------------------------------------- ------------------------------------- ----------------------------------------------------------- + Elsevier SDG 1 - SDG 16 OR, AND, wildcards (21.7) \"extreme poverty\" + + Aurora SDG 1 - SDG 17 OR, AND, wildcards, proximity search (31.6) (\"poverty\") W/3 (\"chronic\*\" OR \"extreme\") + + Auckland SDG 1 - SDG 16 OR, AND, wildcards (46.5) \"poverty eradication\" + + SIRIS SDG 1 - SDG 16 OR, AND \(148\) (\"anti-poverty\") AND (\"poverty\" OR \"vulnerability\") + + SDGO SDG 1 - SDG 17 OR \(236\) \"absolute poverty\" + + SDSN SDG 1 - SDG 17 OR (16.8) \"End poverty\" + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 1: Overview of the labeling systems implemented in + [**text2sdg**](https://CRAN.R-project.org/package=text2sdg). Legend: + OR---keywords are combined using logical ORs, implying that only the + keywords must be matched to assign an SDG label; AND---keywords are + combined using logical ANDs, implying that multiple keywords must be + matched to assign an SDG label; wildcards---keywords are matched + considering different keyword parts; proximity search---keywords must + co-occur within a certain word window to assign an SDG label. +::: + +### The ensemble labeling system + +In another publication [@wulff2023using], we evaluated the accuracy of +the six labeling systems implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) and a rival +approach [i.e., OSDG @pukelis2020osdg] using expert-labeled data sets. 
+These analyses lead to three critical observations. First, the accuracy +of SDG classifications was reasonable for all systems, but varied +considerably as a function of the data set. This is because the systems +differ in how liberal or conservative they assign SDGs to texts due to +differences in the types of query operators they employ. Specifically, +employing only OR-operators, SDGO and SDSN were considerably more +liberal, whereas the other four systems employing additional operators +were more conservative. In other words, the systems implement different +trade-offs between sensitivity (i.e., true-positive rate) and +specificity (i.e., true-negative rate). As a result, SDGO and SDSN +outperformed the other systems for SDG-rich documents and vice versa. In +addition to these differences in accuracy, we observed critical biases +in SDG profiles, with the systems overemphasizing different sets of +SDGs, and strong dependencies between SDG predictions and document +length. To address these limitations, we developed an ensemble model +approach that uses the predictions of the six systems and document +length as inputs to a random forest model. After training with +expert-labeled and synthetic data, the ensemble model showed better +out-of-sample accuracy, lower false alarm rates, and smaller biases than +any individual labeling system [@wulff2023using]. As a result, this +ensemble model is also made available through +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) using a +dedicated function. + +In the following sections, we provide an overview of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) R package +and demonstrate how its functions can be used to detect and +analyze SDGs in text. 
+ +## The text2sdg package + +### Motivation for text2sdg + +Despite the effort put into developing various labeling systems and +their great promise in addressing the SDG-related data scarcity, extant +implementations of these approaches are not without shortcomings. First, +the labeling systems were mostly developed to be used within academic +citation databases (e.g., Scopus) and are not easily applied to other +text sources. Second, existing implementations lack transparent ways to +communicate which features are matched to which documents or how they +compare between a choice of labeling systems. We alleviate these +shortcomings by providing an open-source solution, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), that lets +users detect SDGs in any kind of text using any of the above-mentioned +systems, and ensemble of systems, or even customized, user-made labeling +systems. The package provides a common framework for implementing the +different extant or novel approaches and makes it easy to quantitatively +compare and visualize their results. + +### Overview of text2sdg package + +At the heart of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package are +the Lucene-style queries that are used to detect SDGs in text and the +ensemble models that build on these queries. The queries map text +features (i.e., words or a combination of words) to SDGs. For example, a +text that contains the words \"fisheries\" and \"marine\" would be +mapped to SDG 14 (i.e., conserve and sustainably use the oceans, seas +and marine resources for sustainable development) by the Aurora system. +To enable the use of such queries in R, the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package +recruits the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) +package [@corpustools]. 
+[**corpustools**](https://CRAN.R-project.org/package=corpustools) +has been built to implement complex search queries and execute them +efficiently for large amounts of text. Based on this, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) provides +several functions that implement extant labeling systems, facilitate the +specification of new labeling systems, and analyze and visualize search +results. Table [2](#tab:functions_overview) gives an overview of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) core +functions. + +The main functions of +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) are +`detect_sdg` and `detect_sdg_systems`, which implement the ensemble +model approach [@wulff2023using] and the implemented labeling systems, +respectively, to identify SDGs in texts. The texts are provided to these +functions via the `text` argument as either a character vector or an +object of class `"tCorpus"` from +[**corpustools**](https://CRAN.R-project.org/package=corpustools). All +other arguments are optional. By default, the `detect_sdg_systems` +function runs only the Aurora, Auckland, Elsevier, and SIRIS systems, +but the set of systems can be extended to all six systems using the +`systems` argument. 
The functions further allow customization of the set +of SDGs using the `sdgs` argument and return a `tibble` with one row per +hit that has the following columns (and types) (italic column names only +present in the tibble returned by `detect_sdg_systems`): + +- document (factor) - index of element in the character vector or + corpus supply for text + +- sdg (character) - labels indicating the matched SDGs + +- system (character) - the query or ensemble system that produced the + match + +- *query_id* (integer) - identifier of query in the query system + +- *features* (character) - words in the document that were matched by + the query + +- hit (numeric) - running index of matches for each system + +Further details on the `detect_sdg` and `detect_sdg_systems` functions +and their output will be presented in the next section. + +The `detect_any` function implements the same functionality as +`detect_sdg_systems`, but permits the user to specify customized or +self-defined queries. These queries are specified via the `queries` +argument and must follow the syntax of the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) +package (see Practical Considerations section for more details). + +To support the interpretation of SDG labels generated by `detect_sdg`, +`detect_sdg_systems` and `detect_any`, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) further +provides the `plot_sdg` and `crosstab_sdg` functions. The `plot_sdg` +function visualizes the distribution of SDG labels identified in +documents by means of a customizable barplot showing SDG frequencies for +the different labeling systems. The `crosstab_sdg` function helps reveal +patterns of label co-occurrences either across SDGs or systems, which +can be controlled using the `compare` argument. 
+ +::: {#tab:functions_overview} + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Function Name Description + ---------------------- --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + `detect_sdg` identifies SDGs in text using an ensemble model that draws on the six labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). + + `detect_sdg_systems` identifies SDGs in text by using labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). + + detect_any similar to `detect_sdg` but identifies SDGs in text using user-defined queries. + + `crosstab_sdg` crosstab_sdg takes the output of detect_sdg, detect_sdg_systems, or detect_any as input and determines correlations between either query systems or SDGs. + + `plot_sdg` takes the output of detect_sdg, detect_sdg_systems, or detect_any as input and produces adjustable barplots illustrating the hit frequencies produced by the different query systems. + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 2: Overview of package functions +::: + +## Demonstrating the functionality of text2sdg + +To showcase the functionalities of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package we +analyze the publicly available p3 dataset of the Swiss National Science +Foundation (SNSF) that lists research projects funded by the SNSF. 
In +addition to demonstrating +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), the case +study will permit us to discuss practical issues concerning the labeling +of SDGs, including relevant differences between labeling systems. The +data to reproduce the analyses presented below can be found at + [@meier_2024_11060662]. + +### Preparing the SNSF projects data + +The SNSF projects data was downloaded from +. As of March 2022, the p3 database +included information on 81,237 research projects. From the data, we +removed 54,288 projects where the abstract was absent or not written in +English. This left us with a total of 26,949 projects. To ready this +data for analysis, we read it using the `readr` function of the +[**readr**](https://CRAN.R-project.org/package=readr) package [@readr], +producing a `tibble` named `projects`. A reduced version of this +`tibble` is included in the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package and +available through the `projects` object after +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) has been +loaded. + +### Using `detect_sdg` and `detect_sdg_systems` to detect SDGs + +To label the abstracts in `projects` using `detect_sdg`, we only have to +supply the character vector that includes the abstracts to the `text` +argument of the `detect_sdg` function. In addition the example below +makes use of the `synthetic` argument to implement the `"equal"` +(default) and `"triple"` version of the ensemble model. As a result, two +versions of the ensemble model are run that were trained on an equal +amount of synthetic (non-SDG related) and expert-labeled data and three +times the amount of synthetic than labeled data, respectively. A larger +amount of synthetic data in training lowers the false-positive rate, but +also compromises accuracy [cf. @wulff2023using for more details]. 
+ +``` r +# detect SDGs +> sdgs_ensemble <- detect_sdg(text = projects, ++ synthetic = c("equal","triple")) +Running systems +Obtaining text lengths +Building features +Running ensemble + +> head(sdgs_ensemble) +# A tibble: 6 × 4 + document sdg system hit + +1 22 SDG-06 Ensemble equal 2539 +2 39 SDG-03 Ensemble equal 498 +3 39 SDG-07 Ensemble equal 2953 +4 39 SDG-08 Ensemble equal 4080 +5 41 SDG-13 Ensemble equal 5690 +6 41 SDG-13 Ensemble triple 3684 + + +``` + +The first two columns of the `tibble` returned by `detect_sdg` show the +document and SDGs identified by the model. Further columns show the +system producing the hit and a running hit index for a given system. As +the predictions of the six individual labeling systems are used as input +for the ensemble models, they will be computed in the background. The +user can access these predictions by calling +`attr(sdgs_ensemble, "system_hits")`. Alternatively, the user can use +the `detect_sdg_systems` function, which provides additional options for +customization. + +As with the `detect_sdg` function, the `detect_sdg_systems` function +requires a character vector as input to the `text` argument. In +addition, the example below specifies two optional arguments. First, to +indicate that all six systems should be run, rather than the default of +only Aurora, Auckland, Elsevier, and SIRIS, we supply a character vector +of all six systems' names to the `systems` argument. Second, we +explicitly set the `output` argument to `“features”`, which in contrast +to `output = “documents”` delivers more detailed information about which +keywords that triggered the SDG labels. 
+ +``` r +# detect SDGs +> sdgs <- detect_sdg_systems(text = projects, ++ systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO"), ++ output = "features") +Running Aurora +Running Elsevier +Running Auckland +Running SIRIS +Running SDSN +Running SDGO + +> head(sdgs) +# A tibble: 6 × 6 + document sdg system query_id features hit + +1 1 SDG-01 SDSN 392 sustainable 4 +2 1 SDG-02 SDSN 376 maize 3 +3 1 SDG-02 SDSN 629 sustainable 8 +4 1 SDG-08 SDGO 3968 work 1 +5 1 SDG-08 SDSN 812 work 11 +6 1 SDG-09 SDSN 483 research 6 + +``` + +The above `tibble` produced by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) contains for +every combination of document, SDG, system, and query (columns 1 to 4), +the query feature (keyword) that triggered the label (column 5), and a +hit index for a given system (column 6). The first row of the `tibble` +thus shows that the query 392 within SDSN labeled document number 1 with +SDG-01, because the document included the feature *sustainable*, and +that this was the fourth hit produced by the SDSN system. It is +important to note that, in other cases, multiple features of a query +might be matched, which will result in multiple rows per combination of +document, SDG, system, and query. This can be avoided by setting the +`output` argument to `“documents”`, in which case all features' hits of +such combinations will be grouped into a single row. + +### Analyzing the SDG labels + +To visualize the distribution of SDG labels across SDGs and systems in +the `sdgs` `tibble`, we apply the `plot_sdg` function. By default, +`plot_sdg` shows a barplot of the number of documents labeled by each of +the SDGs, with the frequencies associated with the different systems +stacked on top of each other. The function counts a maximum of one hit +per document-system-SDG combination. 
Duplicate combinations resulting +from hits by multiple queries or keywords in queries will be suppressed +by default and the function returns a message reporting the number of +cases affected. + +``` r + +> plot_sdg(sdgs) +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +``` + +```{r figuredefault-plot, echo=FALSE , fig.cap="Default plot of distribution of detected SDGs.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("default_plot_revision.png")) +``` + +The plot produced by `plot_sdg` (Figure  \@ref(fig:figuredefault-plot)) +shows considerable differences in the frequency of different SDGs, with +SDGs 3 ("Good Health and Well-Being") and 9 ("Industry, Innovation And +Infrastructure") being most frequent and SDGs 5 ("Gender Equality") and +14 ("Life Below Water") being least frequent. Furthermore, there are +substantial differences in the number of labels produced by different +systems, with SDSN and SDGO having produced many more labels than the +other three systems. + +To customize the visualization of SDG frequencies, the `plot_sdg` +function provides several additional arguments. For instance, by setting +`sdg_titles` to `TRUE`, the SDG titles will be added to the annotation +of the plot. Other arguments are `normalize` to show probabilities +instead of frequencies, `color` to change the filling of bars, and +`remove_duplicates` to eliminate duplicate document-system-SDG +combinations. Furthermore, as `plot_sdg` is built on +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) [@ggplot2], +the function can easily be extended by functions from the +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) universe. To +illustrate these points, the code below generates a plot (Figure + \@ref(fig:figuredefault-plot-facetted)) that includes SDG titles and +separates the results of the different SDG systems using facets. 
+ +``` r +> plot_sdg(sdgs, ++ sdg_titles = TRUE) + ++ ggplot2::facet_wrap(~system, ncol= 1, scales = "free_y") +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +``` + +```{r figuredefault-plot-facetted, echo=FALSE , fig.cap="Distribution of detected SDGs facetted by system.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("default_plot_sdg_labels_revision.png")) +``` + +The separation of systems better illustrates the results of systems that +produce fewer hits and helps compare the results across systems. This +reveals, for instance, that in the Elsevier system SDG 3 ("Good Health +and Well-Being") was most prominent, whereas in the Aurora system this +was SDG 13 (\"Climate Action"). These results highlight that the +different labeling systems do not necessarily agree concerning the +assignment of SDGs to documents. + +To quantify the commonalities and differences between labeling systems, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) provides the +`crosstab_sdg` function. The function evaluates the level of alignment +across either systems (the default) or SDGs by calculating $\phi$ +coefficients between the vectors of labels. We supply the `hits` +argument of the function with the `sdgs` `tibble` containing the labels +produced by `detect_sdg`. Note that the function only considers distinct +combinations of documents, systems and SDGs, irrespective of whether the +`detect_sdg` function was run using `output = “documents”` or +`output = "features”`. 
+ +``` r + +> crosstab_sdg(sdgs) + Auckland Aurora Elsevier SDGO SDSN SIRIS +Auckland 1.0000000 0.3345247 0.6676524 0.3314806 0.2896650 0.4115387 +Aurora 0.3345247 1.0000000 0.3256877 0.1614586 0.1569791 0.3703457 +Elsevier 0.6676524 0.3256877 1.0000000 0.2642918 0.2192051 0.3538272 +SDGO 0.3314806 0.1614586 0.2642918 1.0000000 0.3722997 0.2244774 +SDSN 0.2896650 0.1569791 0.2192051 0.3722997 1.0000000 0.2330684 +SIRIS 0.4115387 0.3703457 0.3538272 0.2244774 0.2330684 1.0000000 +``` + +The output of `crosstab_sdg()` for the SNSF projects reveals two +noteworthy insights. First, the correspondence between the labels of +different systems is rather small, as indicated by $\phi$ coefficients +that are mostly smaller than 0.4. Second, there are two groups of +systems that are more similar to one another. On the one hand, Elsevier, +Auckland, Aurora, and SIRIS, and, on the other hand, SDGO and SDSN. +These groups correspond to differences in query operators, with the +former four including AND operators in their queries, whereas the latter +two do not. `crosstab_sdg()` can also be called with the output from the +ensemble models. + +``` r +> crosstab_sdg(sdgs_ensemble) + Ensemble equal Ensemble triple +Ensemble equal 1.0000000 0.8127837 +Ensemble triple 0.8127837 1.0000000 +``` + +It can further be informative to analyze the correlations between SDGs. +To do this, we set the `compare` argument in `crosstab_sdg()` to +`"sdgs"`. The output below shows the result for the first six SDGs by +setting `sdgs = 1:6`. It can be seen that certain pairs of SDGs---in +particular, SDG-01 and SDG-02---co-occur more frequently. These results +may provide insights into the co-occurrence structure of SDGs in the +data at hand. However, these results can also highlight the importance +of considering similarities between queries targeting different SDGs. 
+ +``` r + +> crosstab_sdg(sdgs, compare = "sdgs", sdgs = 1:6) + SDG-01 SDG-02 SDG-03 SDG-04 SDG-05 SDG-06 +SDG-01 1.00000000 0.47455139 0.04811778 0.07928418 0.14252372 0.16622948 +SDG-02 0.47455139 1.00000000 0.10611662 0.06751253 0.09338952 0.17504027 +SDG-03 0.04811778 0.10611662 1.00000000 0.18092227 0.10936179 0.04882173 +SDG-04 0.07928418 0.06751253 0.18092227 1.00000000 0.11791600 0.07887042 +SDG-05 0.14252372 0.09338952 0.10936179 0.11791600 1.00000000 0.04603253 +SDG-06 0.16622948 0.17504027 0.04882173 0.07887042 0.04603253 1.00000000 + +``` + +## Practical considerations + +### Specifying user-defined labeling systems + +The query systems implemented in +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) represent +important efforts to systematize the monitoring of SDGs from text. +Nevertheless, these efforts are still relatively young and validations +of the systems are largely missing, creating a need for continued +development. [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) +supports the further development of new SDG labeling systems by +providing the `detect_any` function. In this section, we provide +additional detail on using this feature of +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg). + +The `detect_any` function also uses +[**corpustools**](https://CRAN.R-project.org/package=corpustools) as the +back-end. This implies that new queries must be specified to match the +syntax of +[**corpustools**](https://CRAN.R-project.org/package=corpustools). The +syntax supports standard Boolean operators (AND, OR, and NOT), wildcard +operators, and proximity search. Boolean operators control how different +keywords are combined in a query. For instance, the query \"marine OR +fisheries\" matches text that contains either of these two words whereas +the query \"marine AND fisheries\" only matches text that contains both +words. Corpustools also allows to specify common query wildcard +operators [^1]. 
The wildcard operators $?$ and $*$ allow the +specification of variable word parts. For instance, the question mark +operator $?$ matches one unknown character or no character at all, e.g., +\"?ish\" would match \"fish\", \"dish\", or \"ish\". The asterisk +operator $*$, by contrast, matches any number of unknown characters, +e.g., \"\*ish\" would match \"fish\" but also \"Swedish\". Both +wildcards can be used at the start, within or end of a term. Proximity +search extends a Boolean AND, by requiring that two keywords have no +more than defined distances to one another. For instance, \"climate +change\"$\sim$3 specifies matches in which \"climate\" +and \"change\" both occur no more than three words apart. A complete +description of the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) syntax +is presented in the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) +vignette and documentation. + +To supply a user-defined labeling system to `detect_any`, the queries +must be placed in a `data.frame` or `tibble` that additionally includes +a column specifying the labeling system's name and a column of SDG +labels corresponding to the queries. + +- system (character) - name of the labeling systems. + +- query (character) - user-defined queries. + +- sdg (integer) - SDG labels assigned by queries. + +The example below illustrates the application of a user-defined labeling +system using `detect_any`. First, a `tibble` is defined that includes +three rows, one for each of three different queries stored in the +`query` column. The system is called `"my_example_system"` in the +`system` column and each of the queries is assigned SDG-14 in the `sdg` +column. Note that specification of the labeling system need not be made +in R, but can easily be outsourced to a spreadsheet that is then +processed into a `tibble`. 
Second, the system is supplied to the +`system` argument of the `detect_any` function, along with the texts +(here, the SNSF abstracts). The output is analogous to the output of the +`detect_sdg_systems` function (for brevity, we only show the first three +lines of the output). + +``` r +> # definition of query set +> my_example_system <- tibble::tibble(system = "my_example_system", ++ query = c("marine AND fisheries", ++ "('marine fisheries') AND sea", ++ "?ish"), ++ sdg = c(14,14,14)) +> detect_any(text = projects, ++ system = my_example_system) +# A tibble: 591 × 6 + document sdg system query_id features hit + + 1 6 SDG-14 my_example_system 3 wish 122 + 2 134 SDG-14 my_example_system 3 wish 18 + 3 241 SDG-14 my_example_system 3 fish 59 +``` + +### Applying text2sdg to non-English data + +The queries of the labeling systems implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) are in +English, implying that texts in other languages must first be translated +to English. We assessed feasibility and whether translation affects the +reliability of SDG labels by making use of back translation with one +language we are most familiar with (German). To this end, we first +translated 1,500 randomly selected SNSF project abstracts from English +to German and from German to English and then compared the labels of the +original English and back-translated English abstracts. We carried out +the translation using the DeepL translation engine +([www.deepl.com/translator](https://www.deepl.com/translator)). + +Table [3](#tab:table_corr) shows the results of this analysis. +Overall, the correlations as measured by the $\phi$-coefficient are very +high. The systems showed correlations above or equal to $0.88$, with +Elsevier and Auckland showing the highest value of $0.93$. 
Considering +that our analysis involves not only one, but two translation +steps---from English to German and back---these results suggest that +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can be +applied to non-English text, such as German, with very high accuracy. +One should note, however, that the quality of translation may vary +across languages and translation engines, so additional work is needed to +compare performance across different languages. + +::: {#tab:table_corr} + ---------------------------------------------------- + Aurora Elsevier Auckland SIRIS SDSN SDGO + -------- ---------- ---------- ------- ------ ------ + 0.91 0.93 0.93 0.88 0.91 0.91 + + ---------------------------------------------------- + + : Table 3: $\phi$-coefficient between the labels for the original + English text and the labels for the back-translated + (English-German-English) English text +::: + +### Estimating the runtime of text2sdg + +The analysis of text data can be computationally intense. To provide +some guidance on the expected runtime of +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) for data +with different numbers of documents and different document lengths, we +carried out several experiments. For this purpose, we first simulated +documents by concatenating 10, 100, 1,000, or 10,000 words drawn +randomly according to word frequencies in Wikipedia and combined 1, 10, +100, or 1,000 thus-generated documents into simulated data sets. Then we +evaluated the runtime of +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) separately +by system for the simulated data sets. + +Figure \@ref(fig:figurebenchmark-plot) shows the average runtime in +seconds across 7,000 repetitions of each combination of document length +and number of documents for each of the labeling systems. The results +highlight three noteworthy points. 
First, runtime is primarily a function of +the number of words, irrespective of how words are distributed across +documents. Second, the runtime per word decreases as the number of +words increases, which is due to a constant overhead associated with +optimizing the labeling systems' queries. Third, there are considerable +differences in the runtime between systems, which is, in part, due to +the functions' overhead and, in part, due to differences in number and +complexity of queries. The fastest system is Elsevier, processing 10 +million words in roughly one minute; the slowest system is SIRIS, +processing 10 million words in about 40 minutes. Overall, these +experiments highlight that +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can +efficiently process large amounts of text, but also that some care +should be exercised when dealing with extremely large or many texts. In +such cases, it may be advisable to rely on more efficient labeling +systems, such as Elsevier or SDSN. + +```{r figurebenchmark-plot, echo=FALSE , fig.cap="Median runtime as a function of number of documents and document length using 6 different query systems. Each cell reflects the average runtime of 7,000 runs with numbers reflecting the median runtime in seconds and color reflecting the logarithm of the median runtime in seconds.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("benchmark_revision_final.png")) +``` + +## Other approaches to detecting SDGs in text + +There are a number of other approaches to detecting SDGs in text. First, +there are approaches outside the R ecosystem. One such tool is the +European Union's SDG Mapper +() that produces an +analysis of SDGs per document using an online interface in which +registered users can upload single documents. Another prominent example +is the OSDG tool developed by the SDG Ai Lab of the United Nations in +collaboration with private partners. 
It can detect SDGs in text that is +provided through the OSDG website () or, if granted +access, through an API. The OSDG tool builds on the SDG Ontology (SDGO) +that is also implemented in +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg). OSDG +additionally leverages a machine learning tool that was trained on +expert-labeled data to make the final predictions [@OSDG2]. One +advantage of OSDG relative to +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) is that it +allows to detect SDGs in 15 different languages. This is done by using +translation of the input text into English before passing it through the +OSDG workflow. While this is convenient to the user, the same outcome +can be achieved with our package by making use of translation models +through, for example the +[**deeplr**](https://CRAN.R-project.org/package=deeplr) R package. As +our proof-of-concept above has shown, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can be used +with non-English text (e.g., German) with very high accuracy by using +such an approach. + +Second, there are currently, to our knowledge, two other R packages +aimed at providing methods for the automated detection of SDGs in text. +The [**SDGdetector**](https://CRAN.R-project.org/package=SDGdetector) +package is based on a custom query system that was generated by pooling +several existing query systems and manual adaptions. The resulting +labeling system permits finer-grained predictions on the level of SDG +targets [^2]. However, the method is computationally taxing and limited +to texts that are shorter than 750 characters or approximately 150 +words. The **SDGmapR** package builds on publicly available SDG keywords +that are assigned weights that indicate the degree to which a keyword +reflects a given SDG. The package computes SDG weights for each text by +adding up the weights of the keywords that were found in the text. 
The +larger this weight, the larger should be the likelihood that the text is +related to a specified SDG. The advantage of this approach is that it +permits customization of the decision boundary (i.e., the weight needed +to count a text as SDG related). However, the package does not give the +user a binary decision regarding whether a text relates to a given SDG. +None of the two packages offers an ensemble model that can be used to +categorize the presence of SDGs as is the case with +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg). + +## Discussion + +The [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package +offers an open and easily accessible way of detecting SDGs in text using +both individual query systems, a state-of-the-art ensemble model that +combines queries from extant systems [@wulff2023using], as well as +custom-made queries. + +While our package implements several query-based methods to detect SDGs +in text as well as a state-of-the-art ensemble model, the field of +detecting SDGs in text is rapidly evolving. Our aim is to continuously +update [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) as +new open source methods of detecting SDGs in text are released. Bundling +many systems in a coherent API is not only convenient for users, but +also helps catalyze development of new and hopefully more accurate +methods by making it easy to compare the performance of the different +systems. We deliberately incorporated functions that allow users to +implement and test their own query systems to facilitate this process. +We also encourage others to contribute to +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) by adding +new systems or by expanding the existing functionalities to analyse the +output of the systems. 
+ +Indeed, although the systems implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) have been +shown to achieve high accuracy [@wulff2023using], it is important to +stress that these systems must be further developed to increase their +accuracy for a greater number of document types. Two approaches can help +in achieving this. First, unsupervised methods such as topic models +[@grun2011topicmodels] or semantic network analysis [@siew2019cognitive] +can help in identifying novel linguistic patterns for the detection of +SDGs. One should note, however, that unsupervised methods are no +replacement for top-down, rule-based methods as implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), because of +the strong requirement to compare results across data sets, analyses, +and time, which require a clear set of benchmarks that are not simply +data-driven. Second, recent transformer based models +[@reimers2019sentence] could be leveraged to learn more complex +relationships between specific linguistic patterns and SDGs. However, +the field will have to work towards producing more balanced training +data before the full potential of these approaches can be exploited. +Moreover, one should note that transformer models are computationally +expensive and often limited to short text due to architecture +constraints [@ding2020cogltx]. Whether such developments will emerge and +can be ultimately integrated into +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) or will +represent alternative approaches remains an open question. + +## Conclusion + +In this article, we introduced a new R package, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), designed to +help identify SDGs from text. The package promises to help detect SDGs +in text sources using different existing or custom-made labeling systems +as well as a high-performance ensemble model that builds on these +labeling systems. 
Our case study and additional analyses suggest that +the approach can handle both sources in English as well as translations, +allows user-friendly use of novel queries, and provides reasonably +efficient performance for analysing large corpora. +:::::: + +[^1]: Note that the meaning of these wildcards differs from regex + wildcards. + +[^2]: Each SDG has several targets that are operationalized with + indicators (SDG/targets/indicators). For example the first target of + SDG 1 reads as follows: \"By 2030, eradicate extreme poverty for all + people everywhere, currently measured as people living on less than + \$1.25 a day\". diff --git a/_articles/RJ-2024-005/RJ-2024-005.html b/_articles/RJ-2024-005/RJ-2024-005.html new file mode 100644 index 0000000000..9babda22a2 --- /dev/null +++ b/_articles/RJ-2024-005/RJ-2024-005.html @@ -0,0 +1,2790 @@ + + + + + + + + + + + + + + + + + + + + + + text2sdg: An R Package to Monitor Sustainable Development Goals from Text + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    text2sdg: An R Package to Monitor Sustainable Development Goals from Text

    + + + +

    Monitoring progress on the United Nations Sustainable Development +Goals (SDGs) is important for both academic and non-academic +organizations. Existing approaches to monitoring SDGs have focused on +specific data types; namely, publications listed in proprietary +research databases. We present the text2sdg package for the R +language, a user-friendly, open-source package that detects SDGs in +text data using different individual query systems, an ensemble of +query systems, or custom-made ones. The text2sdg package thereby +facilitates the monitoring of SDGs for a wide array of text sources +and provides a much-needed basis for validating and improving extant +methods to detect SDGs from text.

    +
    + + + +
    +
    +

    1 Introduction

    +

    The United Nations Sustainable Development Goals (SDGs) have become an +important guideline for both governmental and non-governmental +organizations to monitor and plan their contributions to social, +economic, and environmental transformations. The 17 SDGs cover large +areas of application, from ending poverty and improving health, to +fostering economic growth and preserving natural resources. As the +latest UN report (UN 2022) attests, the availability of +high-quality data is still lacking in many of these areas and progress +is needed in identifying data sources that can help monitor work on +these goals. Monitoring of SDGs has typically been based on economic and +health data, which are often difficult and costly to gather (e.g., +https://sdg-tracker.org/; https://www.sdgindex.org/). One attractive +alternative that has emerged from recent scientometric efforts is to +detect SDGs from text, such as academic publications. Digitized text +represents an attractive resource for monitoring SDGs across a large +number of domains because it is becoming widely available in various +types of documents, such as news articles, websites, corporate reports, +and social media posts. In light of this promise, we developed +text2sdg, a freely +available, open-source tool to enable the SDG-labeling of digitized text +and facilitate methodological development in this area. In what follows, +we first present some background on existing labeling systems developed +to identify SDGs from text, and then provide an overview of the +text2sdg package, +showcase its use in a representative case study, and discuss the promise +and limitations of the approach.

    +

    2 An overview of SDG labeling systems

    +

    The text2sdg package +provides a user-friendly way to use any existing or custom-made labeling +system developed to monitor the 17 SDGs in text sources. The package +implements six different labeling systems utilizing different keywords +and keyword combination rules, as well as an ensemble model based on the +six systems that was trained on labeled data. In the following, we will +first introduce the six existing labeling systems, namely the Elsevier, +Aurora, Auckland, SIRIS, SDGO, and SDSN systems, before discussing how +these systems are combined within the ensemble approach. See table +1 for overview of these labeling systems. We +address custom-made labeling systems in a dedicated section below.

    +

    Individual labeling systems

    +

    The most prominent SDG labeling system has been developed by Elsevier. +The Elsevier labeling system was integrated into the Times Higher +Education Impact Rankings in 2019, which at the time compared 1,118 +universities in their efforts to address the SDGs as measured by the +frequency of SDG-related terms in their academic output. The Elsevier +queries consist of a list of expert-vetted keywords that are combined +using logical AND operators, implying that multiple keywords must be met +to label a document as containing a certain SDG. The development of the +queries started with an original list of keywords for each SDG that were +iteratively fine tuned to maximize the number of identified papers +closely reflecting the different SDGs. This involved cropping or +combining keywords to reduce the number of irrelevant hits. A detailed +report on the initial development of the Elsevier query system is +provided by Jayabalasingham et al. (2019). Since the first version, +the Elsevier labeling system has been iteratively improved, with the +latest versions including additional information specific to academic +publications and the Scopus database, such as identifiers of journal +names or research areas. +text2sdg implements +the latest version without such additional identifiers to broaden the +package’s applicability beyond the Scopus database +(Jayabalasingham et al. 2019).

    +

    The Aurora Universities Network’s "Societal Impact and Relevance of +Research" working group started to develop a labeling system in 2017 to +increase the visibility of research into the SDGs. Aurora’s queries were +developed with the goal of identifying SDG-related academic publications +included in the Scopus database. Consequently, the syntax of Aurora +queries is similar to the Scopus query language and the Elsevier system. +However, in contrast to the Elsevier system, the queries combine +keywords in a more complex fashion, recruiting Boolean (AND, OR) and +proximity operators (e.g., w/3, implying within 3 words). As a result, +Aurora’s keywords are more specific, possibly leading to a smaller +number of false positives. The initial version of the Aurora system only +included terms that appear in the SDG policy text of the targets and +indicators defined by the United Nations. Subsequent versions expanded +on this by including additional keywords that reflect academic +terminology. text2sdg +implements version 5.0 of the Aurora labeling system +(Vanderfeesten et al. 2020a). This version represents an +improvement on previous versions based on a survey study +(Vanderfeesten et al. 2020b) and modifications inspired in +other efforts, namely those from Elsevier (above) and SIRIS (introduced +below).

    +

    The Auckland labeling system (Wang et al. 2023) was developed by the +University of Auckland to better understand how their research output +contributes to the SDGs. To construct the queries, they used text-mining +techniques to extract global and local SDG keywords from publication +metadata. These keywords were then sorted according to the number of +publications that include the terms and according to the keywords’ term +frequency–inverse document frequency. The top-ranked keywords were then +manually reviewed to only retain keywords that are relevant. The +selected keywords were then combined with those of SDSN and Elsevier as +well as UN SDG Indicators to form the final SDG keyword list. These +queries formed the basis for the Auckland queries, which make use of +Boolean (AND, OR) operators and wildcards (e.g., "*").

    +

    The SIRIS labeling system (Duran-Silva et al. 2019) was +created by SIRIS Academic as part of the +"science4sdgs" project to +better understand how science, innovation efforts, and technology +related to the SDGs. The SIRIS queries were constructed in a five-step +procedure. First, an initial list of keywords was extracted from the +United Nations official list of goals, targets and indicators. Second, +the list was manually enriched on a basis of a review of SDG relevant +literature. Third, a word2vec model that was trained on a text corpus +created from the enriched keyword list was used to identify keywords +that were semantically related to the initial list. Fourth, using the +DBpedia API, keywords were added that, according to the Wikipedia +corpus, had a categorical relationship with the initial list. Fifth, and +finally, the keyword list was manually revised. The queries of the SIRIS +labeling system primarily consist of individual keywords that +occasionally are combined with a logical AND. +text2sdg implements +the only currently available version of the SIRIS labeling system +(Duran-Silva et al. 2019) .

    +

    The Open Source SDG (OSDG) project combines data from multiple sources +to detect SDGs in text. Instead of developing yet another query system, +OSDG’s aim was to re-use and integrate existing knowledge by combining +multiple SDG "ontologies" (i.e., query systems). OSDG has also made +use of Microsoft Academic Graph to improve their results but because our +query-based system cannot implement this procedure, we adopt the simpler +ontology initially proposed by OSDG, which we refer to as "SDGO" in +the package. The labeling system was based on central keywords in the +SDG United Nations description (e.g."sanitation" was classified into +"SDG6") and then manually expanded with additional relevant keywords +identified from a corpus of already labeled documents. The resulting +keyword list only makes use of the OR operator. +text2sdg implements +the only currently available version of these queries (Bautista 2019).

    +

    Finally, the Sustainable Development Solutions Network (SDSN, Sustainable Development Solutions Network (SDSN) 2021) +labeling system contains SDG-specific keywords compiled in a +collaborative effort by several universities from the Sustainable +Development Solutions Network (SDSN) Australia, New Zealand & Pacific +Network. This query system was developed to detect SDGs in large sets of +university-related text data, such as course listings or research +publications. The authors used United Nations documents, Google +searches, and personal communications as sources for the keywords. This +query system combines keywords with OR operators and does not make use +of AND operators.

    +

    All in all, as can be seen in Table 1, the +latter systems differ from the former four in the complexity of their +queries: the Elsevier, Aurora, Auckland, and SIRIS systems make use of +keyword-combination queries and other criteria, such as proximity +operators, whereas SDGO and SDSN only make use of keywords.

    +
    + + +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 1: Overview of the labeling systems implemented in +text2sdg. Legend: +OR—keywords are combined using logical ORs, implying that only the +keywords must be matched to assign an SDG label; AND—keywords are +combined using logical ANDs, implying that multiple keywords must be +matched to assign an SDG label; wildcards—keywords are matched +considering different keyword parts; proximity search—keywords must +co-occur within a certain word window to assign an SDG label.
    Labeling systemSDGs coveredQuery operatorsUnique keywords per SDG (mean & SD)Example query (SDG-01)
    ElsevierSDG 1 - SDG 16OR, AND, wildcards(21.7)"extreme poverty"
    AuroraSDG 1 - SDG 17OR, AND, wildcards, proximity search(31.6)("poverty") W/3 ("chronic*" OR "extreme")
    AucklandSDG 1 - SDG 16OR, AND, wildcards(46.5)"poverty eradication"
    SIRISSDG 1 - SDG 16OR, AND\(148\)("anti-poverty") AND ("poverty" OR "vulnerability")
    SDGOSDG 1 - SDG 17OR\(236\)"absolute poverty"
    SDSNSDG 1 - SDG 17OR(16.8)"End poverty"
    +
    +

    The ensemble labeling system

    +

    In another publication (Wulff et al. 2023), we evaluated the accuracy of +the six labeling systems implemented by +text2sdg and a rival +approach (i.e., OSDG Pukelis et al. 2020) using expert-labeled data sets. +These analyses lead to three critical observations. First, the accuracy +of SDG classifications was reasonable for all systems, but varied +considerably as a function of the data set. This is because the systems +differ in how liberal or conservative they assign SDGs to texts due to +differences in the types of query operators they employ. Specifically, +employing only OR-operators, SDGO and SDSN were considerably more +liberal, whereas the other four systems employing additional operators +were more conservative. In other words, the systems implement different +trade-offs between sensitivity (i.e., true-positive rate) and +specificity (i.e., true-negative rate). As a result, SDGO and SDSN +outperformed the other systems for SDG-rich documents and vice versa. In +addition to these differences in accuracy, we observed critical biases +in SDG profiles, with the systems overemphasizing different sets of +SDGs, and strong dependencies between SDG predictions and document +length. To address these limitations, we developed an ensemble model +approach that uses the the predictions of the six systems and document +length as inputs to a random forest model. After training with +expert-labeled and synthetic data, the ensemble model showed better +out-of-sample accuracy, lower false alarm rates, and smaller biases than +any individual labeling system (Wulff et al. 2023). As a result, this +ensemble model is also made available through +text2sdg using a +dedicated function.

    +

    In the following sections, we provide an overview over the +text2sdg R package +and demonstrate how its functions can be used to run to detect and +analyze SDGs in text.

    +

    3 The text2sdg package

    +

    Motivation for text2sdg

    +

    Despite the effort put into developing various labeling systems and +their great promise in addressing the SDG-related data scarcity, extant +implementations of these approaches are not without shortcomings. First, +the labeling systems were mostly developed to be used within academic +citation databases (e.g., Scopus) and are not easily applied to other +text sources. Second, existing implementations lack transparent ways to +communicate which features are matched to which documents or how they +compare between a choice of labeling systems. We alleviate these +shortcomings by providing an open-source solution, +text2sdg, that lets +users detect SDGs in any kind of text using any of the above-mentioned +systems, and ensemble of systems, or even customized, user-made labeling +systems. The package provides a common framework for implementing the +different extant or novel approaches and makes it easy to quantitatively +compare and visualize their results.

    +

    Overview of text2sdg package

    +

    At the heart of the +text2sdg package are +the Lucene-style queries that are used to detect SDGs in text and the +ensemble models that build on these queries. The queries map text +features (i.e., words or a combination of words) to SDGs. For example, a +text that contains the words "fisheries" and "marine" would be +mapped to SDG 14 (i.e., conserve and sustainably use the oceans, seas +and marine resources for sustainable development) by the Aurora system. +To enable the use of such queries in R, the +text2sdg package +recruits the +corpustools +package (Welbers and van Atteveldt 2021). +corpustools has +been built to implement complex search queries and execute them +efficiently for large amounts of text. Based on this, +text2sdg provides +several functions that implement extant labeling systems, facilitate the +specification of new labeling systems, and analyze and visualize search +results. Table 2 gives an overview of the +text2sdg core +functions.

    +

    The main functions of +text2sdg are +detect_sdg and detect_sdg_systems, which implement the ensemble +model approach (Wulff et al. 2023) and the implemented labeling systems, +respectively, to identify SDGs in texts. The texts are provided to these +functions via the text argument as either a character vector or an +object of class "tCorpus" from +corpustools. All +other arguments are optional. By default, the detect_sdg_systems +function runs only the Aurora, Auckland, Elsevier, and SIRIS systems, +but the set of systems can be extended to all six systems using the +systems argument. The functions further allow customization of the set +of SDGs using the sdgs argument and return a tibble with one row per +hit that has the following columns (and types) (italic column names only +present in the tibble returned by detect_sdg_systems):

    +
      +
    • document (factor) - index of element in the character vector or +corpus supplied to text

    • +
    • sdg (character) - labels indicating the matched SDGs

    • +
    • system (character) - the query or ensemble system that produced the +match

    • +
    • query_id (integer) - identifier of query in the query system

    • +
    • features (character) - words in the document that were matched by +the query

    • +
    • hit (numeric) - running index of matches for each system

    • +
    +

    Further details on the detect_sdg and detect_sdg_systems functions +and their output will be presented in the next section.

    +

    The detect_any function implements the same functionality as +detect_sdg_systems, but permits the user to specify customized or +self-defined queries. These queries are specified via the queries +argument and must follow the syntax of the +corpustools +package (see Practical Considerations section for more details).

    +

    To support the interpretation of SDG labels generated by detect_sdg, +detect_sdg_systems and detect_any, +text2sdg further +provides the plot_sdg and crosstab_sdg functions. The plot_sdg +function visualizes the distribution of SDG labels identified in +documents by means of a customizable barplot showing SDG frequencies for +the different labeling systems. The crosstab_sdg function helps reveal +patterns of label co-occurrences either across SDGs or systems, which +can be controlled using the compare argument.

    +
    + + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 2: Overview of package functions
    Function NameDescription
    detect_sdgidentifies SDGs in text using an ensemble model that draws on the six labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN).
    detect_sdg_systemsidentifies SDGs in text by using labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN).
    detect_anysimilar to detect_sdg but identifies SDGs in text using user-defined queries.
    crosstab_sdgcrosstab_sdg takes the output of detect_sdg, detect_sdg_systems, or detect_any as input and determines correlations between either query systems or SDGs.
    plot_sdgtakes the output of detect_sdg, detect_sdg_systems, or detect_any as input and produces adjustable barplots illustrating the hit frequencies produced by the different query systems.
    +
    +

    4 Demonstrating the functionality of text2sdg

    +

    To showcase the functionalities of the +text2sdg package we +analyze the publicly available p3 dataset of the Swiss National Science +Foundation (SNSF) that lists research projects funded by the SNSF. In +addition to demonstrating +text2sdg, the case +study will permit us to discuss practical issues concerning the labeling +of SDGs, including relevant differences between labeling systems. The +data to reproduce the analyses presented below can be found at +https://doi.org/10.5281/zenodo.11060662 (Meier 2024).

    +

    Preparing the SNSF projects data

    +

    The SNSF projects data was downloaded from +https://data.snf.ch/datasets. As of March 2022, the p3 database +included information on 81,237 research projects. From the data, we +removed 54,288 projects where the abstract was absent or not written in +English. This left us with a total of 26,949 projects. To ready this +data for analysis, we read it using the +readr package (Wickham et al. 2021), +producing a tibble named projects. A reduced version of this +tibble is included in the +text2sdg package and +available through the projects object after +text2sdg has been +loaded.

    +

    Using detect_sdg and detect_sdg_systems to detect SDGs

    +

    To label the abstracts in projects using detect_sdg, we only have to +supply the character vector that includes the abstracts to the text +argument of the detect_sdg function. In addition, the example below +makes use of the synthetic argument to implement the "equal" +(default) and "triple" versions of the ensemble model. As a result, two +versions of the ensemble model are run that were trained on an equal +amount of synthetic (non-SDG related) and expert-labeled data and on three +times as much synthetic as labeled data, respectively. A larger +amount of synthetic data in training lowers the false-positive rate, but +also compromises accuracy (cf. Wulff et al. 2023 for more details).

    +
    # detect SDGs
    +> sdgs_ensemble <- detect_sdg(text = projects,
    ++                             synthetic = c("equal","triple"))
    +Running systems
    +Obtaining text lengths
    +Building features
    +Running ensemble
    +
    +> head(sdgs_ensemble)
    +# A tibble: 6 × 4
    +  document sdg    system            hit
    +  <fct>    <chr>  <chr>           <int>
    +1 22       SDG-06 Ensemble equal   2539
    +2 39       SDG-03 Ensemble equal    498
    +3 39       SDG-07 Ensemble equal   2953
    +4 39       SDG-08 Ensemble equal   4080
    +5 41       SDG-13 Ensemble equal   5690
    +6 41       SDG-13 Ensemble triple  3684
    +
    +    
    +

    The first two columns of the tibble returned by detect_sdg show the +document and SDGs identified by the model. Further columns show the +system producing the hit and a running hit index for a given system. As +the predictions of the six individual labeling systems are used as input +for the ensemble models, they will be computed in the background. The +user can access these predictions by calling +attr(sdgs_ensemble, "system_hits"). Alternatively, the user can use +the detect_sdg_systems function, which provides additional options for +customization.

    +

    As with the detect_sdg function, the detect_sdg_systems function +requires a character vector as input to the text argument. In +addition, the example below specifies two optional arguments. First, to +indicate that all six systems should be run, rather than the default of +only Aurora, Auckland, Elsevier, and SIRIS, we supply a character vector +of all six systems’ names to the systems argument. Second, we +explicitly set the output argument to “features”, which in contrast +to output = “documents” delivers more detailed information about which +keywords triggered the SDG labels.

    +
    # detect SDGs
    +> sdgs <- detect_sdg_systems(text = projects,
    ++                            systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO"),
    ++                            output = "features")
    +Running Aurora
    +Running Elsevier
    +Running Auckland
    +Running SIRIS
    +Running SDSN
    +Running SDGO
    +    
    +> head(sdgs)
    +# A tibble: 6 × 6
    +  document sdg    system query_id features      hit
    +  <fct>    <chr>  <chr>     <dbl> <chr>       <int>
    +1 1        SDG-01 SDSN        392 sustainable     4
    +2 1        SDG-02 SDSN        376 maize           3
    +3 1        SDG-02 SDSN        629 sustainable     8
    +4 1        SDG-08 SDGO       3968 work            1
    +5 1        SDG-08 SDSN        812 work           11
    +6 1        SDG-09 SDSN        483 research        6
    +    
    +

    The above tibble produced by +text2sdg contains for +every combination of document, SDG, system, and query (columns 1 to 4), +the query feature (keyword) that triggered the label (column 5), and a +hit index for a given system (column 6). The first row of the tibble +thus shows that the query 392 within SDSN labeled document number 1 with +SDG-01, because the document included the feature sustainable, and +that this was the fourth hit produced by the SDSN system. It is +important to note that, in other cases, multiple features of a query +might be matched, which will result in multiple rows per combination of +document, SDG, system, and query. This can be avoided by setting the +output argument to “documents”, in which case all features’ hits of +such combinations will be grouped into a single row.

    +

    Analyzing the SDG labels

    +

    To visualize the distribution of SDG labels across SDGs and systems in +the sdgs tibble, we apply the plot_sdg function. By default, +plot_sdg shows a barplot of the number of documents labeled by each of +the SDGs, with the frequencies associated with the different systems +stacked on top of each other. The function counts a maximum of one hit +per document-system-SDG combination. Duplicate combinations resulting +from hits by multiple queries or keywords in queries will be suppressed +by default and the function returns a message reporting the number of +cases affected.

    +
    
    +> plot_sdg(sdgs)
    +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates.
    +
    +
    +graphic without alt text +

    +Figure 1: Default plot of distribution of detected SDGs. +

    +
    +
    +

    The plot produced by plot_sdg (Figure  1) +shows considerable differences in the frequency of different SDGs, with +SDGs 3 (“Good Health and Well-Being”) and 9 (“Industry, Innovation And +Infrastructure”) being most frequent and SDGs 5 (“Gender Equality”) and +14 (“Life Below Water”) being least frequent. Furthermore, there are +substantial differences in the number of labels produced by different +systems, with SDSN and SDGO having produced many more labels than the +other three systems.

    +

    To customize the visualization of SDG frequencies, the plot_sdg +function provides several additional arguments. For instance, by setting +sdg_titles to TRUE, the SDG titles will be added to the annotation +of the plot. Other arguments are normalize to show probabilities +instead of frequencies, color to change the filling of bars, and +remove_duplicates to eliminate duplicate document-system-SDG +combinations. Furthermore, as plot_sdg is built on +ggplot2 (Wickham 2016), +the function can easily be extended by functions from the +ggplot2 universe. To +illustrate these points, the code below generates a plot (Figure + 2) that includes SDG titles and +separates the results of the different SDG systems using facets.

    +
    > plot_sdg(sdgs, 
    ++          sdg_titles = TRUE) + 
    ++   ggplot2::facet_wrap(~system, ncol= 1, scales = "free_y")
    +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates.
    +
    +
    +graphic without alt text +

    +Figure 2: Distribution of detected SDGs facetted by system. +

    +
    +
    +

    The separation of systems better illustrates the results of systems that +produce fewer hits and helps compare the results across systems. This +reveals, for instance, that in the Elsevier system SDG 3 (“Good Health +and Well-Being”) was most prominent, whereas in the Aurora system this +was SDG 13 ("Climate Action”). These results highlight that the +different labeling systems do not necessarily agree concerning the +assignment of SDGs to documents.

    +

    To quantify the commonalities and differences between labeling systems, +text2sdg provides the +crosstab_sdg function. The function evaluates the level of alignment +across either systems (the default) or SDGs by calculating \(\phi\) +coefficients between the vectors of labels. We supply the hits +argument of the function with the sdgs tibble containing the labels +produced by detect_sdg_systems. Note that the function only considers distinct +combinations of documents, systems and SDGs, irrespective of whether the +detect_sdg_systems function was run using output = “documents” or +output = “features”.

    +
    
    +> crosstab_sdg(sdgs)
    +          Auckland    Aurora  Elsevier      SDGO      SDSN     SIRIS
    +Auckland 1.0000000 0.3345247 0.6676524 0.3314806 0.2896650 0.4115387
    +Aurora   0.3345247 1.0000000 0.3256877 0.1614586 0.1569791 0.3703457
    +Elsevier 0.6676524 0.3256877 1.0000000 0.2642918 0.2192051 0.3538272
    +SDGO     0.3314806 0.1614586 0.2642918 1.0000000 0.3722997 0.2244774
    +SDSN     0.2896650 0.1569791 0.2192051 0.3722997 1.0000000 0.2330684
    +SIRIS    0.4115387 0.3703457 0.3538272 0.2244774 0.2330684 1.0000000
    +

    The output of crosstab_sdg() for the SNSF projects reveals two +noteworthy insights. First, the correspondence between the labels of +different systems is rather small, as indicated by \(\phi\) coefficients +that are mostly smaller than 0.4. Second, there are two groups of +systems that are more similar to one another. On the one hand, Elsevier, +Auckland, Aurora, and SIRIS, and, on the other hand, SDGO and SDSN. +These groups correspond to differences in query operators, with the +former four including AND operators in their queries, whereas the latter +two do not. crosstab_sdg() can also be called with the output from the +ensemble models.

    +
    > crosstab_sdg(sdgs_ensemble)
    +                Ensemble equal Ensemble triple
    +Ensemble equal       1.0000000       0.8127837
    +Ensemble triple      0.8127837       1.0000000
    +

    It can further be informative to analyze the correlations between SDGs. +To do this, we set the compare argument in crosstab_sdg() to +"sdgs". The output below shows the result for the first six SDGs by +setting sdgs = 1:6. It can be seen that certain pairs of SDGs—in +particular, SDG-01 and SDG-02—co-occur more frequently. These results +may provide insights into the co-occurrence structure of SDGs in the +data at hand. However, these results can also highlight the importance +of considering similarities between queries targeting different SDGs.

    +
    
    +> crosstab_sdg(sdgs, compare = "sdgs", sdgs = 1:6)
    +           SDG-01     SDG-02     SDG-03     SDG-04     SDG-05     SDG-06
    +SDG-01 1.00000000 0.47455139 0.04811778 0.07928418 0.14252372 0.16622948
    +SDG-02 0.47455139 1.00000000 0.10611662 0.06751253 0.09338952 0.17504027
    +SDG-03 0.04811778 0.10611662 1.00000000 0.18092227 0.10936179 0.04882173
    +SDG-04 0.07928418 0.06751253 0.18092227 1.00000000 0.11791600 0.07887042
    +SDG-05 0.14252372 0.09338952 0.10936179 0.11791600 1.00000000 0.04603253
    +SDG-06 0.16622948 0.17504027 0.04882173 0.07887042 0.04603253 1.00000000
    +

    5 Practical considerations

    +

    Specifying user-defined labeling systems

    +

    The query systems implemented in +text2sdg represent +important efforts to systematize the monitoring of SDGs from text. +Nevertheless, these efforts are still relatively young and validations +of the systems are largely missing, creating a need for continued +development. text2sdg +supports the further development of new SDG labeling systems by +providing the detect_any function. In this section, we provide +additional detail on using this feature of +text2sdg.

    +

    The detect_any function also uses +corpustools as the +back-end. This implies that new queries must be specified to match the +syntax of +corpustools. The +syntax supports standard Boolean operators (AND, OR, and NOT), wildcard +operators, and proximity search. Boolean operators control how different +keywords are combined in a query. For instance, the query "marine OR +fisheries" matches text that contains either of these two words whereas +the query "marine AND fisheries" only matches text that contains both +words. corpustools also allows users to specify common query wildcard +operators 1. The wildcard operators \(?\) and \(*\) allow the +specification of variable word parts. For instance, the question mark +operator \(?\) matches one unknown character or no character at all, e.g., +"?ish" would match "fish", "dish", or "ish". The asterisk +operator \(*\), by contrast, matches any number of unknown characters, +e.g., "*ish" would match "fish" but also "Swedish". Both +wildcards can be used at the start, within, or at the end of a term. Proximity +search extends a Boolean AND, by requiring that two keywords occur no +more than a defined distance from one another. For instance, "climate +change"\(\sim\)3 specifies matches in which "climate" +and "change" both occur no more than three words apart. A complete +description of the +corpustools syntax +is presented in the +corpustools +vignette and documentation.

    +

    To supply a user-defined labeling system to detect_any, the queries +must be placed in a data.frame or tibble that additionally includes +a column specifying the labeling system’s name and a column of SDG +labels corresponding to the queries.

    +
      +
    • system (character) - name of the labeling systems.

    • +
    • queries (character) - user-defined queries.

    • +
    • sdg (integer) - SDGs labels assigned by queries.

    • +
    +

    The example below illustrates the application of a user-defined labeling +system using detect_any. First, a tibble is defined that includes +three rows, one for each of three different queries stored in the +query column. The system is called "my_example_system" in the +system column and each of the queries is assigned SDG-14 in the sdg +column. Note that specification of the labeling system need not be made +in R, but can easily be outsourced to a spreadsheet that is then +processed into a tibble. Second, the system is supplied to the +system argument of the detect_any function, along with the texts +(here, the SNSF abstracts). The output is analogous to the output of the +detect_sdg_systems function (for brevity, we only show the first three +lines of the output).

    +
    > # definition of query set
    +> my_example_system <- tibble::tibble(system = "my_example_system",
    ++                             query = c("marine AND fisheries", 
    ++                                        "('marine fisheries') AND sea", 
    ++                                        "?ish"),
    ++                             sdg = c(14,14,14))
    +> detect_any(text = projects, 
    ++            system = my_example_system)
    +# A tibble: 591 × 6
    +   document sdg    system            query_id features   hit
    +   <fct>    <chr>  <chr>                <dbl> <chr>    <int>
    + 1 6        SDG-14 my_example_system        3 wish       122
    + 2 134      SDG-14 my_example_system        3 wish        18
    + 3 241      SDG-14 my_example_system        3 fish        59
    +

    Applying text2sdg to non-English data

    +

    The queries of the labeling systems implemented by +text2sdg are in +English, implying that texts in other languages must first be translated +to English. We assessed feasibility and whether translation affects the +reliability of SDG labels by making use of back translation with one +language we are most familiar with (German). To this end, we first +translated 1,500 randomly selected SNSF project abstracts from English +to German and from German to English and then compared the labels of the +original English and back-translated English abstracts. We carried out +the translation using the DeepL translation engine +(www.deepl.com/translator).

    +

    Table 3 shows the results of this analysis. +Overall, the correlations as measured by the \(\phi\)-coefficient are very +high. The systems showed correlations above or equal to \(0.88\), with +Elsevier and Auckland showing the highest value of \(0.93\). Considering +that our analysis involves not only one, but two translation +steps—from German to English and back—these results suggest that +text2sdg can be +applied to non-English text, such as German, with very high accuracy. +One should note, however, that the quality of translation may vary +across languages and translation engines, so additional work is needed to +compare performance across different languages.

    +
    + + ++++++++ + + + + + + + + + + + + + + + + + + + + +
    Table 3: \(\phi\)-coefficient between the labels for the original +English text and the labels for the back-translated +(English-German-English) English text
    AuroraElsevierAucklandSIRISSDSNSDGO
    0.910.930.930.880.910.91
    +
    +

    Estimating the runtime of text2sdg

    +

    The analysis of text data can be computationally intense. To provide +some guidance on the expected runtime of +text2sdg for data +with different numbers of documents and different document lengths, we +carried out several experiments. For this purpose, we first simulated +documents by concatenating 10, 100, 1,000, or 10,000 words drawn +randomly according to word frequencies in Wikipedia and combined 1, 10, +100, or 1,000 thus-generated documents into simulated data sets. Then we +evaluated the runtime of +text2sdg separately +by system for the simulated data sets.

    +

    Figure 3 shows the average runtime in +seconds across 7,000 repetitions of each combination of document length +and number of documents for each of the labeling systems. The results +highlight three noteworthy points. First, runtime is primarily a function of +the number of words, irrespective of how words are distributed across +documents. Second, the runtime per word decreases as the number of +words increases, which is due to a constant overhead associated with +optimizing the labeling systems’ queries. Third, there are considerable +differences in the runtime between systems, which is, in part, due to +the functions’ overhead and, in part, due to differences in number and +complexity of queries. The fastest system is Elsevier, processing 10 +million words in roughly one minute; the slowest system is SIRIS, +processing 10 million words in about 40 minutes. Overall, these +experiments highlight that +text2sdg can +efficiently process large amounts of text, but also that some care +should be exercised when dealing with extremely large or many texts. In +such cases, it may be advisable to rely on more efficient labeling +systems, such as Elsevier or SDSN.

    +
    +
    +graphic without alt text +

    +Figure 3: Median runtime as a function of number of documents and document length using 6 different query systems. Each cell reflects the average runtime of 7,000 runs with numbers reflecting the median runtime in seconds and color reflecting the logarithm of the median runtime in seconds. +

    +
    +
    +

    6 Other approaches to detecting SDGs in text

    +

    There are a number of other approaches to detecting SDGs in text. First, +there are approaches outside the R ecosystem. One such tool is the +European Union’s SDG Mapper +(https://knowsdgs.jrc.ec.europa.eu/sdgmapper) that produces an +analysis of SDGs per document using an online interface in which +registered users can upload single documents. Another prominent example +is the OSDG tool developed by the SDG AI Lab of the United Nations in +collaboration with private partners. It can detect SDGs in text that is +provided through the OSDG website (https://osdg.ai/) or, if granted +access, through an API. The OSDG tool builds on the SDG Ontology (SDGO) +that is also implemented in +text2sdg. OSDG +additionally leverages a machine learning tool that was trained on +expert-labeled data to make the final predictions (Pukelis et al. 2022). One +advantage of OSDG relative to +text2sdg is that it +allows users to detect SDGs in 15 different languages. This is done by +translating the input text into English before passing it through the +OSDG workflow. While this is convenient to the user, the same outcome +can be achieved with our package by making use of translation models +through, for example, the +deeplr R package. As +our proof-of-concept above has shown, +text2sdg can be used +with non-English text (e.g., German) with very high accuracy by using +such an approach.

    +

    Second, there are currently, to our knowledge, two other R packages +aimed at providing methods for the automated detection of SDGs in text. +The SDGdetector +package is based on a custom query system that was generated by pooling +several existing query systems and manual adaptations. The resulting +labeling system permits finer-grained predictions on the level of SDG +targets 2. However, the method is computationally taxing and limited +to texts that are shorter than 750 characters or approximately 150 +words. The SDGmapR package builds on publicly available SDG keywords +that are assigned weights that indicate the degree to which a keyword +reflects a given SDG. The package computes SDG weights for each text by +adding up the weights of the keywords that were found in the text. The +larger this weight, the larger should be the likelihood that the text is +related to a specified SDG. The advantage of this approach is that it +permits customization of the decision boundary (i.e., the weight needed +to count a text as SDG related). However, the package does not give the +user a binary decision regarding whether a text relates to a given SDG. +Neither of the two packages offers an ensemble model that can be used to +categorize the presence of SDGs as is the case with +text2sdg.

    +

    7 Discussion

    +

    The text2sdg package +offers an open and easily accessible way of detecting SDGs in text using +individual query systems, a state-of-the-art ensemble model that +combines queries from extant systems (Wulff et al. 2023), as well as +custom-made queries.

    +

    While our package implements several query-based methods to detect SDGs +in text as well as a state-of-the-art ensemble model, the field of +detecting SDGs in text is rapidly evolving. Our aim is to continuously +update text2sdg as +new open source methods of detecting SDGs in text are released. Bundling +many systems in a coherent API is not only convenient for users, but +also helps catalyze development of new and hopefully more accurate +methods by making it easy to compare the performance of the different +systems. We deliberately incorporated functions that allow users to +implement and test their own query systems to facilitate this process. +We also encourage others to contribute to +text2sdg by adding +new systems or by expanding the existing functionalities to analyse the +output of the systems.

    +

    Indeed, although the systems implemented by +text2sdg have been +shown to achieve high accuracy (Wulff et al. 2023), it is important to +stress that these systems must be further developed to increase their +accuracy for a greater number of document types. Two approaches can help +in achieving this. First, unsupervised methods such as topic models +(Grün and Hornik 2011) or semantic network analysis (Siew et al. 2019) +can help in identifying novel linguistic patterns for the detection of +SDGs. One should note, however, that unsupervised methods are no +replacement for top-down, rule-based methods as implemented by +text2sdg, because of +the strong requirement to compare results across data sets, analyses, +and time, which require a clear set of benchmarks that are not simply +data-driven. Second, recent transformer based models +(Reimers and Gurevych 2019) could be leveraged to learn more complex +relationships between specific linguistic patterns and SDGs. However, +the field will have to work towards producing more balanced training +data before the full potential of these approaches can be exploited. +Moreover, one should note that transformer models are computationally +expensive and often limited to short text due to architecture +constraints (Ding et al. 2020). Whether such developments will emerge and +can be ultimately integrated into +text2sdg or will +represent alternative approaches remains an open question.

    +

    8 Conclusion

    +

    In this article, we introduced a new R package, +text2sdg, designed to +help identify SDGs from text. The package promises to help detect SDGs +in text sources using different existing or custom-made labeling systems +as well as a high-performance ensemble model that builds on these +labeling systems. Our case study and additional analyses suggest that +the approach can handle both sources in English as well as translations, +allows user-friendly use of novel queries, and provides reasonably +efficient performance for analysing large corpora.

    +
    +

    9 CRAN packages used

    +

    text2sdg, corpustools, readr, ggplot2, deeplr, SDGdetector

    +

    10 CRAN Task Views implied by cited packages

    +

    ChemPhys, Phylogenetics, Spatial, TeachingStatistics, WebTechnologies

    +

    11 Note

    +

    This article is converted from a Legacy LaTeX article using the +texor package. +The pdf version is the official version. To report a problem with the html, +refer to CONTRIBUTE on the R Journal homepage.

    +
    +
    +N. Bautista. SDG ontology. 2019. URL https://doi.org/10.6084/m9.figshare.11106113.v1. +
    +
    +M. Ding, C. Zhou, H. Yang and J. Tang. Cogltx: Applying bert to long texts. Advances in Neural Information Processing Systems, 33: 12792–12804, 2020. +
    +
    +N. Duran-Silva, E. Fuster, F. A. Massucci and A. Quinquillà. A controlled vocabulary defining the semantic perimeter of Sustainable Development Goals. 2019. URL https://doi.org/10.5281/zenodo.3567769. +
    +
    +B. Grün and K. Hornik. Topicmodels: An r package for fitting topic models. Journal of statistical software, 40: 1–30, 2011. DOI https://doi.org/10.18637/jss.v040.i13. +
    +
    +B. Jayabalasingham, R. Boverhof, K. Agnew and L. Klein. Identifying research supporting the united nations sustainable development goals. Mendeley Data, 1: 2019. URL https://doi.org/10.17632/87txkw7khs.1. +
    +
    +D. S. Meier. Descriptions of SNSF-funded research projects. 2024. URL https://doi.org/10.5281/zenodo.11060662. +
    +
    +L. Pukelis, N. Bautista-Puig, G. Statulevičiūtė, V. Stančiauskas, G. Dikmener and D. Akylbekova. OSDG 2.0: A multilingual tool for classifying text data by UN sustainable development goals (SDGs). 2022. URL https://arxiv.org/abs/2211.11252. +
    +
    +L. Pukelis, N. B. Puig, M. Skrynik and V. Stanciauskas. OSDG–open-source approach to classify text data by UN sustainable development goals (SDGs). arXiv preprint arXiv:2005.14569, 2020. DOI https://doi.org/10.48550/arXiv.2005.14569. +
    +
    +N. Reimers and I. Gurevych. Sentence-bert: Sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084, 2019. DOI https://doi.org/10.48550/arXiv.1908.10084. +
    +
    +C. S. Siew, D. U. Wulff, N. M. Beckage and Y. N. Kenett. Cognitive network science: A review of research on cognition through the lens of network representations, processes, and dynamics. Complexity, 2019: 2019. DOI https://doi.org/10.1155/2019/2108423. +
    +
    +Sustainable Development Solutions Network (SDSN). Compiled list of SDG keywords, 2021. URL https://ap-unsdsn.org/regional-initiatives/universities-sdgs/ [online; last accessed September 30, 2021]. +
    +
    +UN. The Sustainable Development Goals Report 2022. United Nations, 2022. +
    +
    +M. Vanderfeesten, R. Otten and E. Spielberg. Search Queries for "Mapping Research Output to the Sustainable Development Goals (SDGs)" v5.0. 2020a. URL https://doi.org/10.5281/zenodo.3817445. +
    +
    +M. Vanderfeesten, E. Spielberg and Y. Gunes. Survey data of "Mapping Research Output to the Sustainable Development Goals (SDGs)". 2020b. URL https://doi.org/10.5281/zenodo.3813230. +
    +
    +W. Wang, W. Kang and J. Mu. Mapping research to the sustainable development goals (SDGs). 2023. URL https://doi.org/10.21203/rs.3.rs-2544385/v1. +
    +
    +K. Welbers and W. van Atteveldt. corpustools: Managing, querying and analyzing tokenized text. 2021. URL https://CRAN.R-project.org/package=corpustools. R package version 0.4.8. +
    +
    +H. Wickham. ggplot2: Elegant graphics for data analysis. Springer-Verlag New York, 2016. URL https://ggplot2.tidyverse.org. +
    +
    +H. Wickham, J. Hester and J. Bryan. readr: Read rectangular text data. 2021. URL https://CRAN.R-project.org/package=readr. R package version 2.1.1. +
    +
    +D. U. Wulff, D. S. Meier and R. Mata. Using novel data and ensemble models to improve automated labeling of sustainable development goals. arXiv preprint arXiv:2301.11353, 2023. +
    +
    +
    +
    +
      +
    1. Note that the meaning of these wildcards differs from regex +wildcards.↩︎

    2. +
    3. Each SDG has several targets that are operationalized with +indicators (SDG/targets/indicators). For example the first target of +SDG 1 reads as follows: "By 2030, eradicate extreme poverty for all +people everywhere, currently measured as people living on less than +$1.25 a day".

      +
      +↩︎
    4. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Meier, et al., "text2sdg: An R Package to Monitor Sustainable Development Goals from Text", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-005,
    +  author = {Meier, Dominik S. and Mata, Rui and Wulff, Dirk U.},
    +  title = {text2sdg: An R Package to Monitor Sustainable Development Goals from Text},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-005},
    +  doi = {10.32614/RJ-2024-005},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {83-95}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-005/RJ-2024-005.pdf b/_articles/RJ-2024-005/RJ-2024-005.pdf new file mode 100644 index 0000000000..9d2527c759 Binary files /dev/null and b/_articles/RJ-2024-005/RJ-2024-005.pdf differ diff --git a/_articles/RJ-2024-005/RJournal.sty b/_articles/RJ-2024-005/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-005/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-005/RJwrapper.md b/_articles/RJ-2024-005/RJwrapper.md new file mode 100644 index 0000000000..d2fa0a2109 --- /dev/null +++ 
b/_articles/RJ-2024-005/RJwrapper.md @@ -0,0 +1,906 @@ +--- +abstract: | + Monitoring progress on the United Nations Sustainable Development + Goals (SDGs) is important for both academic and non-academic + organizations. Existing approaches to monitoring SDGs have focused on + specific data types; namely, publications listed in proprietary + research databases. We present the text2sdg package for the R + language, a user-friendly, open-source package that detects SDGs in + text data using different individual query systems, an ensemble of + query systems, or custom-made ones. The text2sdg package thereby + facilitates the monitoring of SDGs for a wide array of text sources + and provides a much-needed basis for validating and improving extant + methods to detect SDGs from text. +address: +- | + Dominik S. Meier\ + University of Basel\ + Steinengraben 22 4051 Basel\ + Switzerland\ + (ORCID: 0000-0002-3999-1388)\ + [dominik.meier@unibas.ch](dominik.meier@unibas.ch){.uri} +- | + Rui Mata\ + University of Basel\ + Missionsstrasse 60-62 4055 Basel\ + Switzerland\ + (ORCID: 0000-0002-1679-906X)\ + [rui.mata@unibas.ch](rui.mata@unibas.ch){.uri} +- | + Dirk U. Wulff\ + University of Basel\ + Missionsstrasse 60-62 4055 Basel\ + Switzerland\ + (ORCID: 0000-0002-4008-8022)\ + [dirk.wulff@unibas.ch](dirk.wulff@unibas.ch){.uri} +author: +- by Dominik S. Meier, Rui Mata, and Dirk U. Wulff +bibliography: +- text2sdg.bib +title: "**text2sdg**: An R Package to Monitor Sustainable Development + Goals from Text" +--- + +:::::: article +## Introduction + +The United Nations Sustainable Development Goals (SDGs) have become an +important guideline for both governmental and non-governmental +organizations to monitor and plan their contributions to social, +economic, and environmental transformations. The 17 SDGs cover large +areas of application, from ending poverty and improving health, to +fostering economic growth and preserving natural resources. 
As the +latest UN report [@SGD_report2022] attests, the availability of +high-quality data is still lacking in many of these areas and progress +is needed in identifying data sources that can help monitor work on +these goals. Monitoring of SDGs has typically been based on economic and +health data, which are often difficult and costly to gather (e.g., +; ). One attractive +alternative that has emerged from recent scientometric efforts is to +detect SDGs from text, such as academic publications. Digitized text +represents an attractive resource for monitoring SDGs across a large +number of domains because it is becoming widely available in various +types of documents, such as news articles, websites, corporate reports, +and social media posts. In light of this promise, we developed +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), a freely +available, open-source tool to enable the SDG-labeling of digitized text +and facilitate methodological development in this area. In what follows, +we first present some background on existing labeling systems developed +to identify SDGs from text, and then provide an overview of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package, +showcase its use in a representative case study, and discuss the promise +and limitations of the approach. + +## An overview of SDG labeling systems + +The [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package +provides a user-friendly way to use any existing or custom-made labeling +system developed to monitor the 17 SDGs in text sources. The package +implements six different labeling systems utilizing different keywords +and keyword combination rules, as well as an ensemble model based on the +six systems that was trained on labeled data. 
In the following, we will +first introduce the six existing labeling systems, namely the Elsevier, +Aurora, Auckland, SIRIS, SDGO, and SDSN systems, before discussing how +these systems are combined within the ensemble approach. See table +\@ref(tab:systems_overview) for overview of these labeling systems. We +address custom-made labeling systems in a dedicated section below. + +### Individual labeling systems + +The most prominent SDG labeling system has been developed by *Elsevier*. +The Elsevier labeling system was integrated into the Times Higher +Education Impact Rankings in 2019, which at the time compared 1,118 +universities in their efforts to address the SDGs as measured by the +frequency of SDG-related terms in their academic output. The Elsevier +queries consist of a list of expert-vetted keywords that are combined +using logical AND operators, implying that multiple keywords must be met +to label a document as containing a certain SDG. The development of the +queries started with an original list of keywords for each SDG that were +iteratively fine tuned to maximize the number of identified papers +closely reflecting the different SDGs. This involved cropping or +combining keywords to reduce the number of irrelevant hits. A detailed +report on the initial development of the Elsevier query system is +provided by @jayabalasingham2019identifying. Since the first version, +the Elsevier labeling system has been iteratively improved, with the +latest versions including additional information specific to academic +publications and the Scopus database, such as identifiers of journal +names or research areas. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the latest version without such additional identifiers to broaden the +package's applicability beyond the Scopus database +[@jayabalasingham2019identifying]. 
+ +The Aurora Universities Network's \"Societal Impact and Relevance of +Research\" working group started to develop a labeling system in 2017 to +increase the visibility of research into the SDGs. Aurora's queries were +developed with the goal of identifying SDG-related academic publications +included in the Scopus database. Consequently, the syntax of Aurora +queries is similar to the Scopus query language and the Elsevier system. +However, in contrast to the Elsevier system, the queries combine +keywords in a more complex fashion, recruiting Boolean (AND, OR) and +proximity operators (e.g., w/3, implying within 3 words). As a result, +Aurora's keywords are more specific, possibly leading to a smaller +number of false positives. The initial version of the Aurora system only +included terms that appear in the SDG policy text of the targets and +indicators defined by the United Nations. Subsequent versions expanded +on this by including additional keywords that reflect academic +terminology. [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) +implements version 5.0 of the Aurora labeling system +[@vanderfeesten_maurice_2020_3817445]. This version represents an +improvement on previous versions based on a survey study +[@vanderfeesten_maurice_2020_3813230] and modifications inspired in +other efforts, namely those from Elsevier (above) and SIRIS (introduced +below). + +The Auckland labeling system [@wang2023mapping] was developed by the +University of Auckland to better understand how their research output +contributes to the SDGs. To construct the queries, they used text-mining +techniques to extract global and local SDG keywords from publication +metadata. These keywords were then sorted according to the number of +publications that include the terms and according to the keywords' term +frequency--inverse document frequency. The top-ranked keywords were then +manually reviewed to only retain keywords that are relevant. 
The +selected keywords were then combined with those of SDSN and Elsevier as +well as UN SDG Indicators to form the final SDG keyword list. These +queries formed the basis for the Auckland queries, which make use of +Boolean (AND, OR) operators and wildcards (e.g., \"\*\"). + +The SIRIS labeling system [@duran_silva_nicolau_2019_3567769] was +created by SIRIS Academic as part of the +[\"science4sdgs\"](http://science4sdgs.sirisacademic.com/) project to +better understand how science, innovation efforts, and technology +related to the SDGs. The SIRIS queries were constructed in a five-step +procedure. First, an initial list of keywords was extracted from the +United Nations official list of goals, targets and indicators. Second, +the list was manually enriched on a basis of a review of SDG relevant +literature. Third, a word2vec model that was trained on a text corpus +created from the enriched keyword list was used to identify keywords +that were semantically related to the initial list. Fourth, using the +DBpedia API, keywords were added that, according to the Wikipedia +corpus, had a categorical relationship with the initial list. Fifth, and +finally, the keyword list was manually revised. The queries of the SIRIS +labeling system primarily consist of individual keywords that +occasionally are combined with a logical AND. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the only currently available version of the SIRIS labeling system +[@duran_silva_nicolau_2019_3567769] . + +The Open Source SDG (OSDG) project combines data from multiple sources +to detect SDGs in text. Instead of developing yet another query system, +OSDG's aim was to re-use and integrate existing knowledge by combining +multiple SDG \"ontologies\" (i.e., query systems). 
OSDG has also made +use of Microsoft Academic Graph to improve their results but because our +query-based system cannot implement this procedure, we adopt the simpler +ontology initially proposed by OSDG, which we refer to as \"SDGO\" in +the package. The labeling system was based on central keywords in the +SDG United Nations description (e.g.\"sanitation\" was classified into +\"SDG6\") and then manually expanded with additional relevant keywords +identified from a corpus of already labeled documents. The resulting +keyword list only makes use of the OR operator. +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) implements +the only currently available version of these queries [@Bautista2019]. + +Finally, the Sustainable Development Solutions Network [SDSN, @sdsn] +labeling system contains SDG-specific keywords compiled in a +collaborative effort by several universities from the Sustainable +Development Solutions Network (SDSN) Australia, New Zealand & Pacific +Network. This query system was developed to detect SDGs in large sets of +university-related text data, such as course listings or research +publications. The authors used United Nations documents, Google +searches, and personal communications as sources for the keywords. This +query system combines keywords with OR operators and does not make use +of AND operators. + +All in all, as can be seen in Table \@ref(tab:systems_overview), the +latter systems differ from the former four in the complexity of their +queries: the Elsevier, Aurora, Auckland, and SIRIS systems make use of +keyword-combination queries and other criteria, such as proximity +operators, whereas SDGO and SDSN only make use of keywords. 
+ +::: {#tab:systems_overview} + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Labeling system SDGs covered Query operators Unique keywords per SDG (mean & SD) Example query (SDG-01) + ----------------- ---------------- -------------------------------------- ------------------------------------- ----------------------------------------------------------- + Elsevier SDG 1 - SDG 16 OR, AND, wildcards (21.7) \"extreme poverty\" + + Aurora SDG 1 - SDG 17 OR, AND, wildcards, proximity search (31.6) (\"poverty\") W/3 (\"chronic\*\" OR \"extreme\") + + Auckland SDG 1 - SDG 16 OR, AND, wildcards (46.5) \"poverty eradication\" + + SIRIS SDG 1 - SDG 16 OR, AND \(148\) (\"anti-poverty\") AND (\"poverty\" OR \"vulnerability\") + + SDGO SDG 1 - SDG 17 OR \(236\) \"absolute poverty\" + + SDSN SDG 1 - SDG 17 OR (16.8) \"End poverty\" + --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 1: Overview of the labeling systems implemented in + [**text2sdg**](https://CRAN.R-project.org/package=text2sdg). Legend: + OR---keywords are combined using logical ORs, implying that only the + keywords must be matched to assign an SDG label; AND---keywords are + combined using logical ANDs, implying that multiple keywords must be + matched to assign an SDG label; wildcards---keywords are matched + considering different keyword parts; proximity search---keywords must + co-occur within a certain word window to assign an SDG label. +::: + +### The ensemble labeling system + +In another publication [@wulff2023using], we evaluated the accuracy of +the six labeling systems implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) and a rival +approach [i.e., OSDG @pukelis2020osdg] using expert-labeled data sets. 
+These analyses lead to three critical observations. First, the accuracy +of SDG classifications was reasonable for all systems, but varied +considerably as a function of the data set. This is because the systems +differ in how liberal or conservative they assign SDGs to texts due to +differences in the types of query operators they employ. Specifically, +employing only OR-operators, SDGO and SDSN were considerably more +liberal, whereas the other four systems employing additional operators +were more conservative. In other words, the systems implement different +trade-offs between sensitivity (i.e., true-positive rate) and +specificity (i.e., true-negative rate). As a result, SDGO and SDSN +outperformed the other systems for SDG-rich documents and vice versa. In +addition to these differences in accuracy, we observed critical biases +in SDG profiles, with the systems overemphasizing different sets of +SDGs, and strong dependencies between SDG predictions and document +length. To address these limitations, we developed an ensemble model +approach that uses the the predictions of the six systems and document +length as inputs to a random forest model. After training with +expert-labeled and synthetic data, the ensemble model showed better +out-of-sample accuracy, lower false alarm rates, and smaller biases than +any individual labeling system [@wulff2023using]. As a result, this +ensemble model is also made available through +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) using a +dedicated function. + +In the following sections, we provide an overview over the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) R package +and demonstrate how its functions can be used to run to detect and +analyze SDGs in text. 
+ +## The text2sdg package + +### Motivation for text2sdg + +Despite the effort put into developing various labeling systems and +their great promise in addressing the SDG-related data scarcity, extant +implementations of these approaches are not without shortcomings. First, +the labeling systems were mostly developed to be used within academic +citation databases (e.g., Scopus) and are not easily applied to other +text sources. Second, existing implementations lack transparent ways to +communicate which features are matched to which documents or how they +compare between a choice of labeling systems. We alleviate these +shortcomings by providing an open-source solution, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), that lets +users detect SDGs in any kind of text using any of the above-mentioned +systems, and ensemble of systems, or even customized, user-made labeling +systems. The package provides a common framework for implementing the +different extant or novel approaches and makes it easy to quantitatively +compare and visualize their results. + +### Overview of text2sdg package + +At the heart of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package are +the Lucene-style queries that are used to detect SDGs in text and the +ensemble models that build on these queries. The queries map text +features (i.e., words or a combination of words) to SDGs. For example, a +text that contains the words \"fisheries\" and \"marine\" would be +mapped to SDG 14 (i.e., conserve and sustainably use the oceans, seas +and marine resources for sustainable development) by the Aurora system. +To enable the use of such queries in R, the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package +recruits the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) +package [@corpustools]. 
+[**corpustools**](https://CRAN.R-project.org/package=corpustools) has
+been built to implement complex search queries and execute them
+efficiently for large amounts of text. Based on this,
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) provides
+several functions that implement extant labeling systems, facilitate the
+specification of new labeling systems, and analyze and visualize search
+results. Table \@ref(tab:functions_overview) gives an overview of the
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) core
+functions.
+
+The main functions of
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) are
+`detect_sdg` and `detect_sdg_systems`, which implement the ensemble
+model approach [@wulff2023using] and the implemented labeling systems,
+respectively, to identify SDGs in texts. The texts are provided to these
+functions via the `text` argument as either a character vector or an
+object of class `"tCorpus"` from
+[**corpustools**](https://CRAN.R-project.org/package=corpustools). All
+other arguments are optional. By default, the `detect_sdg_systems`
+function runs only the Aurora, Auckland, Elsevier, and SIRIS systems,
+but the set of systems can be extended to all six systems using the
+`systems` argument. 
The functions further allow customization of the set +of SDGs using the `sdgs` argument and return a `tibble` with one row per +hit that has the following columns (and types) (italic column names only +present in the tibble returned by `detect_sdg_systems`): + +- document (factor) - index of element in the character vector or + corpus supply for text + +- sdg (character) - labels indicating the matched SDGs + +- system (character) - the query or ensemble system that produced the + match + +- *query_id* (integer) - identifier of query in the query system + +- *features* (character) - words in the document that were matched by + the query + +- hit (numeric) - running index of matches for each system + +Further details on the `detect_sdg` and `detect_sdg_systems` functions +and their output will be presented in the next section. + +The `detect_any` function implements the same functionality as +`detect_sdg_systems`, but permits the user to specify customized or +self-defined queries. These queries are specified via the `queries` +argument and must follow the syntax of the +[**corpustools**](https://CRAN.R-project.org/package=corpustools) +package (see Practical Considerations section for more details). + +To support the interpretation of SDG labels generated by `detect_sdg`, +`detect_sdg_systems` and `detect_any`, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) further +provides the `plot_sdg` and `crosstab_sdg` functions. The `plot_sdg` +function visualizes the distribution of SDG labels identified in +documents by means of a customizable barplot showing SDG frequencies for +the different labeling systems. The `crosstab_sdg` function helps reveal +patterns of label co-occurrences either across SDGs or systems, which +can be controlled using the `compare` argument. 
+ +::: {#tab:functions_overview} + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Function Name Description + ---------------------- --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + `detect_sdg` identifies SDGs in text using an ensemble model that draws on the six labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). + + `detect_sdg_systems` identifies SDGs in text by using labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). + + detect_any similar to `detect_sdg` but identifies SDGs in text using user-defined queries. + + `crosstab_sdg` crosstab_sdg takes the output of detect_sdg, detect_sdg_systems, or detect_any as input and determines correlations between either query systems or SDGs. + + `plot_sdg` takes the output of detect_sdg, detect_sdg_systems, or detect_any as input and produces adjustable barplots illustrating the hit frequencies produced by the different query systems. + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + : Table 2: Overview of package functions +::: + +## Demonstrating the functionality of text2sdg + +To showcase the functionalities of the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package we +analyze the publicly available p3 dataset of the Swiss National Science +Foundation (SNSF) that lists research projects funded by the SNSF. 
In +addition to demonstrating +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), the case +study will permit us to discuss practical issues concerning the labeling +of SDGs, including relevant differences between labeling systems. The +data to reproduce the analyses presented below can be found at + [@meier_2024_11060662]. + +### Preparing the SNSF projects data + +The SNSF projects data was downloaded from +. As of March 2022, the p3 database +included information on 81,237 research projects. From the data, we +removed 54,288 projects where the abstract was absent or not written in +English. This left us with a total of 26,949 projects. To ready this +data for analysis, we read it using the `readr` function of the +[**readr**](https://CRAN.R-project.org/package=readr) package [@readr], +producing a `tibble` named `projects`. A reduced version of this +`tibble` is included in the +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package and +available through the `projects` object after +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) has been +loaded. + +### Using `detect_sdg` and `detect_sdg_systems` to detect SDGs + +To label the abstracts in `projects` using `detect_sdg`, we only have to +supply the character vector that includes the abstracts to the `text` +argument of the `detect_sdg` function. In addition the example below +makes use of the `synthetic` argument to implement the `"equal"` +(default) and `"triple"` version of the ensemble model. As a result, two +versions of the ensemble model are run that were trained on an equal +amount of synthetic (non-SDG related) and expert-labeled data and three +times the amount of synthetic than labeled data, respectively. A larger +amount of synthetic data in training lowers the false-positive rate, but +also compromises accuracy [cf. @wulff2023using for more details]. 
+
+``` r
+# detect SDGs
+> sdgs_ensemble <- detect_sdg(text = projects,
++ synthetic = c("equal","triple"))
+Running systems
+Obtaining text lengths
+Building features
+Running ensemble
+
+> head(sdgs_ensemble)
+# A tibble: 6 × 4
+ document sdg system hit
+
+1 22 SDG-06 Ensemble equal 2539
+2 39 SDG-03 Ensemble equal 498
+3 39 SDG-07 Ensemble equal 2953
+4 39 SDG-08 Ensemble equal 4080
+5 41 SDG-13 Ensemble equal 5690
+6 41 SDG-13 Ensemble triple 3684
+
+
+```
+
+The first two columns of the `tibble` returned by `detect_sdg` show the
+document and SDGs identified by the model. Further columns show the
+system producing the hit and a running hit index for a given system. As
+the predictions of the six individual labeling systems are used as input
+for the ensemble models, they will be computed in the background. The
+user can access these predictions by calling
+`attr(sdgs_ensemble, "system_hits")`. Alternatively, the user can use
+the `detect_sdg_systems` function, which provides additional options for
+customization.
+
+As with the `detect_sdg` function, the `detect_sdg_systems` function
+requires a character vector as input to the `text` argument. In
+addition, the example below specifies two optional arguments. First, to
+indicate that all six systems should be run, rather than the default of
+only Aurora, Auckland, Elsevier, and SIRIS, we supply a character vector
+of all six systems' names to the `systems` argument. Second, we
+explicitly set the `output` argument to `"features"`, which in contrast
+to `output = "documents"` delivers more detailed information about which
+keywords triggered the SDG labels. 
+ +``` r +# detect SDGs +> sdgs <- detect_sdg_systems(text = projects, ++ systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO"), ++ output = "features") +Running Aurora +Running Elsevier +Running Auckland +Running SIRIS +Running SDSN +Running SDGO + +> head(sdgs) +# A tibble: 6 × 6 + document sdg system query_id features hit + +1 1 SDG-01 SDSN 392 sustainable 4 +2 1 SDG-02 SDSN 376 maize 3 +3 1 SDG-02 SDSN 629 sustainable 8 +4 1 SDG-08 SDGO 3968 work 1 +5 1 SDG-08 SDSN 812 work 11 +6 1 SDG-09 SDSN 483 research 6 + +``` + +The above `tibble` produced by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) contains for +every combination of document, SDG, system, and query (columns 1 to 4), +the query feature (keyword) that triggered the label (column 5), and a +hit index for a given system (column 6). The first row of the `tibble` +thus shows that the query 392 within SDSN labeled document number 1 with +SDG-01, because the document included the feature *sustainable*, and +that this was the fourth hit produced by the SDSN system. It is +important to note that, in other cases, multiple features of a query +might be matched, which will result in multiple rows per combination of +document, SDG, system, and query. This can be avoided by setting the +`output` argument to `“documents”`, in which case all features' hits of +such combinations will be grouped into a single row. + +### Analyzing the SDG labels + +To visualize the distribution of SDG labels across SDGs and systems in +the `sdgs` `tibble`, we apply the `plot_sdg` function. By default, +`plot_sdg` shows a barplot of the number of documents labeled by each of +the SDGs, with the frequencies associated with the different systems +stacked on top of each other. The function counts a maximum of one hit +per document-system-SDG combination. 
Duplicate combinations resulting +from hits by multiple queries or keywords in queries will be suppressed +by default and the function returns a message reporting the number of +cases affected. + +``` r + +> plot_sdg(sdgs) +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +``` + +```{r figuredefault-plot, echo=FALSE , fig.cap="Default plot of distribution of detected SDGs.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("default_plot_revision.png")) +``` + +The plot produced by `plot_sdg` (Figure  \@ref(fig:figuredefault-plot)) +shows considerable differences in the frequency of different SDGs, with +SDGs 3 ("Good Health and Well-Being") and 9 ("Industry, Innovation And +Infrastructure") being most frequent and SDGs 5 ("Gender Equality") and +14 ("Life Below Water") being least frequent. Furthermore, there are +substantial differences in the number of labels produced by different +systems, with SDSN and SDGO having produced many more labels than the +other three systems. + +To customize the visualization of SDG frequencies, the `plot_sdg` +function provides several additional arguments. For instance, by setting +`sdg_titles` to `TRUE`, the SDG titles will be added to the annotation +of the plot. Other arguments are `normalize` to show probabilities +instead of frequencies, `color` to change the filling of bars, and +`remove_duplicates` to eliminate duplicate document-system-SDG +combinations. Furthermore, as `plot_sdg` is built on +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) [@ggplot2], +the function can easily be extended by functions from the +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) universe. To +illustrate these points, the code below generates a plot (Figure + \@ref(fig:figuredefault-plot-facetted)) that includes SDG titles and +separates the results of the different SDG systems using facets. 
+ +``` r +> plot_sdg(sdgs, ++ sdg_titles = TRUE) + ++ ggplot2::facet_wrap(~system, ncol= 1, scales = "free_y") +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +``` + +```{r figuredefault-plot-facetted, echo=FALSE , fig.cap="Distribution of detected SDGs facetted by system.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"} +knitr::include_graphics(c("default_plot_sdg_labels_revision.png")) +``` + +The separation of systems better illustrates the results of systems that +produce fewer hits and helps compare the results across systems. This +reveals, for instance, that in the Elsevier system SDG 3 ("Good Health +and Well-Being") was most prominent, whereas in the Aurora system this +was SDG 13 (\"Climate Action"). These results highlight that the +different labeling systems do not necessarily agree concerning the +assignment of SDGs to documents. + +To quantify the commonalities and differences between labeling systems, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) provides the +`crosstab_sdg` function. The function evaluates the level of alignment +across either systems (the default) or SDGs by calculating $\phi$ +coefficients between the vectors of labels. We supply the `hits` +argument of the function with the `sdgs` `tibble` containing the labels +produced by `detect_sdg`. Note that the function only considers distinct +combinations of documents, systems and SDGs, irrespective of whether the +`detect_sdg` function was run using `output = “documents”` or +`output = "features”`. 
+ +``` r + +> crosstab_sdg(sdgs) + Auckland Aurora Elsevier SDGO SDSN SIRIS +Auckland 1.0000000 0.3345247 0.6676524 0.3314806 0.2896650 0.4115387 +Aurora 0.3345247 1.0000000 0.3256877 0.1614586 0.1569791 0.3703457 +Elsevier 0.6676524 0.3256877 1.0000000 0.2642918 0.2192051 0.3538272 +SDGO 0.3314806 0.1614586 0.2642918 1.0000000 0.3722997 0.2244774 +SDSN 0.2896650 0.1569791 0.2192051 0.3722997 1.0000000 0.2330684 +SIRIS 0.4115387 0.3703457 0.3538272 0.2244774 0.2330684 1.0000000 +``` + +The output of `crosstab_sdg()` for the SNSF projects reveals two +noteworthy insights. First, the correspondence between the labels of +different systems is rather small, as indicated by $\phi$ coefficients +that are mostly smaller than 0.4. Second, there are two groups of +systems that are more similar to one another. On the one hand, Elsevier, +Auckland, Aurora, and SIRIS, and, on the other hand, SDGO and SDSN. +These groups correspond to differences in query operators, with the +former four including AND operators in their queries, whereas the latter +two do not. `crosstab_sdg()` can also be called with the output from the +ensemble models. + +``` r +> crosstab_sdg(sdgs_ensemble) + Ensemble equal Ensemble triple +Ensemble equal 1.0000000 0.8127837 +Ensemble triple 0.8127837 1.0000000 +``` + +It can further be informative to analyze the correlations between SDGs. +To do this, we set the `compare` argument in `crosstab_sdg()` to +`"sdgs"`. The output below shows the result for the first six SDGs by +setting `sdgs = 1:6`. It can be seen that certain pairs of SDGs---in +particular, SDG-01 and SDG-02---co-occur more frequently. These results +may provide insights into the co-occurrence structure of SDGs in the +data at hand. However, these results can also highlight the importance +of considering similarities between queries targeting different SDGs. 
+ +``` r + +> crosstab_sdg(sdgs, compare = "sdgs", sdgs = 1:6) + SDG-01 SDG-02 SDG-03 SDG-04 SDG-05 SDG-06 +SDG-01 1.00000000 0.47455139 0.04811778 0.07928418 0.14252372 0.16622948 +SDG-02 0.47455139 1.00000000 0.10611662 0.06751253 0.09338952 0.17504027 +SDG-03 0.04811778 0.10611662 1.00000000 0.18092227 0.10936179 0.04882173 +SDG-04 0.07928418 0.06751253 0.18092227 1.00000000 0.11791600 0.07887042 +SDG-05 0.14252372 0.09338952 0.10936179 0.11791600 1.00000000 0.04603253 +SDG-06 0.16622948 0.17504027 0.04882173 0.07887042 0.04603253 1.00000000 + +``` + +## Practical considerations + +### Specifying user-defined labeling systems + +The query systems implemented in +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) represent +important efforts to systematize the monitoring of SDGs from text. +Nevertheless, these efforts are still relatively young and validations +of the systems are largely missing, creating a need for continued +development. [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) +supports the further development of new SDG labeling systems by +providing the `detect_any` function. In this section, we provide +additional detail on using this feature of +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg). + +The `detect_any` function also uses +[**corpustools**](https://CRAN.R-project.org/package=corpustools) as the +back-end. This implies that new queries must be specified to match the +syntax of +[**corpustools**](https://CRAN.R-project.org/package=corpustools). The +syntax supports standard Boolean operators (AND, OR, and NOT), wildcard +operators, and proximity search. Boolean operators control how different +keywords are combined in a query. For instance, the query \"marine OR +fisheries\" matches text that contains either of these two words whereas +the query \"marine AND fisheries\" only matches text that contains both +words. Corpustools also allows to specify common query wildcard +operators [^1]. 
The wildcard operators $?$ and $*$ allow the
+specification of variable word parts. For instance, the question mark
+operator $?$ matches one unknown character or no character at all, e.g.,
+\"?ish\" would match \"fish\", \"dish\", or \"ish\". The asterisk
+operator $*$, by contrast, matches any number of unknown characters,
+e.g., \"\*ish\" would match \"fish\" but also \"Swedish\". Both
+wildcards can be used at the start, within, or at the end of a term.
+Proximity search extends a Boolean AND by requiring that two keywords
+occur no more than a defined distance from one another. For instance,
+\"climate change\"~3 specifies matches in which \"climate\"
+and \"change\" both occur no more than three words apart. A complete
+description of the
+[**corpustools**](https://CRAN.R-project.org/package=corpustools) syntax
+is presented in the
+[**corpustools**](https://CRAN.R-project.org/package=corpustools)
+vignette and documentation.
+
+To supply a user-defined labeling system to `detect_any`, the queries
+must be placed in a `data.frame` or `tibble` that additionally includes
+a column specifying the labeling system's name and a column of SDG
+labels corresponding to the queries.
+
+- system (character) - name of the labeling system.
+
+- queries (character) - user-defined queries.
+
+- sdg (integer) - SDG labels assigned by queries.
+
+The example below illustrates the application of a user-defined labeling
+system using `detect_any`. First, a `tibble` is defined that includes
+three rows, one for each of three different queries stored in the
+`query` column. The system is called `"my_example_system"` in the
+`system` column and each of the queries is assigned SDG-14 in the `sdg`
+column. Note that specification of the labeling system need not be made
+in R, but can easily be outsourced to a spreadsheet that is then
+processed into a `tibble`. 
Second, the system is supplied to the
+`system` argument of the `detect_any` function, along with the texts
+(here, the SNSF abstracts). The output is analogous to the output of the
+`detect_sdg_systems` function (for brevity, we only show the first three
+lines of the output).
+
+``` r
+> # definition of query set
+> my_example_system <- tibble::tibble(system = "my_example_system",
++ query = c("marine AND fisheries",
++ "('marine fisheries') AND sea",
++ "?ish"),
++ sdg = c(14,14,14))
+> detect_any(text = projects,
++ system = my_example_system)
+# A tibble: 591 × 6
+ document sdg system query_id features hit
+
+ 1 6 SDG-14 my_example_system 3 wish 122
+ 2 134 SDG-14 my_example_system 3 wish 18
+ 3 241 SDG-14 my_example_system 3 fish 59
+```
+
+### Applying text2sdg to non-English data
+
+The queries of the labeling systems implemented by
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) are in
+English, implying that texts in other languages must first be translated
+to English. We assessed feasibility and whether translation affects the
+reliability of SDG labels by making use of back translation with one
+language we are most familiar with (German). To this end, we first
+translated 1,500 randomly selected SNSF project abstracts from English
+to German and from German to English and then compared the labels of the
+original English and back-translated English abstracts. We carried out
+the translation using the DeepL translation engine
+([www.deepl.com/translator](https://www.deepl.com/translator)).
+
+Table \@ref(tab:my-table_corr) shows the results of this analysis.
+Overall, the correlations as measured by the $\phi$-coefficient are very
+high. The systems showed correlations above or equal to $0.88$, with
+Elsevier and Auckland showing the highest value of $0.93$. 
Considering
+that our analysis involves not only one, but two translation
+steps---from English to German and back---these results suggest that
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can be
+applied to non-English text, such as German, with very high accuracy.
+One should note, however, that the quality of translation may vary
+across languages and translation engines, so additional work is needed
+to compare performance across different languages.
+
+::: {#tab:my-table_corr}
+ ----------------------------------------------------
+ Aurora Elsevier Auckland SIRIS SDSN SDGO
+ -------- ---------- ---------- ------- ------ ------
+ 0.91 0.93 0.93 0.88 0.91 0.91
+
+ ----------------------------------------------------
+
+ : Table 3: $\phi$-coefficient between the labels for the original
+ English text and the labels for the back-translated
+ (English-German-English) English text
+:::
+
+### Estimating the runtime of text2sdg
+
+The analysis of text data can be computationally intense. To provide
+some guidance on the expected runtime of
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) for data
+with different numbers of documents and different document lengths, we
+carried out several experiments. For this purpose, we first simulated
+documents by concatenating 10, 100, 1,000, or 10,000 words drawn
+randomly according to word frequencies in Wikipedia and combined 1, 10,
+100, or 1,000 thus-generated documents into simulated data sets. Then we
+evaluated the runtime of
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) separately
+by system for the simulated data sets.
+
+Figure \@ref(fig:figurebenchmark-plot) shows the average runtime in
+seconds across 7,000 repetitions of each combination of document length
+and number of documents for each of the labeling systems. The results
+highlight three noteworthy points. 
First, runtime is primarily a function of
+the number of words, irrespective of how words are distributed across
+documents. Second, the runtime per word decreases as the number of
+words increases, which is due to a constant overhead associated with
+optimizing the labeling systems' queries. Third, there are considerable
+differences in the runtime between systems, which is, in part, due to
+the functions' overhead and, in part, due to differences in number and
+complexity of queries. The fastest system is Elsevier, processing 10
+million words in roughly one minute; the slowest system is SIRIS,
+processing 10 million words in about 40 minutes. Overall, these
+experiments highlight that
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can
+efficiently process large amounts of text, but also that some care
+should be exercised when dealing with extremely large or many texts. In
+such cases, it may be advisable to rely on more efficient labeling
+systems, such as Elsevier or SDSN.
+
+```{r figurebenchmark-plot, echo=FALSE , fig.cap="Median runtime as a function of number of documents and document length using 6 different query systems. Each cell reflects the average runtime of 7,000 runs with numbers reflecting the median runtime in seconds and color reflecting the logarithm of the median runtime in seconds.", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100%"}
+knitr::include_graphics(c("benchmark_revision_final.png"))
+```
+
+## Other approaches to detecting SDGs in text
+
+There are a number of other approaches to detecting SDGs in text. First,
+there are approaches outside the R ecosystem. One such tool is the
+European Union's SDG Mapper
+() that produces an
+analysis of SDGs per document using an online interface in which
+registered users can upload single documents. Another prominent example
+is the OSDG tool developed by the SDG AI Lab of the United Nations in
+collaboration with private partners. 
It can detect SDGs in text that is +provided through the OSDG website () or, if granted +access, through an API. The OSDG tool builds on the SDG Ontology (SDGO) +that is also implemented in +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg). OSDG +additionally leverages a machine learning tool that was trained on +expert-labeled data to make the final predictions [@OSDG2]. One +advantage of OSDG relative to +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) is that it +allows to detect SDGs in 15 different languages. This is done by using +translation of the input text into English before passing it through the +OSDG workflow. While this is convenient to the user, the same outcome +can be achieved with our package by making use of translation models +through, for example the +[**deeplr**](https://CRAN.R-project.org/package=deeplr) R package. As +our proof-of-concept above has shown, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) can be used +with non-English text (e.g., German) with very high accuracy by using +such an approach. + +Second, there are currently, to our knowledge, two other R packages +aimed at providing methods for the automated detection of SDGs in text. +The [**SDGdetector**](https://CRAN.R-project.org/package=SDGdetector) +package is based on a custom query system that was generated by pooling +several existing query systems and manual adaptions. The resulting +labeling system permits finer-grained predictions on the level of SDG +targets [^2]. However, the method is computationally taxing and limited +to texts that are shorter than 750 characters or approximately 150 +words. The **SDGmapR** package builds on publicly available SDG keywords +that are assigned weights that indicate the degree to which a keyword +reflects a given SDG. The package computes SDG weights for each text by +adding up the weights of the keywords that were found in the text. 
The
+larger this weight, the larger should be the likelihood that the text is
+related to a specified SDG. The advantage of this approach is that it
+permits customization of the decision boundary (i.e., the weight needed
+to count a text as SDG related). However, the package does not give the
+user a binary decision regarding whether a text relates to a given SDG.
+Neither of the two packages offers an ensemble model that can be used to
+categorize the presence of SDGs as is the case with
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg).
+
+## Discussion
+
+The [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) package
+offers an open and easily accessible way of detecting SDGs in text using
+individual query systems, a state-of-the-art ensemble model that
+combines queries from extant systems [@wulff2023using], and
+custom-made queries.
+
+While our package implements several query-based methods to detect SDGs
+in text as well as a state-of-the-art ensemble model, the field of
+detecting SDGs in text is rapidly evolving. Our aim is to continuously
+update [**text2sdg**](https://CRAN.R-project.org/package=text2sdg) as
+new open source methods of detecting SDGs in text are released. Bundling
+many systems in a coherent API is not only convenient for users, but
+also helps catalyze development of new and hopefully more accurate
+methods by making it easy to compare the performance of the different
+systems. We deliberately incorporated functions that allow users to
+implement and test their own query systems to facilitate this process.
+We also encourage others to contribute to
+[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) by adding
+new systems or by expanding the existing functionalities to analyse the
+output of the systems. 
+ +Indeed, although the systems implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) have been +shown to achieve high accuracy [@wulff2023using], it is important to +stress that these systems must be further developed to increase their +accuracy for a greater number of document types. Two approaches can help +in achieving this. First, unsupervised methods such as topic models +[@grun2011topicmodels] or semantic network analysis [@siew2019cognitive] +can help in identifying novel linguistic patterns for the detection of +SDGs. One should note, however, that unsupervised methods are no +replacement for top-down, rule-based methods as implemented by +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), because of +the strong requirement to compare results across data sets, analyses, +and time, which require a clear set of benchmarks that are not simply +data-driven. Second, recent transformer based models +[@reimers2019sentence] could be leveraged to learn more complex +relationships between specific linguistic patterns and SDGs. However, +the field will have to work towards producing more balanced training +data before the full potential of these approaches can be exploited. +Moreover, one should note that transformer models are computationally +expensive and often limited to short text due to architecture +constraints [@ding2020cogltx]. Whether such developments will emerge and +can be ultimately integrated into +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg) or will +represent alternative approaches remains an open question. + +## Conclusion + +In this article, we introduced a new R package, +[**text2sdg**](https://CRAN.R-project.org/package=text2sdg), designed to +help identify SDGs from text. The package promises to help detect SDGs +in text sources using different existing or custom-made labeling systems +as well as a high-performance ensemble model that builds on these +labeling systems. 
Our case study and additional analyses suggest that +the approach can handle both sources in English as well as translations, +allows user-friendly use of novel queries, and provides reasonably +efficient performance for analysing large corpora. +:::::: + +[^1]: Note that the meaning of these wildcards differs from regex + wildcards. + +[^2]: Each SDG has several targets that are operationalized with + indicators (SDG/targets/indicators). For example the first target of + SDG 1 reads as follows: \"By 2030, eradicate extreme poverty for all + people everywhere, currently measured as people living on less than + \$1.25 a day\". diff --git a/_articles/RJ-2024-005/RJwrapper.tex b/_articles/RJ-2024-005/RJwrapper.tex new file mode 100644 index 0000000000..4759e3de78 --- /dev/null +++ b/_articles/RJ-2024-005/RJwrapper.tex @@ -0,0 +1,28 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + +%% load any required packages FOLLOWING this line +\usepackage{tabularx} + +\begin{document} +\setlength\extrarowheight{2pt} % make the tables look less cramped + + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{83} + +%% replace RJtemplate with your article +\begin{article} + \input{text2sdg} +\end{article} + +\end{document} diff --git a/_articles/RJ-2024-005/benchmark_revision_final.pdf b/_articles/RJ-2024-005/benchmark_revision_final.pdf new file mode 100644 index 0000000000..29eceb9ae7 Binary files /dev/null and b/_articles/RJ-2024-005/benchmark_revision_final.pdf differ diff --git a/_articles/RJ-2024-005/benchmark_revision_final.png b/_articles/RJ-2024-005/benchmark_revision_final.png new file mode 100644 index 0000000000..c373dae955 Binary files /dev/null and b/_articles/RJ-2024-005/benchmark_revision_final.png differ diff --git 
a/_articles/RJ-2024-005/default_plot_revision.pdf b/_articles/RJ-2024-005/default_plot_revision.pdf new file mode 100644 index 0000000000..65e3fc0f9c Binary files /dev/null and b/_articles/RJ-2024-005/default_plot_revision.pdf differ diff --git a/_articles/RJ-2024-005/default_plot_revision.png b/_articles/RJ-2024-005/default_plot_revision.png new file mode 100644 index 0000000000..e54da452a1 Binary files /dev/null and b/_articles/RJ-2024-005/default_plot_revision.png differ diff --git a/_articles/RJ-2024-005/default_plot_sdg_labels_revision.pdf b/_articles/RJ-2024-005/default_plot_sdg_labels_revision.pdf new file mode 100644 index 0000000000..60a33da04b Binary files /dev/null and b/_articles/RJ-2024-005/default_plot_sdg_labels_revision.pdf differ diff --git a/_articles/RJ-2024-005/default_plot_sdg_labels_revision.png b/_articles/RJ-2024-005/default_plot_sdg_labels_revision.png new file mode 100644 index 0000000000..f0c65bd47f Binary files /dev/null and b/_articles/RJ-2024-005/default_plot_sdg_labels_revision.png differ diff --git a/_articles/RJ-2024-005/resubmission_response.txt b/_articles/RJ-2024-005/resubmission_response.txt new file mode 100644 index 0000000000..5cad45351c --- /dev/null +++ b/_articles/RJ-2024-005/resubmission_response.txt @@ -0,0 +1,29 @@ +Thanks a lot for the final comments and feedback. We addressed all comments as noted below. + +p2 - There shouldn't be a - after Boolean in "Boolean- (AND,OR)" +-> Thanks for catching that, we changed it + +p3 - "In the next sections," = "In the following sections," +-> We replaced it as noted + +p4 - Table 2 text is very small, can you make that text same size as others. - +-> Thanks for noting that, we increased the text size of table 2. Both tables now have the same font size and are consistent with the font size of the manuscript. + +p6,7 - For Figures 1 and 2, I don't see the reason for a gradient color scale, but it's also ok to leave it as is. 
It's visually pleasing, just gives the sense of an ordinary variable. +-> We would like to keep the color as it is, as it is the default in the package. + +p9,10 - Add leading zeroes to numbers starting with . in bottom of page 9 and in Table 3, to match up with 0.88 in bottom of p9 and also to make it easier for reading. +-> Thanks for catching that, we changed it. + +p10 - Figure 3 - I recommend making this a bit larger, you can do two rows of 3 facets, so the text is roughly the same size as the rest of the document. This will avoid overlap on numbers like in the SIRIS facet as well. +-> We increased the figure size as suggested by doing two rows of 3 facets. + +As your data is large and you are providing on an external website, you should put instructions for obtaining these files in the code file and make sure that paths etc match those of the downloaded files +-> We now provide the data in the following zenodo repository: https://zenodo.org/records/11060662. We adapted the script so that the data are directly read from zenodo. + +Also, I’m getting an error with the code on line 22, "expanded path length 1024 would be too long…” . +-> I’m getting the same message, although it is a warning and the code works as expected. I unfortunately was not able to resolve the warning. + +Then, the code to construct benchmark_table used in Figure 3 is not provided. +-> Thanks for catching that, the code is now included in the file. 
+ diff --git a/_articles/RJ-2024-005/text2sdg.R b/_articles/RJ-2024-005/text2sdg.R new file mode 100644 index 0000000000..27585b4a3a --- /dev/null +++ b/_articles/RJ-2024-005/text2sdg.R @@ -0,0 +1,302 @@ +library(tidyverse) +library(fastText) +library(text2sdg) +library(cowplot) + + +# Read and preprocess SNSF data ------------------------------------------- +projects <- read_csv2("https://zenodo.org/records/11060662/files/GrantWithAbstracts.csv?download=1") +nrow_before <- nrow(projects) +# remove documents with missing abstracts +projects <- projects %>% + filter(!is.na(Abstract)) + +#number of removed projects because of missing abstract +nrow_before - nrow(projects) + + +# use fasttext to identify language and then remove non-English abstracts +# lid.176.bin file can be downloaded here: https://fasttext.cc/docs/en/language-identification.html +file_bin = file.path('lid.176.bin') + +dtbl_res_in = fastText::language_identification(input_obj = projects$Abstract, +                                                pre_trained_language_model_path = file_bin, +                                                k = 1, +                                                th = 0.0, +                                                threads = 1, +                                                verbose = TRUE) +table(dtbl_res_in$iso_lang_1) + +projects[["detected_language"]] <- dtbl_res_in$iso_lang_1 + + +#how many dropped because of language? +projects %>% + filter(detected_language != "en") %>% + nrow(.)
+ +#keep only english +projects <- projects %>% + filter(detected_language == "en") + +nrow_before - nrow(projects) + +# create character vector +projects <- projects[["Abstract"]] + + + +# detect SDGs with the ensemble model +sdgs_ensemble <- detect_sdg(text = projects, + synthetic = c("equal","triple")) +head(sdgs_ensemble) + + +# Run detect_sdg with the individual query models --------------------------------------------- + +# detect SDGs +#shown +sdgs <- detect_sdg_systems(text = projects, + systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO"), + output = "features") + +#shown +head(sdgs) + +# plot +#shown +plot_sdg(sdgs) + +#ggsave("3_plots/default_plot_revision.pdf", width = 12/1.5, height = 6/1.5) + + +#shown +plot_sdg(sdgs, + sdg_titles = TRUE) + + ggplot2::facet_wrap(~system, ncol= 1, scales = "free_y") + +#ggsave("3_plots/default_plot_sdg_labels_revision.pdf", width = 8, height = 8) + + +# Correlation +#shown +crosstab_sdg(sdgs) + +#shown +crosstab_sdg(sdgs_ensemble) + +#shown +crosstab_sdg(sdgs, compare = "sdgs", sdgs = 1:6) + +# Query development ------------------------------------------------------- +#shown +# definition of query set +my_example_system <- tibble::tibble(system = "my_example_system", + query = c("marine AND fisheries", + "('marine fisheries') AND sea", + "?ish"), + sdg = c(14,14,14)) + +detect_any(text = projects, + system = my_example_system) + + +# Applying text2sdg to non-English data ----------------------------------- + +df_backtrans <- read_rds("https://zenodo.org/records/11060662/files/backtrans_table.RDS?download=1") + +orig_sdg <- detect_sdg_systems(df_backtrans$orig, systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO")) + +backtrans_sdg <- detect_sdg_systems(df_backtrans$backtrans, systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO")) + + +#### prepare data for correlation test +systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO") + +sdgs <- 
unique(orig_sdg$sdg) + +phi_dat_orig <- tidyr::expand_grid(document = 1:length(levels(orig_sdg$document)), + system = systems, sdg = sdgs) %>% dplyr::mutate(document = as.factor(document)) %>% + dplyr::left_join(orig_sdg %>% dplyr::mutate(hit = 1) %>% + dplyr::select(document, system, sdg, hit), by = c("document", + "system", "sdg")) %>% + dplyr::mutate(hit = dplyr::if_else(is.na(hit), 0, 1)) %>% + dplyr::distinct() %>% + dplyr::arrange(document, sdg) %>% + tidyr::pivot_wider(names_from = system, values_from = hit) + + + +phi_dat_backtrans <- tidyr::expand_grid(document = 1:length(levels(backtrans_sdg$document)), + system = systems, sdg = sdgs) %>% dplyr::mutate(document = as.factor(document)) %>% + dplyr::left_join(backtrans_sdg %>% dplyr::mutate(hit = 1) %>% + dplyr::select(document, system, sdg, hit), by = c("document", + "system", "sdg")) %>% + dplyr::mutate(hit = dplyr::if_else(is.na(hit), 0, 1)) %>% + dplyr::distinct() %>% + dplyr::arrange(document, sdg) %>% + tidyr::pivot_wider(names_from = system, values_from = hit) + + +library(psych) +tab_aurora <- table(phi_dat_orig$Aurora, phi_dat_backtrans$Aurora) +tab_elsevier <- table(phi_dat_orig$Elsevier, phi_dat_backtrans$Elsevier) +tab_auckland <- table(phi_dat_orig$Auckland, phi_dat_backtrans$Auckland) +tab_siris <- table(phi_dat_orig$SIRIS, phi_dat_backtrans$SIRIS) +tab_sdsn <- table(phi_dat_orig$SDSN, phi_dat_backtrans$SDSN) +tab_sdgo <- table(phi_dat_orig$SDGO, phi_dat_backtrans$SDGO) + +phi(tab_aurora, digits = 2) +phi(tab_elsevier, digits = 2) +phi(tab_auckland, digits = 2) +phi(tab_siris, digits = 2) +phi(tab_sdsn, digits = 2) +phi(tab_sdgo, digits = 2) + + +# Performance ------------------------------------------------------------- + +benchmark_table <- read_rds("https://zenodo.org/records/11060662/files/benchmark_table_revision.rds?download=1") + +#prep data +benchmark_table %>% + count(length, ndoc) + +plot_table <- benchmark_table %>% + pivot_longer(-c(length, ndoc, index)) + + +plot_table %>% + 
group_by(length, ndoc, name) %>% + summarise(median = median(value)) %>% + mutate(length = factor(length), + ndoc = factor(ndoc), + name = factor(name, levels = c("Aurora", "Elsevier","Auckland", "SIRIS", "OSDG", "SDSN"))) %>% + ggplot(aes(x = length, y = ndoc, fill = log(median))) + + geom_tile(color = "white") + + geom_text(aes(label = round(median, 1)), color = "white", size = 3) + + coord_fixed() + + scale_fill_gradient(low ="#A5D7D2", + high = "#46505A") + + facet_wrap(~name, nrow = 2) + + guides(fill = guide_colourbar(barwidth = 14, + barheight = .5, + title = "Median Runtime (log seconds)", + title.theme = element_text(size = 9.5), + title.position = "top", + title.hjust = .5)) + + labs(x = "Document length", y = "Number of Documents") + + theme( + panel.background = element_rect(fill = "white", + colour = "white", + size = 0.5, linetype = "solid"), + strip.background =element_rect(fill="white"), + legend.position = "top") + +# ggsave("3_plots/benchmark_revision_final.pdf", +# width = 10, +# height = 12) + + + + + +# Unique keywords per SDG for each system (Table 1) ----------------------- +elsevier_queries %>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + filter(query_words != "AND | OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + + +aurora_queries %>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + 
filter(query_words != "AND | OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + +auckland_queries %>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + filter(query_words != "AND | OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + + +siris_queries%>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + filter(query_words != "AND | OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + + +sdgo_queries %>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + filter(query_words != "AND | OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + +sdsn_queries %>% + select(sdg, query) %>% + mutate(query = str_replace_all(query, "OR | AND", "")) %>% + #get rid of all on-alphanumeric symbols + mutate(query_words = strsplit(gsub("[^[:alnum:] ]", "", query), " +")) %>% + group_by(sdg) %>% + unnest(query_words) %>% + mutate(query_words = str_to_lower(query_words)) %>% + distinct(sdg, query_words) %>% + #just to be sure + filter(query_words != "AND | 
OR") %>% + count(sdg) %>% + ungroup() %>% + summarise(mean = mean(n), sd = sd(n)) + + + + + + diff --git a/_articles/RJ-2024-005/text2sdg.bib b/_articles/RJ-2024-005/text2sdg.bib new file mode 100644 index 0000000000..5a0b70a609 --- /dev/null +++ b/_articles/RJ-2024-005/text2sdg.bib @@ -0,0 +1,203 @@ + + + + + +@article{Bautista2019, +author = {Bautista, N}, +title = {SDG Ontology}, +year = {2019}, +url= {https://doi.org/10.6084/m9.figshare.11106113.v1} +} + + + +@book{SGD_report2022, +author = {UN}, +title = {{The Sustainable Development Goals Report 2022}}, +publisher = {United Nations}, +year = {2022}, +} + + +@article{jayabalasingham2019identifying, + title={Identifying research supporting the United Nations sustainable development goals}, + author={Jayabalasingham, Bammini and Boverhof, Roy and Agnew, Kevin and Klein, Lisette}, + journal={Mendeley Data}, + volume={1}, + url = {https://doi.org/10.17632/87txkw7khs.1}, + year={2019} +} + + + +@software{vanderfeesten_maurice_2020_3817445, + author = {Vanderfeesten, Maurice and + Otten, René and + Spielberg, Eike}, + title = {{Search Queries for "Mapping Research Output to the + Sustainable Development Goals (SDGs)" v5.0}}, + month = jul, + year = 2020, + publisher = {Zenodo}, + version = {5.0}, + url = {https://doi.org/10.5281/zenodo.3817445} +} + + + +@dataset{duran_silva_nicolau_2019_3567769, + author = {Duran-Silva, Nicolau and + Fuster, Enric and + Massucci, Francesco Alessandro and + Quinquillà, Arnau}, + title = {{A controlled vocabulary defining the semantic + perimeter of Sustainable Development Goals}}, + month = dec, + year = 2019, + publisher = {Zenodo}, + version = {1.2}, + url = {https://doi.org/10.5281/zenodo.3567769} +} + +@online{sdsn, + author = {{Sustainable Development Solutions Network (SDSN)}}, + title = {Compiled list of SDG keywords}, + year = 2021, + url = {https://ap-unsdsn.org/regional-initiatives/universities-sdgs/}, + urldate = {2010-09-30} +} + + + 
+@dataset{vanderfeesten_maurice_2020_3813230, + author = {Vanderfeesten, Maurice and + Spielberg, Eike and + Gunes, Yassin}, + title = {{Survey data of "Mapping Research Output to the + Sustainable Development Goals (SDGs)"}}, + month = may, + year = 2020, + publisher = {Zenodo}, + version = {1.0.1}, + doi = {10.5281/zenodo.3813230}, + url = {https://doi.org/10.5281/zenodo.3813230} +} + + + +@Manual{corpustools, + title = {corpustools: Managing, Querying and Analyzing Tokenized Text}, + author = {Kasper Welbers and Wouter {van Atteveldt}}, + year = {2021}, + note = {R package version 0.4.8}, + url = {https://CRAN.R-project.org/package=corpustools}, +} + +@Manual{readr, + title = {readr: Read Rectangular Text Data}, + author = {Hadley Wickham and Jim Hester and Jennifer Bryan}, + year = {2021}, + note = {R package version 2.1.1}, + url = {https://CRAN.R-project.org/package=readr}, + } + + @Book{ggplot2, + author = {Hadley Wickham}, + title = {ggplot2: Elegant Graphics for Data Analysis}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-3-319-24277-4}, + url = {https://ggplot2.tidyverse.org}, + } + + + @article{wang2023mapping, + title={Mapping research to the Sustainable Development Goals (SDGs)}, + author={Wang, Weiwei and Kang, Weihao and Mu, Jingwen}, + year={2023}, + url = {https://doi.org/10.21203/rs.3.rs-2544385/v1} +} + +@article{wulff2023using, + title={Using novel data and ensemble models to improve automated labeling of Sustainable Development Goals}, + author={Wulff, Dirk U and Meier, Dominik S and Mata, Rui}, + journal={arXiv preprint arXiv:2301.11353}, + year={2023} +} + +@article{grun2011topicmodels, + title={topicmodels: An R package for fitting topic models}, + author={Gr{\"u}n, Bettina and Hornik, Kurt}, + journal={Journal of statistical software}, + volume={40}, + pages={1--30}, + doi={https://doi.org/10.18637/jss.v040.i13}, + year={2011} +} + +@article{reimers2019sentence, + title={Sentence-bert: Sentence embeddings using 
siamese bert-networks}, + author={Reimers, Nils and Gurevych, Iryna}, + journal={arXiv preprint arXiv:1908.10084}, + year={2019}, + doi={https://doi.org/10.48550/arXiv.1908.10084} +} + +@article{ding2020cogltx, + title={Cogltx: Applying bert to long texts}, + author={Ding, Ming and Zhou, Chang and Yang, Hongxia and Tang, Jie}, + journal={Advances in Neural Information Processing Systems}, + volume={33}, + pages={12792--12804}, + year={2020} +} + +@misc{OSDG2, + doi = {10.48550/ARXIV.2211.11252}, + + url = {https://arxiv.org/abs/2211.11252}, + + author = {Pukelis, Lukas and Bautista-Puig, Nuria and Statulevičiūtė, Gustė and Stančiauskas, Vilius and Dikmener, Gokhan and Akylbekova, Dina}, + + keywords = {Digital Libraries (cs.DL), FOS: Computer and information sciences, FOS: Computer and information sciences}, + + title = {OSDG 2.0: a multilingual tool for classifying text data by UN Sustainable Development Goals (SDGs)}, + + publisher = {arXiv}, + + year = {2022}, + + copyright = {Creative Commons Attribution 4.0 International} +} + + +@article{siew2019cognitive, + title={Cognitive network science: A review of research on cognition through the lens of network representations, processes, and dynamics}, + author={Siew, Cynthia SQ and Wulff, Dirk U and Beckage, Nicole M and Kenett, Yoed N}, + journal={Complexity}, + volume={2019}, + year={2019}, + doi={https://doi.org/10.1155/2019/2108423}, + publisher={Hindawi} +} + + +@article{pukelis2020osdg, + title={OSDG--Open-Source Approach to Classify Text Data by UN Sustainable Development Goals (SDGs)}, + author={Pukelis, Lukas and Puig, N{\'u}ria Bautista and Skrynik, Mykola and Stanciauskas, Vilius}, + journal={arXiv preprint arXiv:2005.14569}, + year={2020}, + doi={https://doi.org/10.48550/arXiv.2005.14569} +} + + +@dataset{meier_2024_11060662, + author = {Meier, Dominik S.}, + title = {Descriptions of SNSF-funded research projects}, + month = apr, + year = 2024, + publisher = {Zenodo}, + doi = {10.5281/zenodo.11060662}, + 
url = {https://doi.org/10.5281/zenodo.11060662} +} \ No newline at end of file diff --git a/_articles/RJ-2024-005/text2sdg.tex b/_articles/RJ-2024-005/text2sdg.tex new file mode 100644 index 0000000000..f0f6175828 --- /dev/null +++ b/_articles/RJ-2024-005/text2sdg.tex @@ -0,0 +1,398 @@ +% !TeX root = RJwrapper.tex + + +\title{text2sdg: An R Package to Monitor Sustainable Development Goals from Text} +\author{by Dominik S. Meier, Rui Mata, and Dirk U. Wulff} + +\maketitle + +\abstract{ +Monitoring progress on the United Nations Sustainable Development Goals (SDGs) is important for both academic and non-academic organizations. Existing approaches to monitoring SDGs have focused on specific data types; namely, publications listed in proprietary research databases. We present the text2sdg package for the R language, a user-friendly, open-source package that detects SDGs in text data using different individual query systems, an ensemble of query systems, or custom-made ones. The text2sdg package thereby facilitates the monitoring of SDGs for a wide array of text sources and provides a much-needed basis for validating and improving extant methods to detect SDGs from text. +} + +\section{Introduction} + +The United Nations Sustainable Development Goals (SDGs) have become an important guideline for both governmental and non-governmental organizations to monitor and plan their contributions to social, economic, and environmental transformations. The 17 SDGs cover large areas of application, from ending poverty and improving health, to fostering economic growth and preserving natural resources. As the latest UN report \cite[]{SGD_report2022} attests, the availability of high-quality data is still lacking in many of these areas and progress is needed in identifying data sources that can help monitor work on these goals. 
Monitoring of SDGs has typically been based on economic and health data, which are often difficult and costly to gather (e.g., \url{https://sdg-tracker.org/}; \url{https://www.sdgindex.org/}). One attractive alternative that has emerged from recent scientometric efforts is to detect SDGs from text, such as academic publications. Digitized text represents an attractive resource for monitoring SDGs across a large number of domains because it is becoming widely available in various types of documents, such as news articles, websites, corporate reports, and social media posts. In light of this promise, we developed \CRANpkg{text2sdg}, a freely available, open-source tool to enable the SDG-labeling of digitized text and facilitate methodological development in this area. In what follows, we first present some background on existing labeling systems developed to identify SDGs from text, and then provide an overview of the \CRANpkg{text2sdg} package, showcase its use in a representative case study, and discuss the promise and limitations of the approach. + +\section{An overview of SDG labeling systems} + + The \CRANpkg{text2sdg} package provides a user-friendly way to use any existing or custom-made labeling system developed to monitor the 17 SDGs in text sources. The package implements six different labeling systems utilizing different keywords and keyword combination rules, as well as an ensemble model based on the six systems that was trained on labeled data. In the following, we will first introduce the six existing labeling systems, namely the Elsevier, Aurora, Auckland, SIRIS, SDGO, and SDSN systems, before discussing how these systems are combined within the ensemble approach. See table \ref{tab:systems_overview} for overview of these labeling systems. We address custom-made labeling systems in a dedicated section below. + + \subsection{Individual labeling systems} + + The most prominent SDG labeling system has been developed by \textit{Elsevier}. 
The Elsevier labeling system was integrated into the Times Higher Education Impact Rankings in 2019, which at the time compared 1,118 universities in their efforts to address the SDGs as measured by the frequency of SDG-related terms in their academic output. The Elsevier queries consist of a list of expert-vetted keywords that are combined using logical AND operators, implying that multiple keywords must be met to label a document as containing a certain SDG. The development of the queries started with an original list of keywords for each SDG that were iteratively fine tuned to maximize the number of identified papers closely reflecting the different SDGs. This involved cropping or combining keywords to reduce the number of irrelevant hits. A detailed report on the initial development of the Elsevier query system is provided by \citet[]{jayabalasingham2019identifying}. Since the first version, the Elsevier labeling system has been iteratively improved, with the latest versions including additional information specific to academic publications and the Scopus database, such as identifiers of journal names or research areas. \CRANpkg{text2sdg} implements the latest version without such additional identifiers to broaden the package's applicability beyond the Scopus database \citep[]{jayabalasingham2019identifying}. + + The Aurora Universities Network's "Societal Impact and Relevance of Research" working group started to develop a labeling system in 2017 to increase the visibility of research into the SDGs. Aurora's queries were developed with the goal of identifying SDG-related academic publications included in the Scopus database. Consequently, the syntax of Aurora queries is similar to the Scopus query language and the Elsevier system. However, in contrast to the Elsevier system, the queries combine keywords in a more complex fashion, recruiting Boolean (AND, OR) and proximity operators (e.g., w/3, implying within 3 words). 
As a result, Aurora's keywords are more specific, possibly leading to a smaller number of false positives. The initial version of the Aurora system only included terms that appear in the SDG policy text of the targets and indicators defined by the United Nations. Subsequent versions expanded on this by including additional keywords that reflect academic terminology. \CRANpkg{text2sdg} implements version 5.0 of the Aurora labeling system \citep{vanderfeesten_maurice_2020_3817445}. This version represents an improvement on previous versions based on a survey study \citep{vanderfeesten_maurice_2020_3813230} and modifications inspired in other efforts, namely those from Elsevier (above) and SIRIS (introduced below). + + The Auckland labeling system \citep{wang2023mapping} was developed by the University of Auckland to better understand how their research output contributes to the SDGs. To construct the queries, they used text-mining techniques to extract global and local SDG keywords from publication metadata. These keywords were then sorted according to the number of publications that include the terms and according to the keywords' term frequency–inverse document frequency. The top-ranked keywords were then manually reviewed to only retain keywords that are relevant. The selected keywords were then combined with those of SDSN and Elsevier as well as UN SDG Indicators to form the final SDG keyword list. These queries formed the basis for the Auckland queries, which make use of Boolean (AND, OR) operators and wildcards (e.g., "*"). + + The SIRIS labeling system \cite[]{duran_silva_nicolau_2019_3567769} was created by SIRIS Academic as part of the \href{http://science4sdgs.sirisacademic.com/}{"science4sdgs"} project to better understand how science, innovation efforts, and technology related to the SDGs. The SIRIS queries were constructed in a five-step procedure. 
First, an initial list of keywords was extracted from the United Nations official list of goals, targets and indicators. Second, the list was manually enriched on a basis of a review of SDG relevant literature. Third, a word2vec model that was trained on a text corpus created from the enriched keyword list was used to identify keywords that were semantically related to the initial list. Fourth, using the DBpedia API, keywords were added that, according to the Wikipedia corpus, had a categorical relationship with the initial list. Fifth, and finally, the keyword list was manually revised. The queries of the SIRIS labeling system primarily consist of individual keywords that occasionally are combined with a logical AND. \CRANpkg{text2sdg} implements the only currently available version of the SIRIS labeling system \cite[]{duran_silva_nicolau_2019_3567769} . + +The Open Source SDG (OSDG) project combines data from multiple sources to detect SDGs in text. Instead of developing yet another query system, OSDG's aim was to re-use and integrate existing knowledge by combining multiple SDG "ontologies" (i.e., query systems). OSDG has also made use of Microsoft Academic Graph to improve their results but because our query-based system cannot implement this procedure, we adopt the simpler ontology initially proposed by OSDG, which we refer to as "SDGO" in the package. The labeling system was based on central keywords in the SDG United Nations description (e.g."sanitation" was classified into "SDG6") and then manually expanded with additional relevant keywords identified from a corpus of already labeled documents. The resulting keyword list only makes use of the OR operator. \CRANpkg{text2sdg} implements the only currently available version of these queries \cite[]{Bautista2019}. 
+ + Finally, the Sustainable Development Solutions Network \cite[SDSN,][]{sdsn} labeling system contains SDG-specific keywords compiled in a collaborative effort by several universities from the Sustainable Development Solutions Network (SDSN) Australia, New Zealand \& Pacific Network. This query system was developed to detect SDGs in large sets of university-related text data, such as course listings or research publications. The authors used United Nations documents, Google searches, and personal communications as sources for the keywords. This query system combines keywords with OR operators and does not make use of AND operators. + + All in all, as can be seen in Table \ref{tab:systems_overview}, the latter systems differ from the former four in the complexity of their queries: the Elsevier, Aurora, Auckland, and SIRIS systems make use of keyword-combination queries and other criteria, such as proximity operators, whereas SDGO and SDSN only make use of keywords. + +\begin{table}[] +\footnotesize +\begin{tabularx}{\linewidth}{@{} >{\hsize=0.08\hsize}X >{\hsize=0.15\hsize}X >{\hsize=0.2\hsize}X >{\hsize=0.2\hsize}X >{\hsize=0.3\hsize}X @{}} +\toprule +Labeling system & + SDGs covered & + Query operators & + + Unique keywords per SDG (mean \& SD) & + Example query (SDG-01) \\ + + \midrule +Elsevier & + SDG 1 - SDG 16 & + OR, AND, wildcards & + 74.9 (21.7) & + "extreme poverty" + \\ +Aurora & + SDG 1 - SDG 17 & + OR, AND, wildcards, proximity search& + 89.6 (31.6) & + ("poverty") W/3 ("chronic*" OR "extreme") + \\ + Auckland & + SDG 1 - SDG 16 & + OR, AND, wildcards& + 183 (46.5) & + "poverty eradication" + \\ +SIRIS & + SDG 1 - SDG 16 & + OR, AND & + 262 (148) & + ("anti-poverty") AND ("poverty" OR "vulnerability") + \\ +SDGO & + SDG 1 - SDG 17 & + OR & + 245 (236) & + "absolute poverty" + \\ +SDSN & + SDG 1 - SDG 17 & + OR & + 62.6 (16.8) & + "End poverty" + \\ \bottomrule +\end{tabularx}% +\caption{Overview of the labeling systems implemented in 
\CRANpkg{text2sdg}. Legend: OR---keywords are combined using logical ORs, implying that only the keywords must be matched to assign an SDG label; AND---keywords are combined using logical ANDs, implying that multiple keywords must be matched to assign an SDG label; wildcards---keywords are matched considering different keyword parts; proximity search---keywords must co-occur within a certain word window to assign an SDG label.} +\label{tab:systems_overview} +\end{table} + + + + \subsection{The ensemble labeling system} + + In another publication \citep{wulff2023using}, we evaluated the accuracy of the six labeling systems implemented by \CRANpkg{text2sdg} and a rival approach \citep[i.e., OSDG][]{pukelis2020osdg} using expert-labeled data sets. These analyses led to three critical observations. First, the accuracy of SDG classifications was reasonable for all systems, but varied considerably as a function of the data set. This is because the systems differ in how liberal or conservative they assign SDGs to texts due to differences in the types of query operators they employ. Specifically, employing only OR-operators, SDGO and SDSN were considerably more liberal, whereas the other four systems employing additional operators were more conservative. In other words, the systems implement different trade-offs between sensitivity (i.e., true-positive rate) and specificity (i.e., true-negative rate). As a result, SDGO and SDSN outperformed the other systems for SDG-rich documents and vice versa. In addition to these differences in accuracy, we observed critical biases in SDG profiles, with the systems overemphasizing different sets of SDGs, and strong dependencies between SDG predictions and document length. To address these limitations, we developed an ensemble model approach that uses the predictions of the six systems and document length as inputs to a random forest model. 
After training with expert-labeled and synthetic data, the ensemble model showed better out-of-sample accuracy, lower false alarm rates, and smaller biases than any individual labeling system \cite{wulff2023using}. As a result, this ensemble model is also made available through \CRANpkg{text2sdg} using a dedicated function. + + In the following sections, we provide an overview of the \CRANpkg{text2sdg} R package and demonstrate how its functions can be used to detect and analyze SDGs in text. + +\section{The text2sdg package} + +\subsection{Motivation for text2sdg} + +Despite the effort put into developing various labeling systems and their great promise in addressing the SDG-related data scarcity, extant implementations of these approaches are not without shortcomings. First, the labeling systems were mostly developed to be used within academic citation databases (e.g., Scopus) and are not easily applied to other text sources. Second, existing implementations lack transparent ways to communicate which features are matched to which documents or how they compare between a choice of labeling systems. We alleviate these shortcomings by providing an open-source solution, \CRANpkg{text2sdg}, that lets users detect SDGs in any kind of text using any of the above-mentioned systems, an ensemble of systems, or even customized, user-made labeling systems. The package provides a common framework for implementing the different extant or novel approaches and makes it easy to quantitatively compare and visualize their results. + +\subsection{Overview of text2sdg package} + +At the heart of the \CRANpkg{text2sdg} package are the Lucene-style queries that are used to detect SDGs in text and the ensemble models that build on these queries. The queries map text features (i.e., words or a combination of words) to SDGs. 
For example, a text that contains the words "fisheries" and "marine" would be mapped to SDG 14 (i.e., conserve and sustainably use the oceans, seas and marine resources for sustainable development) by the Aurora system. To enable the use of such queries in R, the \CRANpkg{text2sdg} package recruits the \CRANpkg{corpustools} package \citep{corpustools}. \CRANpkg{corpustools} has been built to implement complex search queries and execute them efficiently for large amounts of text. Based on this, \CRANpkg{text2sdg} provides several functions that implement extant labeling systems, facilitate the specification of new labeling systems, and analyze and visualize search results. Table \ref{tab:functions_overview} gives an overview of the \CRANpkg{text2sdg} core functions. + +The main functions of \CRANpkg{text2sdg} are \code{detect\_sdg} and \code{detect\_sdg\_systems}, which implement the ensemble model approach \citep{wulff2023using} and the implemented labeling systems, respectively, to identify SDGs in texts. The texts are provided to these functions via the \code{text} argument as either a character vector or an object of class \code{"tCorpus"} from \CRANpkg{corpustools}. All other arguments are optional. By default, the \code{detect\_sdg\_systems} function runs only the Aurora, Auckland, Elsevier, and SIRIS systems, but the set of systems can be extended to all six systems using the \code{systems} argument. 
The functions further allow customization of the set of SDGs using the \code{sdgs} argument and return a \code{tibble} with one row per hit that has the following columns (and types) (italic column names only present in the tibble returned by \code{detect\_sdg\_systems}): + +\begin{itemize} + \item document (factor) - index of the element in the character vector or corpus supplied to \code{text} + \item sdg (character) - labels indicating the matched SDGs + \item system (character) - the query or ensemble system that produced the match + \item \textit{query\_id} (integer) - identifier of query in the query system + \item \textit{features} (character) - words in the document that were matched by the query + \item hit (numeric) - running index of matches for each system +\end{itemize} + +Further details on the \code{detect\_sdg} and \code{detect\_sdg\_systems} functions and their output will be presented in the next section. + +The \code{detect\_any} function implements the same functionality as \code{detect\_sdg\_systems}, but permits the user to specify customized or self-defined queries. These queries are specified via the \code{queries} argument and must follow the syntax of the \CRANpkg{corpustools} package (see Practical Considerations section for more details). + +To support the interpretation of SDG labels generated by \code{detect\_sdg}, \code{detect\_sdg\_systems} and \code{detect\_any}, \CRANpkg{text2sdg} further provides the \code{plot\_sdg} and \code{crosstab\_sdg} functions. The \code{plot\_sdg} function visualizes the distribution of SDG labels identified in documents by means of a customizable barplot showing SDG frequencies for the different labeling systems. The \code{crosstab\_sdg} function helps reveal patterns of label co-occurrences either across SDGs or systems, which can be controlled using the \code{compare} argument. 
+ + +\begin{table}[] +\footnotesize +\begin{tabularx}{\linewidth}{@{} >{\hsize=0.2\hsize}X >{\hsize=0.8\hsize}X @{}} % Adjusting column width proportions +\toprule +Function Name & Description \\ \midrule +\code{detect\_sdg} & identifies SDGs in text using an ensemble model that draws on the six labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). \\ +\code{detect\_sdg\_systems} & identifies SDGs in text by using labeling systems (Elsevier, Aurora, Auckland, SIRIS, SDGO, SDSN). \\ + +\code{detect\_any} & similar to \code{detect\_sdg} but identifies SDGs in text using user-defined queries. \\ +\code{crosstab\_sdg} & takes the output of \code{detect\_sdg}, \code{detect\_sdg\_systems}, or \code{detect\_any} as input and determines correlations between either query systems or SDGs. \\ +\code{plot\_sdg} & takes the output of \code{detect\_sdg}, \code{detect\_sdg\_systems}, or \code{detect\_any} as input and produces adjustable barplots illustrating the hit frequencies produced by the different query systems. \\ \bottomrule +\end{tabularx} +\caption{Overview of package functions} +\label{tab:functions_overview} +\end{table} + +\section{Demonstrating the functionality of text2sdg} + +To showcase the functionalities of the \CRANpkg{text2sdg} package, we analyze the publicly available p3 dataset of the Swiss National Science Foundation (SNSF) that lists research projects funded by the SNSF. In addition to demonstrating \CRANpkg{text2sdg}, the case study will permit us to discuss practical issues concerning the labeling of SDGs, including relevant differences between labeling systems. The data to reproduce the analyses presented below can be found at \url{https://doi.org/10.5281/zenodo.11060662} \citep{meier_2024_11060662}. + +\subsection{Preparing the SNSF projects data} + +The SNSF projects data was downloaded from \url{https://data.snf.ch/datasets}. As of March 2022, the p3 database included information on 81,237 research projects. 
From the data, we removed 54,288 projects where the abstract was absent or not written in English. This left us with a total of 26,949 projects. To ready this data for analysis, we read it using the \code{readr} function of the \CRANpkg{readr} package \citep{readr}, producing a \code{tibble} named \texttt{projects}. A reduced version of this \code{tibble} is included in the \CRANpkg{text2sdg} package and available through the \code{projects} object after \CRANpkg{text2sdg} has been loaded. + +\subsection{Using \code{detect\_sdg} and \code{detect\_sdg\_systems} to detect SDGs} + +To label the abstracts in \code{projects} using \code{detect\_sdg}, we only have to supply the character vector that includes the abstracts to the \code{text} argument of the \code{detect\_sdg} function. In addition the example below makes use of the \code{synthetic} argument to implement the \code{"equal"} (default) and \code{"triple"} version of the ensemble model. As a result, two versions of the ensemble model are run that were trained on an equal amount of synthetic (non-SDG related) and expert-labeled data and three times the amount of synthetic than labeled data, respectively. A larger amount of synthetic data in training lowers the false-positive rate, but also compromises accuracy \cite[cf.][for more details]{wulff2023using}. + +\begin{example} +# detect SDGs +> sdgs_ensemble <- detect_sdg(text = projects, ++ synthetic = c("equal","triple")) +Running systems +Obtaining text lengths +Building features +Running ensemble + +> head(sdgs_ensemble) +# A tibble: 6 × 4 + document sdg system hit + +1 22 SDG-06 Ensemble equal 2539 +2 39 SDG-03 Ensemble equal 498 +3 39 SDG-07 Ensemble equal 2953 +4 39 SDG-08 Ensemble equal 4080 +5 41 SDG-13 Ensemble equal 5690 +6 41 SDG-13 Ensemble triple 3684 + + +\end{example} + +The first two columns of the \code{tibble} returned by \code{detect\_sdg} show the document and SDGs identified by the model. 
Further columns show the system producing the hit and a running hit index for a given system. As the predictions of the six individual labeling systems are used as input for the ensemble models, they will be computed in the background. The user can access these predictions by calling \code{attr(sdgs\_ensemble, "system\_hits")}. Alternatively, the user can use the \code{detect\_sdg\_systems} function, which provides additional options for customization. + +As with the \code{detect\_sdg} function, the \code{detect\_sdg\_systems} function requires a character vector as input to the \code{text} argument. In addition, the example below specifies two optional arguments. First, to indicate that all six systems should be run, rather than the default of only Aurora, Auckland, Elsevier, and SIRIS, we supply a character vector of all six systems’ names to the \code{systems} argument. Second, we explicitly set the \code{output} argument to \texttt{“features”}, which in contrast to \code{output = “documents”} delivers more detailed information about which keywords that triggered the SDG labels. + +\begin{example} +# detect SDGs +> sdgs <- detect_sdg_systems(text = projects, ++ systems = c("Aurora", "Elsevier", "Auckland", "SIRIS", "SDSN", "SDGO"), ++ output = "features") +Running Aurora +Running Elsevier +Running Auckland +Running SIRIS +Running SDSN +Running SDGO + +> head(sdgs) +# A tibble: 6 × 6 + document sdg system query_id features hit + +1 1 SDG-01 SDSN 392 sustainable 4 +2 1 SDG-02 SDSN 376 maize 3 +3 1 SDG-02 SDSN 629 sustainable 8 +4 1 SDG-08 SDGO 3968 work 1 +5 1 SDG-08 SDSN 812 work 11 +6 1 SDG-09 SDSN 483 research 6 + +\end{example} + +The above \code{tibble} produced by \CRANpkg{text2sdg} contains for every combination of document, SDG, system, and query (columns 1 to 4), the query feature (keyword) that triggered the label (column 5), and a hit index for a given system (column 6). 
The first row of the \code{tibble} thus shows that the query 392 within SDSN labeled document number 1 with SDG-01, because the document included the feature \textit{sustainable}, and that this was the fourth hit produced by the SDSN system. It is important to note that, in other cases, multiple features of a query might be matched, which will result in multiple rows per combination of document, SDG, system, and query. This can be avoided by setting the \code{output} argument to \texttt{“documents”}, in which case all features' hits of such combinations will be grouped into a single row. + +\subsection{Analyzing the SDG labels} + +To visualize the distribution of SDG labels across SDGs and systems in the \texttt{sdgs} \code{tibble}, we apply the \code{plot\_sdg} function. By default, \code{plot\_sdg} shows a barplot of the number of documents labeled by each of the SDGs, with the frequencies associated with the different systems stacked on top of each other. The function counts a maximum of one hit per document-system-SDG combination. Duplicate combinations resulting from hits by multiple queries or keywords in queries will be suppressed by default and the function returns a message reporting the number of cases affected. + +\begin{example} + +> plot_sdg(sdgs) +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +\end{example} + +\begin{figure}[htbp] + \centering + \includegraphics[width=1\linewidth]{default_plot_revision.pdf} + \caption{Default plot of distribution of detected SDGs.} + \label{figure:default_plot} +\end{figure} + +The plot produced by \code{plot\_sdg} (Figure ~\ref{figure:default_plot}) shows considerable differences in the frequency of different SDGs, with SDGs 3 (“Good Health and Well-Being”) and 9 (“Industry, Innovation And Infrastructure”) being most frequent and SDGs 5 (“Gender Equality”) and 14 (“Life Below Water”) being least frequent. 
Furthermore, there are substantial differences in the number of labels produced by different systems, with SDSN and SDGO having produced many more labels than the other three systems. + +To customize the visualization of SDG frequencies, the \code{plot\_sdg} function provides several additional arguments. For instance, by setting \code{sdg\_titles} to \texttt{TRUE}, the SDG titles will be added to the annotation of the plot. Other arguments are \code{normalize} to show probabilities instead of frequencies, \code{color} to change the filling of bars, and \code{remove\_duplicates} to eliminate duplicate document-system-SDG combinations. Furthermore, as \code{plot\_sdg} is built on \CRANpkg{ggplot2} \citep{ggplot2}, the function can easily be extended by functions from the \CRANpkg{ggplot2} universe. To illustrate these points, the code below generates a plot (Figure ~\ref{figure:default_plot_facetted}) that includes SDG titles and separates the results of the different SDG systems using facets. + +\begin{example} +> plot_sdg(sdgs, ++ sdg_titles = TRUE) + ++ ggplot2::facet_wrap(~system, ncol= 1, scales = "free_y") +139048 duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates. +\end{example} + + +\begin{figure}[htbp] + \centering + \includegraphics[width=1\linewidth]{default_plot_sdg_labels_revision.pdf} + \caption{Distribution of detected SDGs facetted by system.} + \label{figure:default_plot_facetted} +\end{figure} + +The separation of systems better illustrates the results of systems that produce fewer hits and helps compare the results across systems. This reveals, for instance, that in the Elsevier system SDG 3 (“Good Health and Well-Being”) was most prominent, whereas in the Aurora system this was SDG 13 ("Climate Action”). These results highlight that the different labeling systems do not necessarily agree concerning the assignment of SDGs to documents. 
+ +To quantify the commonalities and differences between labeling systems, \CRANpkg{text2sdg} provides the \code{crosstab\_sdg} function. The function evaluates the level of alignment across either systems (the default) or SDGs by calculating $\phi$ coefficients between the vectors of labels. We supply the \code{hits} argument of the function with the \texttt{sdgs} \code{tibble} containing the labels produced by \code{detect\_sdg}. Note that the function only considers distinct combinations of documents, systems and SDGs, irrespective of whether the \code{detect\_sdg} function was run using \code{output = “documents”} or \code{output = "features”}. + +\begin{example} + +> crosstab_sdg(sdgs) + Auckland Aurora Elsevier SDGO SDSN SIRIS +Auckland 1.0000000 0.3345247 0.6676524 0.3314806 0.2896650 0.4115387 +Aurora 0.3345247 1.0000000 0.3256877 0.1614586 0.1569791 0.3703457 +Elsevier 0.6676524 0.3256877 1.0000000 0.2642918 0.2192051 0.3538272 +SDGO 0.3314806 0.1614586 0.2642918 1.0000000 0.3722997 0.2244774 +SDSN 0.2896650 0.1569791 0.2192051 0.3722997 1.0000000 0.2330684 +SIRIS 0.4115387 0.3703457 0.3538272 0.2244774 0.2330684 1.0000000 + +\end{example} + +The output of \code{crosstab\_sdg()} for the SNSF projects reveals two noteworthy insights. First, the correspondence between the labels of different systems is rather small, as indicated by $\phi$ coefficients that are mostly smaller than 0.4. Second, there are two groups of systems that are more similar to one another. On the one hand, Elsevier, Auckland, Aurora, and SIRIS, and, on the other hand, SDGO and SDSN. These groups correspond to differences in query operators, with the former four including AND operators in their queries, whereas the latter two do not. \code{crosstab\_sdg()} can also be called with the output from the ensemble models. 
+ +\begin{example} +> crosstab_sdg(sdgs_ensemble) + Ensemble equal Ensemble triple +Ensemble equal 1.0000000 0.8127837 +Ensemble triple 0.8127837 1.0000000 +\end{example} + + +It can further be informative to analyze the correlations between SDGs. To do this, we set the \code{compare} argument in \code{crosstab\_sdg()} to \texttt{"sdgs"}. The output below shows the result for the first six SDGs by setting \code{sdgs = 1:6}. It can be seen that certain pairs of SDGs---in particular, SDG-01 and SDG-02---co-occur more frequently. These results may provide insights into the co-occurrence structure of SDGs in the data at hand. However, these results can also highlight the importance of considering similarities between queries targeting different SDGs. + +\begin{example} + +> crosstab_sdg(sdgs, compare = "sdgs", sdgs = 1:6) + SDG-01 SDG-02 SDG-03 SDG-04 SDG-05 SDG-06 +SDG-01 1.00000000 0.47455139 0.04811778 0.07928418 0.14252372 0.16622948 +SDG-02 0.47455139 1.00000000 0.10611662 0.06751253 0.09338952 0.17504027 +SDG-03 0.04811778 0.10611662 1.00000000 0.18092227 0.10936179 0.04882173 +SDG-04 0.07928418 0.06751253 0.18092227 1.00000000 0.11791600 0.07887042 +SDG-05 0.14252372 0.09338952 0.10936179 0.11791600 1.00000000 0.04603253 +SDG-06 0.16622948 0.17504027 0.04882173 0.07887042 0.04603253 1.00000000 + + +\end{example} + +\section{Practical considerations} + +\subsection{Specifying user-defined labeling systems} + + The query systems implemented in \CRANpkg{text2sdg} represent important efforts to systematize the monitoring of SDGs from text. Nevertheless, these efforts are still relatively young and validations of the systems are largely missing, creating a need for continued development. \CRANpkg{text2sdg} supports the further development of new SDG labeling systems by providing the \code{detect\_any} function. In this section, we provide additional detail on using this feature of \CRANpkg{text2sdg}. 
+ + The \code{detect\_any} function also uses \CRANpkg{corpustools} as the back-end. This implies that new queries must be specified to match the syntax of \CRANpkg{corpustools}. The syntax supports standard Boolean operators (AND, OR, and NOT), wildcard operators, and proximity search. Boolean operators control how different keywords are combined in a query. For instance, the query "marine OR fisheries" matches text that contains either of these two words whereas the query "marine AND fisheries" only matches text that contains both words. \CRANpkg{corpustools} also allows users to specify common query wildcard operators\footnote{Note that the meaning of these wildcards differs from regex wildcards.}. The wildcard operators $?$ and $*$ allow the specification of variable word parts. For instance, the question mark operator $?$ matches one unknown character or no character at all, e.g., "?ish" would match "fish", "dish", or "ish". The asterisk operator $*$, by contrast, matches any number of unknown characters, e.g., "*ish" would match "fish" but also "Swedish". Both wildcards can be used at the start, within, or at the end of a term. Proximity search extends a Boolean AND, by requiring that two keywords occur within a defined distance of one another. For instance, "climate change"$\sim$3 specifies matches in which "climate" and "change" both occur no more than three words apart. A complete description of the \CRANpkg{corpustools} syntax is presented in the \CRANpkg{corpustools} vignette and documentation. + + To supply a user-defined labeling system to \code{detect\_any}, the queries must be placed in a \code{data.frame} or \code{tibble} that additionally includes a column specifying the labeling system's name and a column of SDG labels corresponding to the queries. + +\begin{itemize} + \item system (character) - name of the labeling system. + \item queries (character) - user-defined queries. + \item sdg (integer) - SDG labels assigned by queries. 
+\end{itemize} + + The example below illustrates the application of a user-defined labeling system using \code{detect\_any}. First, a \code{tibble} is defined that includes three rows, one for each of three different queries stored in the \code{query} column. The system is called \texttt{"my\_example\_system"} in the \texttt{system} column and each of the queries is assigned SDG-14 in the \texttt{sdg} column. Note that specification of the labeling system need not be made in R, but can easily be outsourced to a spreadsheet that is then processed into a \code{tibble}. Second, the system is supplied to the \code{system} argument of the \code{detect\_any} function, along with the texts (here, the SNSF abstracts). The output is analogous to the output of the \code{detect\_sdg\_systems} function (for brevity, we only show the first three lines of the output). + + +\begin{example} +> # definition of query set +> my_example_system <- tibble::tibble(system = "my_example_system", ++ query = c("marine AND fisheries", ++ "('marine fisheries') AND sea", ++ "?ish"), ++ sdg = c(14,14,14)) +> detect_any(text = projects, ++ system = my_example_system) +# A tibble: 591 × 6 + document sdg system query_id features hit + + 1 6 SDG-14 my_example_system 3 wish 122 + 2 134 SDG-14 my_example_system 3 wish 18 + 3 241 SDG-14 my_example_system 3 fish 59 + +\end{example} + + +\subsection{Applying text2sdg to non-English data} +The queries of the labeling systems implemented by \CRANpkg{text2sdg} are in English, implying that texts in other languages must first be translated to English. We assessed feasibility and whether translation affects the reliability of SDG labels by making use of back translation with one language we are most familiar with (German). To this end, we first translated 1,500 randomly selected SNSF project abstracts from English to German and from German to English and then compared the labels of the original English and back-translated English abstracts. 
We carried out the translation using the DeepL translation engine (\href{https://www.deepl.com/translator}{www.deepl.com/translator}). + +Table \ref{tab:my-table_corr} shows the results of this analysis. Overall, the correlations as measured by the $\phi$-coefficient are very high. The systems showed correlations above or equal to $0.88$, with Elsevier and Auckland showing the highest value of $0.93$. Considering that our analysis involves not only one, but two translation steps---from German to English and back---these results suggest that \CRANpkg{text2sdg} can be applied to non-English text, such as German, with very high accuracy. One should note, however, that the quality of translation may vary across languages and translation engines, so additional work is needed to compare performance across different languages. + +% Please add the following required packages to your document preamble: +% \usepackage{booktabs} +% \usepackage{graphicx} +\begin{table}[h] +\centering +\begin{tabular}{@{}llllll@{}} +\toprule +Aurora & Elsevier & Auckland & SIRIS & SDSN & SDGO \\ \midrule +0.91 & 0.93 & 0.93 & 0.88 & 0.91 & 0.91 \\ +\bottomrule +\end{tabular}% +\caption{$\phi$-coefficient between the labels for the original English text and the labels for the back-translated (English-German-English) English text} +\label{tab:my-table_corr} +\end{table} + + + \subsection{Estimating the runtime of text2sdg} + + The analysis of text data can be computationally intense. To provide some guidance on the expected runtime of \CRANpkg{text2sdg} for data with different numbers of documents and different document lengths, we carried out several experiments. For this purpose, we first simulated documents by concatenating 10, 100, 1,000, or 10,000 words drawn randomly according to word frequencies in Wikipedia and combined 1, 10, 100, or 1,000 thus-generated documents into simulated data sets. Then we evaluated the runtime of \CRANpkg{text2sdg} separately by system for the simulated data sets. 
+ + Figure~\ref{figure:benchmark_plot} shows the average runtime in seconds across 7,000 repetitions of each combination of document length and number of documents for each of the labeling systems. The results highlight several noteworthy points. First, runtime is primarily a function of the number of words, irrespective of how words are distributed across documents. Second, the runtime per word decreases as the number of words increases, which is due to a constant overhead associated with optimizing the labeling systems' queries. Third, there are considerable differences in the runtime between systems, which is, in part, due to the functions' overhead and, in part, due to differences in number and complexity of queries. The fastest system is Elsevier, processing 10 million words in roughly one minute; the slowest system is SIRIS, processing 10 million words in about 40 minutes. + Overall, these experiments highlight that \CRANpkg{text2sdg} can efficiently process large amounts of text, but also that some care should be exercised when dealing with extremely large or many texts. In such cases, it may be advisable to rely on more efficient labeling systems, such as Elsevier or SDSN. + +\begin{figure}[htbp] + \centering + \includegraphics[width=1\linewidth]{benchmark_revision_final.pdf} + \caption{Median runtime as a function of number of documents and document length using 6 different query systems. Each cell reflects the average runtime of 7,000 runs with numbers reflecting the median runtime in seconds and color reflecting the logarithm of the median runtime in seconds.} + \label{figure:benchmark_plot} +\end{figure} + +\section{Other approaches to detecting SDGs in text} +There are a number of other approaches to detecting SDGs in text. First, there are approaches outside the R ecosystem. 
One such tool is the European Union's SDG Mapper (\url{https://knowsdgs.jrc.ec.europa.eu/sdgmapper}) that produces an analysis of SDGs per document using an online interface in which registered users can upload single documents. Another prominent example is the OSDG tool developed by the SDG AI Lab of the United Nations in collaboration with private partners. It can detect SDGs in text that is provided through the OSDG website (\url{https://osdg.ai/}) or, if granted access, through an API. The OSDG tool builds on the SDG Ontology (SDGO) that is also implemented in \CRANpkg{text2sdg}. OSDG additionally leverages a machine learning tool that was trained on expert-labeled data to make the final predictions \citep{OSDG2}. One advantage of OSDG relative to \CRANpkg{text2sdg} is that it allows users to detect SDGs in 15 different languages. This is done by using translation of the input text into English before passing it through the OSDG workflow. While this is convenient to the user, the same outcome can be achieved with our package by making use of translation models through, for example, the \CRANpkg{deeplr} R package. As our proof-of-concept above has shown, \CRANpkg{text2sdg} can be used with non-English text (e.g., German) with very high accuracy by using such an approach. + +Second, there are currently, to our knowledge, two other R packages aimed at providing methods for the automated detection of SDGs in text. The \CRANpkg{SDGdetector} package is based on a custom query system that was generated by pooling several existing query systems and manual adaptations. The resulting labeling system permits finer-grained predictions on the level of SDG targets\footnote{Each SDG has several targets that are operationalized with indicators (SDG/targets/indicators). For example, the first target of SDG 1 reads as follows: "By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than \$1.25 a day".}. 
However, the method is computationally taxing and limited to texts that are shorter than 750 characters or approximately 150 words. The \pkg{SDGmapR} package builds on publicly available SDG keywords that are assigned weights that indicate the degree to which a keyword reflects a given SDG. The package computes SDG weights for each text by adding up the weights of the keywords that were found in the text. The larger this weight, the larger should be the likelihood that the text is related to a specified SDG. The advantage of this approach is that it permits customization of the decision boundary (i.e., the weight needed to count a text as SDG related). However, the package does not give the user a binary decision regarding whether a text relates to a given SDG. Neither of the two packages offers an ensemble model that can be used to categorize the presence of SDGs as is the case with \CRANpkg{text2sdg}. + +\section{Discussion} + The \CRANpkg{text2sdg} package offers an open and easily accessible way of detecting SDGs in text using individual query systems, a state-of-the-art ensemble model that combines queries from extant systems \citep{wulff2023using}, and custom-made queries. + + While our package implements several query-based methods to detect SDGs in text as well as a state-of-the-art ensemble model, the field of detecting SDGs in text is rapidly evolving. Our aim is to continuously update \CRANpkg{text2sdg} as new open source methods of detecting SDGs in text are released. Bundling many systems in a coherent API is not only convenient for users, but also helps catalyze development of new and hopefully more accurate methods by making it easy to compare the performance of the different systems. We deliberately incorporated functions that allow users to implement and test their own query systems to facilitate this process. 
We also encourage others to contribute to \CRANpkg{text2sdg} by adding new systems or by expanding the existing functionalities to analyse the output of the systems. + +Indeed, although the systems implemented by \CRANpkg{text2sdg} have been shown to achieve high accuracy \citep{wulff2023using}, it is important to stress that these systems must be further developed to increase their accuracy for a greater number of document types. Two approaches can help in achieving this. First, unsupervised methods such as topic models \citep{grun2011topicmodels} or semantic network analysis \citep{siew2019cognitive} can help in identifying novel linguistic patterns for the detection of SDGs. One should note, however, that unsupervised methods are no replacement for top-down, rule-based methods as implemented by \CRANpkg{text2sdg}, because of the strong requirement to compare results across data sets, analyses, and time, which require a clear set of benchmarks that are not simply data-driven. Second, recent transformer based models \citep{reimers2019sentence} could be leveraged to learn more complex relationships between specific linguistic patterns and SDGs. However, the field will have to work towards producing more balanced training data before the full potential of these approaches can be exploited. Moreover, one should note that transformer models are computationally expensive and often limited to short text due to architecture constraints \citep{ding2020cogltx}. Whether such developments will emerge and can be ultimately integrated into \CRANpkg{text2sdg} or will represent alternative approaches remains an open question. + + \section{Conclusion} +In this article, we introduced a new R package, \CRANpkg{text2sdg}, designed to help identify SDGs from text. The package promises to help detect SDGs in text sources using different existing or custom-made labeling systems as well as a high-performance ensemble model that builds on these labeling systems. 
Our case study and additional analyses suggest that the approach can handle both sources in English as well as translations, allows user-friendly use of novel queries, and provides reasonably efficient performance for analysing large corpora. + + +\bibliography{text2sdg} + +\address{Dominik S. Meier\\ + University of Basel\\ + Steinengraben 22 4051 Basel\\ + Switzerland\\ + (ORCID: 0000-0002-3999-1388)\\ + \email{dominik.meier@unibas.ch}} + +\address{Rui Mata\\ + University of Basel\\ + Missionsstrasse 60-62 4055 Basel\\ + Switzerland\\ + (ORCID: 0000-0002-1679-906X)\\ + \email{rui.mata@unibas.ch}} + +\address{Dirk U. Wulff\\ + University of Basel\\ + Missionsstrasse 60-62 4055 Basel\\ + Switzerland\\ + (ORCID: 0000-0002-4008-8022)\\ + \email{dirk.wulff@unibas.ch}} + diff --git a/_articles/RJ-2024-006/RJ-2024-006.R b/_articles/RJ-2024-006/RJ-2024-006.R new file mode 100644 index 0000000000..f6ac90b561 --- /dev/null +++ b/_articles/RJ-2024-006/RJ-2024-006.R @@ -0,0 +1,810 @@ +# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand +# Please edit RJ-2024-006.Rmd to modify this file + +## ----dim-pow-html, eval = knitr::is_html_output(), echo=FALSE----------------- +# +# data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', +# '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', +# '', '$H_0: \\lambda_{12} = 1$', +# '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', +# 'Dimension', '$H_0: \\lambda_{11} = 1$', +# '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', +# '', '$H_0: \\lambda_{12} = 0$', +# '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +# colnames(data) = c('', 'Hypothesis', 'Test') +# +# knitr::kable(data, format = "html", caption = "Power and dimension of test assessment") + + +## ----dim-pow-tex, eval = knitr::is_latex_output(), 
echo=FALSE----------------- + +data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', + '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 1$', + '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', + 'Dimension', '$H_0: \\lambda_{11} = 1$', + '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 0$', + '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +colnames(data) = c('', 'Hypothesis', 'Test') + +knitr::kable(data, format = "latex", caption = "Power and dimension of test assessment", escape = FALSE) + + +## ----figure-2states, echo=FALSE, fig.cap="Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. 
The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +library(ggplot2) +df <- structure( + list( + States = c(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3), + Parameter = c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 100, + 500, + 1000 + ), + Power = c( + 0.082, + 0.252, + 0.466, + 0.994, + 0.082, + 0.252, + 0.466, + 0.994, + 0.905, + 1, + 1, + 0.905, + 1, + 1 + ), + Dimension = c( + 0.057, + 0.076, + 0.058, + 0.06, + 0.057, + 0.076, + 0.058, + 0.06, + 0.097, + 0.002, + 0.003, + 0.097, + 0.002, + 0.003 + ) + ), + row.names = c(NA, 14L), + class = "data.frame" +) + +df$Sample = as.factor(df$Sample) +df$Parameter = as.factor(df$Parameter) + +df1 <- df[df$States == 2, ] +p1 <- + ggplot(df1, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.08)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p2 <- + ggplot(df1, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 
2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p1, p2, ncol=2) + + + +## ----figure-3states, echo=FALSE, fig.cap="Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +df2 <- df[df$States == 3, ] +p3 <- + ggplot(df2, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p4 <- 
+ ggplot(df2, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p3, p4, ncol=2) + + +## ----figure-persistent-1, echo=FALSE, fig.cap="Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. 
The proposed model has low power of test when low parameter values are associated with persistent states.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +df3 <- + structure( + list( + Case = c( + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2 + ), + Parameter = c( + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8, + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8 + ), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000 + ), + Power = c( + 0.073, + 0.065, + 0.046, + 0.019, + 0.092, + 0.096, + 0.092, + 0.097, + 0.126, + 0.282, + 0.261, + 0.276, + 0.139, + 0.435, + 0.695, + 0.999, + 0.057, + 0.076, + 0.15, + 0.256, + 0.085, + 0.14, + 0.209, + 0.715, + 0.071, + 0.087, + 0.142, + 0.315, + 0.053, + 0.103, + 0.362, + 0.599 + ), + Dimension = c( + 0.018, + 0.014, + 0.009, + 0, + 0.005, + 0.004, + 0.002, + 0.002, + 0.005, + 0.004, + 0.002, + 0.002, + 0.018, + 0.014, + 0.009, + 0, + 0.018, + 0.025, + 0.038, + 0.064, + 0.002, + 0.003, + 0.07, + 0.103, + 0.002, + 0.003, + 0.07, + 0.103, + 0.018, + 0.025, + 0.038, + 0.064 + ) + ), + row.names = c(NA, + 32L), + class = "data.frame" + ) +df3$Sample = as.factor(df3$Sample) +df3$Parameter = as.factor(df3$Parameter) + +df4 <- df3[df3$Case == 1, ] + +p5 <- + ggplot(df4, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.07)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + 
scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p6 <- + ggplot(df4, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.02)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p6, p5, ncol=2) + + + +## ----figure-persistent-2, echo=FALSE, fig.cap="Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. 
The results point towards a low test power in this setting.", fig.height=3, fig.width=6, warning=FALSE, fig.align='center'---- + df5 <- df3[df3$Case == 2, ] +p7 <- + ggplot(df5, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.8)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + + +p8 <- + ggplot(df5, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 8, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p8, p7, ncol=2) + + + +## ----multi.mtd, 
eval=FALSE---------------------------------------------------- +# multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1) + + +## ----multi.mtd_probit, eval=FALSE--------------------------------------------- +# multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs') + + +## ----mmcx-examp, eval=FALSE--------------------------------------------------- +# mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1)) + + +## ----mmc_tpm, eval=FALSE------------------------------------------------------ +# MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res) + + +## ----summary-stat-html, eval = knitr::is_html_output(), echo=FALSE------------ +# library(GenMarkov) +# data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), +# c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), +# c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) +# +# colnames(data) = c('Variable', 'Minimum', +# '1$^{st}$ Quantile', 'Median', +# 'Mean', '3$^{rd}$ Quantile', 'Maximum') +# knitr::kable(data, format = "html", caption = "Summary statistics of $stockreturns$ dataset") + + +## ----summary-stat-tex, eval = knitr::is_latex_output(), echo=FALSE------------ +library(GenMarkov) +data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), + c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), + c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) + +colnames(data) = c('Variable', 'Minimum', + '1$^{st}$ Quantile', 'Median', + 'Mean', '3$^{rd}$ Quantile', 'Maximum') + +knitr::kable(data, format = "latex", caption = "Summary statistics of $stockreturns$ dataset", escape=FALSE) + + +## ----generate-plots, echo=FALSE, warning=FALSE, message=FALSE----------------- +library(GenMarkov) +library(ggplot2) +library(gridExtra) +#Define data and variables +s = cbind(stockreturns$sp500, stockreturns$djia) +m1 = max(s) +x = stockreturns$spread_1 + 
+########################################################### +### Code retrieved from ProbValuesXDependent() function ### +########################################################### + +# Create matrix with dummies for each state +dummies_list <- + apply(s, 2, function(x) { + fastDummies::dummy_cols(x, remove_selected_columns = TRUE) + }) +dummies <- matrix(unlist(dummies_list), + ncol = m1 * ncol(s), + nrow = nrow(s) +) + +# Create all possible combinations of column indices +combinations <- expand.grid(1:ncol(s), 1:ncol(dummies)) +# Order by the first variable +combinations <- combinations[order(combinations$Var1), ] + +# Extract columns from S and S_L based on the combinations +combined_list <- lapply(1:nrow(combinations), function(i, x) { + cbind(s[, combinations[i, 1]], x, dummies[, combinations[i, 2]]) +}, x = x) + +estimate_condprobs <- sapply(combined_list, function(data) { + # Define dependent variable + y <- factor(data[, 1], levels = 1:max(data[, 1])) + + # Define lagged St + s_l <- Hmisc::Lag(data[, 3]) + + # Estimate multinomial logistic regression + res <- suppressWarnings(nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE)) + + warn <- tryCatch( + { + nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE) + + if (length(warnings()) == 0) { + NULL # Return NULL if no warning occurs + } + + }, + warning = function(w) { + # Extracting the warning message without printing + warning_message <- conditionMessage(w) + return(warning_message) + } + ) + + + if(is.null(warn)){ + # Extract fitted values + px1 <- res$fitted.values + + }else if(length(warn) == 1){ + + if( (grepl("\\bgroup\\b.*\\bempty\\b", warn, ignore.case = TRUE) || grepl("\\bgroups\\b.*\\bempty\\b", warn, ignore.case = TRUE)) ){ + extracted_number <- as.numeric(regmatches(warn, gregexpr("\\d+", warn))[[1]]) + + # Extract fitted values + px1 <- res$fitted.values + + ##Add missing groups + px1 <- cbind(px1, matrix(rep(0, nrow(px1)*length(extracted_number)), + 
ncol=length(extracted_number), + nrow = nrow(px1), + dimnames = list(NULL, extracted_number))) + + #Re-order columns + px1 <- px1[, match(1:m1, colnames(px1))] + }else{ + warning(warn) + } + + }else{ + warning(warn) + } + + state = data[data[,3]==1,1][1] + colnames(px1) = rep(paste('From state ', state), 3) + + return(as.matrix(px1)) +}, simplify = "array") + +##### +#Subset each conditional probabilities + +##S1t, S1t-1 +estim_prob_11 = estimate_condprobs[1:3] + +##S1t, S2t-1 +estim_prob_12 = estimate_condprobs[4:6] + +##S2t, S1t-1 +estim_prob_21 = estimate_condprobs[7:9] + +##S2t, S2t-1 +estim_prob_22 = estimate_condprobs[10:12] + + +#Function to create plots +plots_estimprobs = function(df){ + plots_list <- list() + j = colnames(df)[1] + for(i in 1:3){ + ma = pracma::movavg(df[,i], n = 5, type = "s") + df_ma = data.frame(ma = ma, Time = seq(1, nrow(df))) + + plot = ggplot(df_ma, aes(x = Time, + y = ma)) + + geom_line(color = 'black') + + ylab(label = paste(j, 'to state ', i)) + + theme_minimal() + + theme(axis.title = element_text(size = 8)) + + plots_list[[i]] <- plot + } + + plots_res = arrangeGrob(grobs = plots_list, ncol = 3) + + return(plots_res) +} + +#Save plots list +plots11 = lapply(estim_prob_11, function(x) plots_estimprobs(x)) + +plots12 = lapply(estim_prob_12, function(x) plots_estimprobs(x)) + +plots21 = lapply(estim_prob_21, function(x) plots_estimprobs(x)) + +plots22 = lapply(estim_prob_22, function(x) plots_estimprobs(x)) + + + +## ----fig11, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", message=FALSE, warning=FALSE, out.width='70%'---- +grid.arrange(grobs = plots11, nrow=3) + + +## ----fig12, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", , out.width='70%'---- +grid.arrange(grobs = plots12, nrow=3) + + +## ----fig21, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", , out.width='70%'---- +grid.arrange(grobs = plots21, nrow=3) + + +## ----fig22, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", , out.width='70%'---- +grid.arrange(grobs = plots22, nrow=3) + + +## ----mmcx--------------------------------------------------------------------- +attach(stockreturns) +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1)) + + +## ----tpm---------------------------------------------------------------------- +tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, + value = max(spread_1), result = res) + +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, + value = min(spread_1), result = res) + +## ----tpm-figs, eval=FALSE----------------------------------------------------- +# library(markovchain) +# plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9 +# plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10 +# plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11 +# plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12 + + +## ----fig-sp500-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3.", out.width='60%', warning=FALSE, message=FALSE, echo=FALSE---- +library(markovchain) +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) + + +## ----fig-sp500-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. 
The highest probability of 0.56 refers to the transition from state 2 to state 2.", out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) + + +## ----fig-djia-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. The probability of 0.58 refers to the transition from state 2 to state 3.", out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) + + +## ----fig-djia-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. The highest probability of 0.51 refers to the transition from state 2 to state 2.",out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) + diff --git a/_articles/RJ-2024-006/RJ-2024-006.Rmd b/_articles/RJ-2024-006/RJ-2024-006.Rmd new file mode 100644 index 0000000000..ccb58c4b37 --- /dev/null +++ b/_articles/RJ-2024-006/RJ-2024-006.Rmd @@ -0,0 +1,1107 @@ +--- +title: 'GenMarkov: Modeling Generalized Multivariate Markov Chains in R' +date: '2025-01-10' +abstract: | + This article proposes a new generalization of the Multivariate Markov Chains (MMC) model. The future values of a Markov chain commonly depend on only the past values of the chain in an autoregressive fashion. The generalization proposed in this work also considers exogenous variables that can be deterministic or stochastic. Furthermore, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. The Monte Carlo simulation study findings showed that our model consistently detected a non-homogeneous Markov chain. 
Besides, an empirical illustration demonstrated the relevance of this new model by estimating probability transition matrices over the space state of the exogenous variable. An additional and practical contribution of this work is the development of a novel R package with this generalization. +author: +- name: Carolina Vasconcelos + affiliation: NOVA Information Management School (NOVA IMS) + address: Campus de Campolide, 1070-312 Lisboa, Portugal + email: cvasconcelos@novaims.unl.pt +- name: Bruno Damásio + affiliation: NOVA Information Management School (NOVA IMS) + address: Campus de Campolide, 1070-312 Lisboa, Portugal + email: bdamasio@novaims.unl.pt +type: package +output: + rjtools::rjournal_article: + self_contained: yes + toc: no +bibliography: genmarkov.bib +date_received: '2022-09-13' +volume: 16 +issue: 1 +slug: RJ-2024-006 +draft: no +journal: + lastpage: 113 + firstpage: 96 + +--- + + +# Introduction + +Multivariate Markov chains (MMC) have a wide range of applications, in various fields. Hence, several studies and generalizations of the MMC models have been made. However, the availability of packages that allow the estimation and application of these models are scarce, and most of these methods use algorithms and software that are not broadly available or can only be applied in particular situations. In the last few years, R software has been gaining importance in the field of statistical computing. This phenomenon might be because it is free and open-source software, which compiles and runs on a wide variety of operating systems. Specifically, in R software, there are some available packages related to Markov chains (MC) and MMC. For example, the \CRANpkg{march} package [@march; @Berchtold2020] allows the computation of various Markovian models for categorical data, including homogeneous Markov chains of any order, MTD models, Hidden Markov models, and Double Chain Markov Models. 
Ogier Maitre developed this package with contributions from Andre Berchtold, Kevin Emery, Oliver Buschor, and Andre Berchtold maintains it. All the models computed by this package are for univariate categorical data. The \CRANpkg{markovchain} package [@markovchains] contains functions and methods to create and manage discrete-time Markov chains. In addition, it includes functions to perform statistical and probabilistic analysis (analysis of their structural proprieties). Finally, the \CRANpkg{DTMCPack} package [@DTMCPack] contains a series of functions that aid in both simulating and determining the properties of finite, discrete-time, discrete-state Markov chains. There are two main functions: `DTMC` and `MultDTMC`, which produce $n$ iterations of a Markov Chain(s) based on transition probabilities and an initial distribution given by the user, for the univariate and multivariate case, respectively. This last package is the only one available in R for MMC. In general, the work on MMC models is mostly based on improving the estimation methods and/or making the model more parsimonious. In this work, we aim to develop a new generalization that considers exogenous variables. Specifically, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. Additionally, we address statistical inference and implement these methods in an R package. The R package includes three functions: `multimtd`, `multimtd_probit` and `mmcx`. The first two functions estimate the MTD model for multivariate categorical data, with Chings's specification [@Ching2002] and with the Probit specification [@Nicolau2014], respectively. The last function allows the estimation of our proposed model, the Generalized Multivariate Markov Chain (GMMC) model. The R package, \CRANpkg{GenMarkov}, with these three functions is available in the Comprehensive R Archive Network (CRAN) at . 
+ +# Multivariate Markov chains + +Markov chains can be appropriate for representing dependencies between successive observations of a random variable. However, when the order of the chain or the number of possible values increases, Markov chains lack parsimony. In this context, @JacobLewis1978, @Pegram1980 and @Logan1981 proposed several models for HOMC. Notwithstanding these developments, the Mixture Transition Distribution model [@Raftery1985] proved to be more suitable to model HOMC, which overshadowed the previously proposed models. Several relevant extensions of the MTD model emerged: the Multimatrix MTD [@Berchtold1995; @Berchtold1996], which allowed modeling the MTD by using a different $m \times m$ transition matrix for each lag, the Infinite-Lag MTD model that assumes an infinite lag order ($l = \infty$), which was first considered by @Mehran1989 and later developed by @Le1996 in a more general context. Finally, the MTD with General State Spaces allowed modeling more general processes with an arbitrary space state [@Martin1987; @Adke1988; @Wong2001]. Although the MTD model presents a more parsimonious approach to model Markov chains with order higher than one, it has weaknesses. Namely, when considering more than one data sequence, one represents the MMC as a HOMC, by expanding the state-space. This approach could result in a more complex probability transition matrix. Consequently, this can make the estimation unfeasible as the order, states, and the number of data sequences increase. Additionally, the model assumes the same transition matrix for each lag. In this setting, @Ching2002 determined an alternative to handle the unfeasibility of the conventional multivariate Markov chain (MMC) by proposing a model with fewer parameters. The model developed is essentially the same as the MTD. However, it considers a different $m \times m$ transition matrix for each lag and considers more than one data sequence. 
In the proposed multivariate Markov chain model, @Ching2002 assume the following relationship: + +Let $x_t^{(j)}$ be the state vector of the $j$th sequence at time $t$. If the $j$th sequence is in state $l$ at time $t$ then + +\begin{equation} +x_{t+1}^{(j)} = \sum_{k=1}^s \lambda_{jk}P^{(jk)}x_{t}^{(k)}, \text{for } j =1, 2, \dots, s +(\#eq:eq1) +\end{equation} +where $0 \leq \lambda_{jk} \leq 1$ for $j \leq s, k \leq s$ and $\sum_{k=1}^s \lambda_{jk} =1$ for $j=1, 2, \dots, s$. The $\lambda_{jk}$ can be interpreted as the mixing probability of the $j$th state to the $k$th state. + +The state probability distribution of the $k$th sequence at time $(t + 1)$ depends on the weighted average of $P^{(jk)}x_{t}^{(k)}$ . Here $P^{(jk)}$ is a transition probability matrix from the states in the $k$th sequence to the states in the $j$th sequence and $x_t^{(k)}$ is the state probability distribution of the $k$th sequences at time $t$. In matrix form: + +\begin{equation} +\underline{x}_{t+1}^{(j)} \equiv +\left[ +\begin{array}{c} + x_{t+1}^{(1)} \\ + \vdots \\ + x_{t+1}^{(s)} +\end{array} \right ] += +\left[ +\begin{array}{ccc} +\lambda_{11}P^{(11)} & \dots & \lambda_{1s}P^{(1s)}\\ +\vdots & \ddots & \vdots\\ +\lambda_{s1}P^{(s1)}& \dots & \lambda_{ss}P^{(ss)} +\end{array} \right ] +\left[ +\begin{array}{c} + x_{t}^{(1)} \\ + \vdots \\ + x_{t}^{(s)} +\end{array} \right ] +\equiv +Q \underline{x}_{t} +(\#eq:eq2) +\end{equation} where $Q$ is an $ms \times ms$ block matrix ($s \times s$ blocks of $m \times m$ matrices) and $x_t$ is a stacked $ms$ column vector ($s$ vectors, each one with $m$ rows). + +The matrices $P^{(jk)}$ can be estimated for each data sequence by counting the transition frequency from the states in the $k$th sequence to those in the $j$th sequence, obtaining the transition frequency matrix for the data sequence. After normalization, the estimates of the transition probability matrices, i.e., $\widehat{P}^{(jk)}$, are obtained. 
Regarding the $\lambda_{jk}$ coefficients, the estimation method proposed by @Ching2002 involves the following optimization problem: + + +\begin{equation} +min_{\lambda} max_{i} \vert [ \sum_{k=1}^m \lambda_{jk} \widehat{P}^{(jk)} \widehat{\boldsymbol{x}}^{(k)} - \widehat{\boldsymbol{x}}^{(j)} ] \vert +(\#eq:eq3) +\end{equation} + +$$ \text{s.t. } \sum_{k=1}^s \lambda_{jk} \text{ and } \lambda_{jk} \geq 0 $$ Besides this, different models have been proposed for multiple categorical data sequences. @Kijima2002 proposed a parsimonious MMC model to simulate correlated credit risks. @Siu2005 proposed an easy to implement model; however, its applicability was limited by the number of parameters involved. @Ching2008 proposed a simplified model based on an assumption proposed in @Zhang2006. @Zhu2010 proposed a method of estimation based on minimizing the prediction error with equality and inequality restrictions and @Nicolau_2014 proposed a new approach to estimate MMC which avoids imposing restrictions on the parameters, based on non-linear least squares estimation, facilitating the model estimation and the statistical inference. @Berchtold2003 proposed a MTD model for heteroscedastic time series. Lastly, @Wang2014 proposed a new multivariate Markov chain model to reduce the number of parameters. Thus, generally, the models used in the published papers were developed by @Ching2002 or were a consequent generalization of them and addressed the MMC as an end in itself. In @Damasio2013 and @DAMASIO2014, a different and innovative concept was proposed: the usage of MMC as regressors in a certain model. Hence, given that the MMC Granger causes a specific dependent variable, and taking advantage of the information about the past state interactions between the MMC categories, it was possible to forecast the current dependent variable more accurately. 
Other relevant contributions are related to the optimization algorithm, as in @Lebre2008 and @ChenLio2009, and to empirical applications [@Ching2003; @Ching2006; @Damasio2018; @Damasio2019; @DamasioM2020]. Also, @Damasio2020 proposed a new methodology for detecting and testing the presence of multiple structural breaks in a Markov chain occurring at unknown dates. In the vast majority of MMC models' studies, a positive correlation between the different data sequences is assumed due to the restrictions imposed. This aspect means it is always considered that, at moment $t$, an increase in a state probability for a data sequence has an increasing impact on another data sequence at time $t+1$. Thereupon, if one has a negative correlation between series, the parameter estimates are forced to be zero. The solution to this problem is very straightforward; one can relax the assumptions and not assume the constraints. However, that means the results produced by the model will no longer be probabilities. @Tavare1994 presented an alternative, by dropping the positivity condition and imposing another set of restrictions. @Ching2008 also tackled this issue and proposed a method where one splits the $Q$ matrix into the sum of two other matrices, one representing the positive correlations and the other the negative correlations. Also, in @Nicolau2014, a specification completely free from constraints, inspired by the MTD model, was proposed, facilitating the estimation procedure and, at the same time, providing a more accurate specification for $P_j(i_0 | i_1, \dots, i_s)$.
The model was: + +\begin{equation} +P_j(i_0 | i_1, \dots, i_s) = P_j^{\Phi}(i_0 | i_1, \dots, i_s) := +\\ + \frac{\Phi(\eta_{j0} + \eta_{j1}P(i_0|i_1) + \dots + \eta_{js}P(i_0|i_s))}{\sum_{k=1}^m \Phi(\eta_{j0} + \eta_{j1}P(k|i_1) + \dots + \eta_{js}P(k|i_s))} + (\#eq:eq4) +\end{equation} where $\eta_{ji} \in \mathbb{R}$ $(j = 1, \dots, s;\, i = 0, 1, \dots, s)$ and $\Phi$ is the (cumulative) standard normal distribution function. + +This specification is denoted as an MTD-Probit model. The log-likelihood is given by: \begin{equation} +LL = \sum_{i_1, i_2, \dots, i_s, i_0} n_{i_1, i_2, \dots, i_s, i_0} \log\left(P_j^{\Phi}(i_0 | i_1, \dots, i_s)\right) (\#eq:eq5) +\end{equation} and the maximum likelihood estimator is defined, as usual, as $\widehat{\eta} = \text{arg max}_{\eta_{j0}, \dots, \eta_{js}} LL$. The parameters $P_{jk}(i_0|i_1)$, $k = 1, \dots, s$, can be estimated in advance, through the consistent and unbiased estimators proposed by @Ching2002: + +\begin{equation} +\widehat{P}_{jk}(i_0|i_1) = \frac{n_{i_1i_0}}{\sum_{i_0=1}^m n_{i_1 i_0}} (\#eq:eq6) +\end{equation} This specification can be superior to the MTD because the estimation procedure is easier, and the standard numerical optimization routines can be easily applied in the absence of constraints. However, similarly to the standard MTD, the likelihood is not a strictly concave function on the entire parameter state-space, thus the choice of starting values is still important. Additionally, the model describes a broader range of possible dependencies since the parameters are not constrained. Moreover, this proposed model is more accurate than the MTD model. For more details on this, see @Nicolau2014. + +Overall, the published work on MMC models was mostly based on improving the estimation methods and/or making the model more parsimonious. In @Damasio2013 and @DAMASIO2014, a different approach was used, and the work developed focused on the usage of MMC as regressors in a certain model.
Notably, it showed that an MMC can improve the forecast of a dependent variable. In a way, it demonstrated that an MMC can be not only an end in itself, but also an instrument to reach an end or a purpose. In this work, the opposite will be developed: instead of considering an MMC as regressors, a model in which a vector with pre-determined exogenous variables is part of $\mathcal{F}_{t-1}$ is proposed. + +# Covariates in Markov chain models + +Regarding the inclusion of covariates in Markov chain models, @Regier1968 proposed a two-state Markov chain model, where the transition matrix probabilities were a function of a parameter, $q$, that described the tendency of the subject to move from state to state. @Kalbfleisch1985 proposed a panel data analysis method under a continuous-time Markov model that could be generalized to handle covariate analysis and the fitting of certain non-homogeneous models. This work overcame the limitations of the @Bart1968, @Spilerman1976 and @Wasserman1980 methodologies, by developing a new algorithm that provided a very efficient way of obtaining maximum likelihood estimates. Also, @Muenz1985 developed a Markov model for covariate dependence of binary sequences, where the transition probabilities were estimated through two logistic regressions that depended on a set of covariates. Essentially, @Muenz1985 modeled a non-homogeneous Markov chain through logistic regression, considering only two states. @Islam2004 developed an extension of this model considering three states, and @IslamAtaharul2006 generalized this approach for HOMC. Additionally, @Azzalini1994 proposed a model to study the influence of time-dependent covariates on the marginal distribution of a binary response in serially correlated binary data, where Markov chains are expressed in terms of transitional probabilities.
@jackson2011multi proposed a Markov model for panel data, which allowed for the transition intensities to vary between individuals or with constant time-dependent covariates. Specifically, this work made it possible to account for different intensities throughout transitions of states and to include individual-specific covariates. The time-inhomogeneous model proposed is restricted to piecewise-constant intensities. The implementation of this work is available in the package \CRANpkg{msm}. More recently, @Bolano2020 proposed an MTD-based approach to handle categorical covariates, that considers each covariate separately and combines the effects of the lags of the MTD and the covariates employing a mixture model. Specifically, the model is given by: + + +\begin{equation} +P(X_t = k \mid X_{t-1} = i, C_1 = c_1, \dots, C_l = c_l) \approx \theta_0 a_{ik} + \sum_{h=1}^l \theta_h d_{c_{h}k} (\#eq:eq7) +\end{equation} + +where $a_{ik}$ is the transition probability from state $i$ to state $k$, as in a conventional Markov chain, and $d_{c_{h}k}$ is the probability of observing the state $k$ given the modality $c_h$ of the covariate $h$. Lastly, $\theta_0, \dots, \theta_l$ are the weights of the explanatory elements of the model. + +According to the literature presented, several researchers have proposed methodologies or generalizations to include covariates in Markov chain models, primarily for social sciences and health applications, where the transition probabilities were generally modeled through logistic regression. However, there has been an increased focus on categorical covariates, as opposed to continuous covariates, and a lack of approaches to multivariate Markov chain models. Thus, with this work, we aim to tackle this research gap.
+ +# Multivariate Markov chains with covariates + +## Theoretical model + +In this work, a new generalization of @Ching2002 MMC model is presented: the GMMC model, that is, we will consider exogeneous or pre-determined covariates in the $\sigma$ - algebra generated by the available information until $t-1$ ($\mathcal{F}_{t-1}$). These variables can be deterministic or stochastic and do not necessarily need to be reported at time $t$. Broadly, the model is given by: + +\begin{equation} +P(S_{jt} = k | \mathcal{ F}_{t-1} ) = P(S_{jt} = k | S_{1t-1} = i_1, S_{2t-1} = i_2, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) (\#eq:eq8) +\end{equation} We can specify this model as proposed by @Ching2002 with Raftery's notation: + +\begin{multline} +P(S_{jt} = i_0 | S_{1t-1} = i_1,\dots, S_{st-1} = i_s, \boldsymbol{x}_t) \equiv \\ +\lambda_{j1}P(S_{jt} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_t) + \dots + \lambda_{js}P(S_{jt} = i_0 | S_{st-1} = i_s, \boldsymbol{x}_t) (\#eq:eq9) +\end{multline} subject to the usual constraints. + +## Estimation and inference + +This proposed model is estimated through MLE, similar to the standard MTD model. The log-likelihood is given by: + + +\begin{equation} +LL = \sum_{t = 1}^n log P(S_{jt} = i_0 | S_{1t-1} = i_1, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) (\#eq:eq10) +\end{equation} + +Additionally, the probabilities can be estimated through an multinomial logit model. The proof for consistency and asymptotic distribution is available in the Supplementary Material section. + +## Monte Carlo simulation study + +A Monte Carlo simulation study was designed to evaluate the dimension and power of the test parameters of the proposed model. The R statistical environment was used for all computations. This simulation study was comprised of two parts. + +### Part I: Detect a non-homogeneous Markov chain + +First, we considered two sequences with two and three states. 
The main goal was to assess if the model detected the presence of a non-homogeneous Markov chain correctly and if the estimate of the parameter would correspond to the expected. So, given two sequences, one generated through a non-homogeneous Markov chain and the other generated through a homogeneous Markov chain, it would be expected that the parameter associated with the transition probabilities of the first sequence would be one and the parameter associated with the transition probabilities of the second sequence would be zero. With this in mind, the transitions probabilities of the first sequence were estimated through a logistic regression, where parameters of this regression were randomly generated in R, and the second sequence was generated through a first-order Markov chain. Hence, for both states cases considered, it was expected that the estimated regression would be: + + +\begin{multline} +P(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) = \\ +1 \times P(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}) + 0 \times P(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) (\#eq:eq11) +\end{multline} + +To assess the test power and dimension, we used the Wald test with the following hypothesis: + +```{r dim-pow-html, eval = knitr::is_html_output(), echo=FALSE} + +data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', + '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 1$', + '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', + 'Dimension', '$H_0: \\lambda_{11} = 1$', + '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 0$', + '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +colnames(data) = c('', 'Hypothesis', 'Test') + +knitr::kable(data, format = "html", caption = "Power and 
dimension of test assessment") +``` + + +```{r dim-pow-tex, eval = knitr::is_latex_output(), echo=FALSE} + +data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', + '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 1$', + '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', + 'Dimension', '$H_0: \\lambda_{11} = 1$', + '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 0$', + '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +colnames(data) = c('', 'Hypothesis', 'Test') + +knitr::kable(data, format = "latex", caption = "Power and dimension of test assessment", escape = FALSE) +``` + +The simulation procedure was performed as follows: + +1. Generate the values of the coefficients for the probability transition matrix of series $S_{1t}$ randomly; +2. Generate the probability transition matrix of series $S_{2t}$ randomly; +3. Set the initial value of $S_{2t}$ to 1 and simulate the following from the defined probability transition matrix; +4. In each iteration (of 1000 repetitions), + - Generate $X_t \sim N(2,25)$; + - Generate the time-varying probabilities of series $S_{1t}$ through the values of the fixed coefficients and the lagged variable $x_t$; + - Set the initial values of the series $S_{1t}$ as 1; + - For each period $t$, simulate the next state of $S_{1t}$ from the probabilities simulated for that moment; + - Estimate the model through the function `mmcx`; + - Calculate the Wald test and add to the counter if it is rejected. + +```{r figure-2states, echo=FALSE, fig.cap="Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. 
The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'} +library(ggplot2) +df <- structure( + list( + States = c(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3), + Parameter = c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 100, + 500, + 1000 + ), + Power = c( + 0.082, + 0.252, + 0.466, + 0.994, + 0.082, + 0.252, + 0.466, + 0.994, + 0.905, + 1, + 1, + 0.905, + 1, + 1 + ), + Dimension = c( + 0.057, + 0.076, + 0.058, + 0.06, + 0.057, + 0.076, + 0.058, + 0.06, + 0.097, + 0.002, + 0.003, + 0.097, + 0.002, + 0.003 + ) + ), + row.names = c(NA, 14L), + class = "data.frame" +) + +df$Sample = as.factor(df$Sample) +df$Parameter = as.factor(df$Parameter) + +df1 <- df[df$States == 2, ] +p1 <- + ggplot(df1, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.08)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p2 <- + ggplot(df1, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 
2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p1, p2, ncol=2) + +``` + +Considering two states, the test dimension was at 5.7% with a sample size of 100 observations, sightly increased with 500 observations, and returned to the expected values in 1000 and 5000 observations. For a sample size of 100, 500, and 1000 observations, we have low test power. So, when considering two states, the sample must have at least 5000 observations, or, if that is not possible, consider a higher significance level when testing for individual significance. + +```{r figure-3states, echo=FALSE, fig.cap="Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. 
The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'} +df2 <- df[df$States == 3, ] +p3 <- + ggplot(df2, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p4 <- + ggplot(df2, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p3, p4, ncol=2) +``` + +Considering three states, the test 
dimension was 9.7% for a sample size of 100 observations, 0.2% for a sample size of 500 observations, and 0.3% for a sample size of 1000. Regarding the test power, we see similar behavior: for a sample of 100 observations, the test power was 90.5%, and from a sample of 500 observations, we reach a test power of 100%. Thus, when considering three states, one may consider a sample of 500 observations without compromising the test power and dimension. + +\newpage + +### Part II: Detecting Assigned Parameter Values + +Secondly, we performed a simulation study where we considered two non-homogeneous Markov chains with two states. Here, the main goal was to assess if the model correctly detected the parameters assigned. So, in this case, we started by generating the terms of the model proposed. These terms were estimated through logistic regression, and the parameters of this regression were randomly generated in R. Similarly to Part I, we considered a Wald test to assess the power and dimension of the test. The simulation procedure was performed as follows: + +1. Generate the values of the coefficients to calculate the probability transition matrices randomly; +2. In each iteration (of 1000 repetitions), + - Generate $\{x_t\} \sim N(2,25)$; + - Generate the probabilities $P \left(S_{jt}|S_{st-1}, x_{t-1} \right)$, with $j=1,2$ and $s=1,2$. + - Set the initial values of the series $S_{1t}$ and $S_{2t}$ as 1; + - For each period $t$, calculate the probabilities $P \left(S_{1t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)$ and $P \left( S_{2t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)$ through the assigned values of the $\lambda$'s. Considering the calculated probabilities, simulate the next state for each series, $S_{1t}$ and $S_{2t}$. + - Estimate the model through the function `mmcx`; + - Calculate the Wald test and add to the counter if it is rejected.
+ +The probabilities $P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)$ and $P\left(S_{1t}|S_{2t-1}, x_{t-1}\right)$ presented some differences regarding its values' distributions. Specifically, $P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)$ had more extreme probabilities values, with the minimum value being close to 0 and the maximum value being close to 1. And, the probabilities $P\left(S_{1t}|S_{2t-1}, x_{t-1} \right)$ had more moderate values, with the minimum value being, on average, 0.3 and the maximum value, 0.7. When the probabilities have values close to 1, one says that the states/regimes are persistent. We calculated the power and dimension of test for each value of $\lambda$ when the estimated probabilities are moderate and when they are extreme. Hence, considering equation 1: + + +\begin{multline} +P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\dots, S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) = \\ +\lambda_{11}P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}\right) + \lambda_{12}P\left(S_{1t} = i_0 | S_{2t-1} = i_s, \boldsymbol{x}_{t-1} \right) (\#eq:eq12) +\end{multline} + +The parameter $\lambda_{11}$ will be associated with more extreme probabilities and $\lambda_{12}$ will be associated with more moderate probabilities. + +```{r figure-persistent-1, echo=FALSE, fig.cap="Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. 
The proposed model has low power of test when low parameter values are associated with persistent states.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'} +df3 <- + structure( + list( + Case = c( + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2 + ), + Parameter = c( + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8, + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8 + ), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000 + ), + Power = c( + 0.073, + 0.065, + 0.046, + 0.019, + 0.092, + 0.096, + 0.092, + 0.097, + 0.126, + 0.282, + 0.261, + 0.276, + 0.139, + 0.435, + 0.695, + 0.999, + 0.057, + 0.076, + 0.15, + 0.256, + 0.085, + 0.14, + 0.209, + 0.715, + 0.071, + 0.087, + 0.142, + 0.315, + 0.053, + 0.103, + 0.362, + 0.599 + ), + Dimension = c( + 0.018, + 0.014, + 0.009, + 0, + 0.005, + 0.004, + 0.002, + 0.002, + 0.005, + 0.004, + 0.002, + 0.002, + 0.018, + 0.014, + 0.009, + 0, + 0.018, + 0.025, + 0.038, + 0.064, + 0.002, + 0.003, + 0.07, + 0.103, + 0.002, + 0.003, + 0.07, + 0.103, + 0.018, + 0.025, + 0.038, + 0.064 + ) + ), + row.names = c(NA, + 32L), + class = "data.frame" + ) +df3$Sample = as.factor(df3$Sample) +df3$Parameter = as.factor(df3$Parameter) + +df4 <- df3[df3$Case == 1, ] + +p5 <- + ggplot(df4, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.07)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + 
scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p6 <- + ggplot(df4, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.02)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p6, p5, ncol=2) + +``` + +```{r figure-persistent-2, echo=FALSE, fig.cap="Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. 
The results point towards a low test power in this setting.", fig.height=3, fig.width=6, warning=FALSE, fig.align='center'} + df5 <- df3[df3$Case == 2, ] +p7 <- + ggplot(df5, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.8)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + + +p8 <- + ggplot(df5, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 8, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p8, p7, ncol=2) + +``` + +When the states are persistent and the parameter's value is 
low (i.e., 0.2 and 0.4), we have low test power. By increasing this value, the power of test increases as well. When the states are not persistent, we do not have a clear pattern regarding the power of test: for a value of the parameter of 0.2, the power of test is still low (although not as low as the first scenario), increases when we have a value of 0.4, decreases when the value is 0.6 and increases again when the value is 0.8. Overall, the estimated standard errors seem high, leading to low test power. Regarding the test dimension, when we have a higher weight associated with the non-persistent states, the test dimension converges to 0. However, when this weight is associated with the persistent states, the test dimension increases with the sample size, reaching a value of 10% in some cases. Hence, one must use a 10% significance level to perform statistical inference on the parameters in this situation. + +## Software implementation + +Regarding the software implementation for each function, for the `multi.mtd` function the estimation method was presented in @Berchtold2001 applied to the multivariate case. For `multi.mtd_probit`, a package for numerical maximization of the log-likelihood, \CRANpkg{maxLik} [@maxLik], was used. This package performs Maximum Likelihood estimation through different optimization methods that the user can choose. The optimization methods available are Newton-Raphson, Broyden-Fletcher-Goldfarb-Shanno (BFGS), the BFGS algorithm implemented in R (BFGSR), Berndt-Hall-Hall-Hausman, Simulated ANNealing, Conjugate Gradients, and Nelder-Mead. Finally, for the `mmcx` function, a different approach was used. Unlike the MTD-Probit, the model proposed has equality and inequality restrictions in the parameters. The \CRANpkg{maxLik} [@maxLik] package only allows one type of restriction for each Maximum Likelihood estimation, so it was not possible to use this package to estimate the proposed model with exogenous variables.
Hence, the algorithm used was the Augmented Lagrangian method, available in the \CRANpkg{alabama} [@alabama] package through the function `auglag`. This estimation method for the proposed model is not very common; however, it has been applied to Markov chain models [@Rajarshi2013]. The GMMC model's probabilities were estimated through a Multinomial Logit using `multinom` of the \CRANpkg{nnet} package [@nnet]. + +Additionally, the Hessian matrices were also computed, which made it possible to perform statistical inference. The `maxLik` and `auglag` functions compute the Hessian matrices with the estimates. For the function `multi.mtd`, since the optimization procedure of @Berchtold2001 was used, the Hessian was computed through the second partial derivatives. The function `multi.mtd` requires the following elements: + +- `y`, a matrix of the categorical data sequences. + +- `deltaStop`, the delta below which the optimization phases of the parameters stop. + +- `is_constrained`, flag indicating whether the function will consider the usual set of constraints (usual set: \textit{TRUE}, new set of constraints: \textit{FALSE}). + +- `delta`, the amount of change to increase/decrease in the parameters for each iteration of the optimization algorithm. + +The last three arguments concern the optimization procedure. For more details see @Berchtold2001. Considering two vectors of two categorical data sequences, `s1` and `s2`, to estimate the model and obtain the results: + +```{r multi.mtd, eval=FALSE} +multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1) +``` + +The function `multi.mtd_probit` requires the following arguments: + +- `y`, a matrix of the categorical data sequences. +- `initial`, a vector of the initial values of the parameters.
+- `nummethod`, the numerical maximization method, currently either "NR" (for Newton-Raphson), "BFGS" (for Broyden-Fletcher-Goldfarb-Shanno), "BFGSR" (for the BFGS algorithm implemented in R), "BHHH" (for Berndt-Hall-Hall-Hausman), "SANN" (for Simulated ANNealing), "CG" (for Conjugate Gradients), or "NM" (for Nelder-Mead). Lower-case letters (such as "nr" for Newton-Raphson) are allowed. The default method is "BFGS". For more details see the \CRANpkg{maxLik} [@maxLik] package. + +Considering two vectors of two categorical data sequences, `s1` and `s2` again, to estimate the model and obtain the results with the BFGS maximization method: + +```{r multi.mtd_probit, eval=FALSE} +multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs') +``` + +Finally, the function `mmcx` requires the following elements: + +- `y`, a matrix of categorical data sequences. +- `x`, a matrix of covariates (exogenous variables). +- `initial`, a vector of the initial values of the parameters. + +Considering two vectors of two categorical data sequences, `s1` and `s2`, and a vector of an exogenous variable, `x`, to estimate the model and obtain the results: + +```{r mmcx-examp, eval=FALSE} +mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1)) +``` + +These functions return a list with the parameter estimates, standard errors, z-statistics, p-values, and the log-likelihood function value for each equation. + +The package offers an additional function that allows the user to obtain the transition probability matrices of `mmcx` considering a specific value of `x` defined by the user. The function is `MMC_tpm` and requires the following elements: + +- `s`, a matrix of categorical data sequences. +- `x`, a matrix of covariates (exogenous variables). +- `value`, a single value of `x`, to condition the probability transition matrices. +- `result`, a list returned by the function `mmcx` containing the model's estimates.
+ +Considering two vectors of two categorical data sequences, `s1` and `s2`, a vector of an exogeneous variables, `x` and `res` the list returned by the function `mmcx`, to obtain the transition probability matrices: + +```{r mmc_tpm, eval=FALSE} +MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res) +``` + +The function returns an array containing the probability transition matrices, conditioned on a specific value of `x`, for each equation. + +# Illustration + +Markov chain models are used in interdisciplinary areas, such as economics, business, biology, and engineering, with applications to predict long-term behavior from traffic flow to stock market movements, among others. Modeling and predicting stock markets returns is particularly relevant for investors and policy makers. Since the stock market is a volatile environment, and the returns are difficult to predict, estimating the set of probabilities that describe these movements, might provide relevant input. Additionally, incorporating the effect of key macroeconomic variables could provide a more accurate picture of this specific environment. + +The following empirical illustration aims to model stock returns of two indexes as a function of the interest rate spread, specifically the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. + +The interest rate spread is a key macroeconomic variable and provides valuable information regarding the economy state. Specifically, it has been used to forecast recessions as in @Estrella1996, @Dombrosky1996, @Chauvet2016, @Tian2019 and @McMillan2021. Generically, short-term yields are lower than long-term yields when the economy is in expansion. On the other hand, short-term yields are higher than long-term yields when the economy is in recession. The difference between these yields (or, more specifically, the yield curve's slope) can be used to forecast the state of the economy. 
Hence, this indicator might provide relevant input for investors. + +We considered the 5-week-day daily stock returns ($r_t=100 \times \log(P_t/P_{t-1})$, where $P_t$ is the adjusted close price) of two indexes, S&P500 and DJIA, from November $11^{th}$ 2011 to September $1^{st}$ 2021 (2581 observations). Additionally, we considered the interest rate spread ($spread_{t}$), the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. The data was retrieved from FRED. Below, we have the descriptive statistics of these variables. + +```{r summary-stat-html, eval = knitr::is_html_output(), echo=FALSE} +library(GenMarkov) +data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), + c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), + c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) + +colnames(data) = c('Variable', 'Minimum', + '1$^{st}$ Quantile', 'Median', + 'Mean', '3$^{rd}$ Quantile', 'Maximum') +knitr::kable(data, format = "html", caption = "Summary statistics of $stockreturns$ dataset") +``` + +```{r summary-stat-tex, eval = knitr::is_latex_output(), echo=FALSE} +library(GenMarkov) +data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), + c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), + c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) + +colnames(data) = c('Variable', 'Minimum', + '1$^{st}$ Quantile', 'Median', + 'Mean', '3$^{rd}$ Quantile', 'Maximum') + +knitr::kable(data, format = "latex", caption = "Summary statistics of $stockreturns$ dataset", escape=FALSE) +``` + +Moreover, to apply the model proposed, it is necessary to have a categorical time series, thus we applied the following procedure: + +$$ +S_{st}= +\begin{cases} +1, r_t \leq \widehat{q}_{s;0.25}\\ +2, \widehat{q}_{s;0.25} < r_t < \widehat{q}_{s;0.75} \\ +3, r_t \geq \widehat{q}_{s;0.75}\\ +\end{cases} +$$ + +where $\widehat{q}_{s;\alpha}$ is the estimated quantile of order 
$\alpha$ of the marginal distribution of $r_t$. Considering this illustration and the model proposed, we will have two equations: + + +\begin{multline} +P(S_{sp500,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{11} P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{12} P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1}) (\#eq:eq13) +\end{multline} + + +\begin{multline} +P(S_{djia,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{21} P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{22} P(S_{djia,t} | S_{djia, t-1}, spread_{t-1}) (\#eq:eq14) +\end{multline} + + +In Figures \@ref(fig:fig11) to \@ref(fig:fig22), generated through \CRANpkg{ggplot2} [@ggplot2] and \CRANpkg{gridExtra} [@gridextra], we have the smoothed conditional probabilities of both series, depending on $spread_{t-1}$. The number of observations is high, and the probabilities varied abruptly in a small time frame, making the plots hard to read. To simplify, a moving average model (from \CRANpkg{pracma} [@pracma]) of order 5, due to the frequency of the data, was fitted to these probabilities to illustrate how they evolve throughout time. These plots represent the probabilities associated with the parameters of the general model proposed, showcasing how these vary throughout time and the main advantage of this generalization. Instead of having fixed matrices of transition probabilities, we allow for these to vary throughout time, depending on the values of $spread_{t-1}$. Specifically, Figures \@ref(fig:fig11) and \@ref(fig:fig12) correspond to the non-homogeneous Markov chain to build the S&P500's equation and Figures \@ref(fig:fig21) and \@ref(fig:fig22) correspond to the non-homogeneous Markov chain to build DJIA's equation. We see a similar behavior within each series regardless of whether it depends on the previous states of $S_{1t}$ or $S_{2t}$.
Additionally, the scales of the graphs are small, indicating that these probabilities vary around the same set of values. + +```{r generate-plots, echo=FALSE, warning=FALSE, message=FALSE} +library(GenMarkov) +library(ggplot2) +library(gridExtra) +#Define data and variables +s = cbind(stockreturns$sp500, stockreturns$djia) +m1 = max(s) +x = stockreturns$spread_1 + +########################################################### +### Code retrieved from ProbValuesXDependent() function ### +########################################################### + +# Create matrix with dummies for each state +dummies_list <- + apply(s, 2, function(x) { + fastDummies::dummy_cols(x, remove_selected_columns = TRUE) + }) +dummies <- matrix(unlist(dummies_list), + ncol = m1 * ncol(s), + nrow = nrow(s) +) + +# Create all possible combinations of column indices +combinations <- expand.grid(1:ncol(s), 1:ncol(dummies)) +# Order by the first variable +combinations <- combinations[order(combinations$Var1), ] + +# Extract columns from S and S_L based on the combinations +combined_list <- lapply(1:nrow(combinations), function(i, x) { + cbind(s[, combinations[i, 1]], x, dummies[, combinations[i, 2]]) +}, x = x) + +estimate_condprobs <- sapply(combined_list, function(data) { + # Define dependent variable + y <- factor(data[, 1], levels = 1:max(data[, 1])) + + # Define lagged St + s_l <- Hmisc::Lag(data[, 3]) + + # Estimate multinomial logistic regression + res <- suppressWarnings(nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE)) + + warn <- tryCatch( + { + nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE) + + if (length(warnings()) == 0) { + NULL # Return NULL if no warning occurs + } + + }, + warning = function(w) { + # Extracting the warning message without printing + warning_message <- conditionMessage(w) + return(warning_message) + } + ) + + + if(is.null(warn)){ + # Extract fitted values + px1 <- res$fitted.values + + }else if(length(warn) == 1){ + + if( 
(grepl("\\bgroup\\b.*\\bempty\\b", warn, ignore.case = TRUE) || grepl("\\bgroups\\b.*\\bempty\\b", warn, ignore.case = TRUE)) ){ + extracted_number <- as.numeric(regmatches(warn, gregexpr("\\d+", warn))[[1]]) + + # Extract fitted values + px1 <- res$fitted.values + + ##Add missing groups + px1 <- cbind(px1, matrix(rep(0, nrow(px1)*length(extracted_number)), + ncol=length(extracted_number), + nrow = nrow(px1), + dimnames = list(NULL, extracted_number))) + + #Re-order columns + px1 <- px1[, match(1:m1, colnames(px1))] + }else{ + warning(warn) + } + + }else{ + warning(warn) + } + + state = data[data[,3]==1,1][1] + colnames(px1) = rep(paste('From state ', state), 3) + + return(as.matrix(px1)) +}, simplify = "array") + +##### +#Subset each conditional probabilities + +##S1t, S1t-1 +estim_prob_11 = estimate_condprobs[1:3] + +##S1t, S2t-1 +estim_prob_12 = estimate_condprobs[4:6] + +##S2t, S1t-1 +estim_prob_21 = estimate_condprobs[7:9] + +##S2t, S2t-1 +estim_prob_22 = estimate_condprobs[10:12] + + +#Function to create plots +plots_estimprobs = function(df){ + plots_list <- list() + j = colnames(df)[1] + for(i in 1:3){ + ma = pracma::movavg(df[,i], n = 5, type = "s") + df_ma = data.frame(ma = ma, Time = seq(1, nrow(df))) + + plot = ggplot(df_ma, aes(x = Time, + y = ma)) + + geom_line(color = 'black') + + ylab(label = paste(j, 'to state ', i)) + + theme_minimal() + + theme(axis.title = element_text(size = 8)) + + plots_list[[i]] <- plot + } + + plots_res = arrangeGrob(grobs = plots_list, ncol = 3) + + return(plots_res) +} + +#Save plots list +plots11 = lapply(estim_prob_11, function(x) plots_estimprobs(x)) + +plots12 = lapply(estim_prob_12, function(x) plots_estimprobs(x)) + +plots21 = lapply(estim_prob_21, function(x) plots_estimprobs(x)) + +plots22 = lapply(estim_prob_22, function(x) plots_estimprobs(x)) + +``` + +```{r fig11, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) 
previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", message=FALSE, warning=FALSE, out.width='70%'} +grid.arrange(grobs = plots11, nrow=3) +``` + +```{r fig12, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", , out.width='70%'} +grid.arrange(grobs = plots12, nrow=3) +``` + +```{r fig21, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.", , out.width='70%'} +grid.arrange(grobs = plots21, nrow=3) +``` + +```{r fig22, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.", out.width='70%'} +grid.arrange(grobs = plots22, nrow=3) +``` + +\newpage + +The model can be estimated through the `mmcx` function: + +```{r mmcx} +attach(stockreturns) +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1)) +``` + +Considering the first equation, the effect of the probabilities depending on S&P500's previous state and the interest rate spread has a higher weight on the overall probability. Also, this estimate is highly significant, presenting a $p$-value close to zero. The effect of DJIA's previous state in S&P500 is lower but it is also significant for a 10% significance level. In the second equation, the effect of S&P500's previous state is higher than DJIA's and both estimates are highly significant. + +One of the advantages of this approach is the possibility to assess the transition probabilities for specific values of $x_t$, in this case, the interest rate spread. For both series, we calculated the transition probabilities for this variable's minimum and maximum value in the sample, which are -0.52 and 2.97, respectively. 
To obtain the probability transition matrices for these two cases, the code is the following: + +```{r tpm} +tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, + value = max(spread_1), result = res) + +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, + value = min(spread_1), result = res) +``` +```{r tpm-figs, eval=FALSE} +library(markovchain) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9 +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10 +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11 +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12 +``` + +In Figures \@ref(fig:fig-sp500-min) and \@ref(fig:fig-sp500-max), we have the transition probabilities network for S&P500, corresponding to the minimum and maximum value of the spread. The most noticeable difference between these two networks is regarding the transition probability from the second state to the third state. For the maximum value of $spread_{t-1}$, the transition probability from the second state to the third state is 0.6. So, when the economy is strong, one might expect to have higher returns, when $t-1$ was in the second state. However, this scenario shifts when considering the minimum value of $spread_{t-1}$. The probability of obtaining higher returns, that is, being in state three, becomes almost evenly distributed, regardless of the state in $t-1$. This indicates the instability of the stock market, when the economy is weaker. Another difference in these networks, is regarding the transition probability from the third state to the first state. For the maximum value of $spread_{t-1}$, this probability is 0.27 and for the minimum value increases to 0.44. This is also expected, since when the economy is weaker, the probability of having lower returns is greater. 
+ +```{r fig-sp500-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3.", out.width='60%', warning=FALSE, message=FALSE, echo=FALSE} +library(markovchain) +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) +``` + +```{r fig-sp500-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. The highest probability of 0.56 refers to the transition from state 2 to state 2.", out.width='60%', echo=FALSE} +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) +``` + +Considering the second equation (Figures \ref{fig:fig-djia-max} and \ref{fig:fig-djia-min}), corresponding to the DJIA's returns, we see a similar behaviour as in S&P500's networks. The transition probability from the second state to the third state is higher for the maximum value of $spread_{t-1}$ and the transition probability from the third state to the first state is higher when we consider the minimum value of $spread_{t-1}$. Although, the difference of this last probability between the minimum and maximum value of $spread_{t-1}$ is not as big as in S&P500. Overall, the rest of the probabilities structure, remains the same. + +```{r fig-djia-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. The probability of 0.58 refers to the transition from state 2 to state 3.", out.width='60%', echo=FALSE} +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) +``` + +```{r fig-djia-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. 
The highest probability of 0.51 refers to the transition from state 2 to state 2.", out.width='60%', echo=FALSE} +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) +``` + +# Conclusions, limitations and further research + +Several proposals for the inclusion of exogenous variables in MMC models have been presented. The main limitations were associated with the high complexity of the models to be developed and estimated. Additionally, most models considered only categorical exogenous variables, resulting in a lack of focus on continuous exogenous variables. This work proposes a new approach to include continuous exogenous variables in @Ching2002 model for multivariate Markov chain. This is relevant because it allows studying the effect of previous series and exogenous variables on the transition probabilities. The model is based on @Ching2002 MMC model but considers non-homogeneous Markov chains. Thus, the probabilities that compose the model are dependent on exogenous variables. These probabilities are estimated as a usual non-homogeneous Markov chain through a multinomial logit model. The model parameters are then estimated through MLE, as well as the standard errors. We developed a package with the estimation function of the model proposed. In this, we considered the Augmented Lagrangian optimization method for estimating the parameters through MLE. Additionally, we designed a Monte Carlo simulation study to assess this model's test power and dimension. The results showed that the model detected a non-homogeneous Markov chain. Moreover, an empirical illustration demonstrated the relevance of this new model by estimating the probability transition matrix for different exogenous variable values. Ignoring the effect of exogenous variables in MMC means that we would not detect the probabilities' changes according to the covariates' values. In this setting, one would have a limited view of the studied process. 
Hence, this approach allows us to understand how a specific variable influences a specific process. The main contributions of this work are the development of a package with functions for multivariate Markov chains, addressing the statistical inference in these models and the inclusion of covariates. The limitations are related to the implementation in R, specifically the optimization algorithm applied is not common for MMC models, in that sense, it would be beneficial to study new approaches to optimizing the maximum likelihood function as further research. Additionally, extending this generalization to the MTD-probit model proposed by @Nicolau2014 would also be relevant, which removes the constraints of the model's parameters and allows the model to detect negative effects. diff --git a/_articles/RJ-2024-006/RJ-2024-006.html b/_articles/RJ-2024-006/RJ-2024-006.html new file mode 100644 index 0000000000..38df34b447 --- /dev/null +++ b/_articles/RJ-2024-006/RJ-2024-006.html @@ -0,0 +1,3403 @@ + + + + + + + + + + + + + + + + + + + + + + GenMarkov: Modeling Generalized Multivariate Markov Chains in R + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    GenMarkov: Modeling Generalized Multivariate Markov Chains in R

    + + + +

    This article proposes a new generalization of the Multivariate Markov Chains (MMC) model. The future values of a Markov chain commonly depend on only the past values of the chain in an autoregressive fashion. The generalization proposed in this work also considers exogenous variables that can be deterministic or stochastic. Furthermore, the effects of the MMC’s past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. The Monte Carlo simulation study findings showed that our model consistently detected a non-homogeneous Markov chain. Besides, an empirical illustration demonstrated the relevance of this new model by estimating probability transition matrices over the space state of the exogenous variable. An additional and practical contribution of this work is the development of a novel R package with this generalization.

    +
    + + + +
    +

    1 Introduction

    +

    Multivariate Markov chains (MMC) have a wide range of applications, in various fields. Hence, several studies and generalizations of the MMC models have been made. However, the availability of packages that allow the estimation and application of these models are scarce, and most of these methods use algorithms and software that are not broadly available or can only be applied in particular situations. In the last few years, R software has been gaining importance in the field of statistical computing. This phenomenon might be because it is free and open-source software, which compiles and runs on a wide variety of operating systems. Specifically, in R software, there are some available packages related to Markov chains (MC) and MMC. For example, the march package (Berchtold et al. 2020; Maitre and Emery 2020) allows the computation of various Markovian models for categorical data, including homogeneous Markov chains of any order, MTD models, Hidden Markov models, and Double Chain Markov Models. Ogier Maitre developed this package with contributions from Andre Berchtold, Kevin Emery, Oliver Buschor, and Andre Berchtold maintains it. All the models computed by this package are for univariate categorical data. The markovchain package (Spedicato 2017) contains functions and methods to create and manage discrete-time Markov chains. In addition, it includes functions to perform statistical and probabilistic analysis (analysis of their structural proprieties). Finally, the DTMCPack package (Nicholson 2013) contains a series of functions that aid in both simulating and determining the properties of finite, discrete-time, discrete-state Markov chains. There are two main functions: DTMC and MultDTMC, which produce \(n\) iterations of a Markov Chain(s) based on transition probabilities and an initial distribution given by the user, for the univariate and multivariate case, respectively. This last package is the only one available in R for MMC. 
In general, the work on MMC models is mostly based on improving the estimation methods and/or making the model more parsimonious. In this work, we aim to develop a new generalization that considers exogenous variables. Specifically, the effects of the MMC’s past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. Additionally, we address statistical inference and implement these methods in an R package. The R package includes three functions: multimtd, multimtd_probit and mmcx. The first two functions estimate the MTD model for multivariate categorical data, with Chings’s specification (Ching et al. 2002) and with the Probit specification (Nicolau 2014), respectively. The last function allows the estimation of our proposed model, the Generalized Multivariate Markov Chain (GMMC) model. The R package, GenMarkov, with these three functions is available in the Comprehensive R Archive Network (CRAN) at https://CRAN.R-project.org/package=GenMarkov.

    +

    2 Multivariate Markov chains

    +

    Markov chains can be appropriate for representing dependencies between successive observations of a random variable. However, when the order of the chain or the number of possible values increases, Markov chains have lack parsimony. In this context, Jacobs and Lewis (1978), Pegram (1980) and Logan (1981) proposed several models for HOMC. Notwithstanding these developments, the Mixture Transition Distribution model (Raftery 1985) proved to be more suitable to model HOMC, which overshadowed the previously proposed models. Several relevant extensions of the MTD model emerged: the Multimatrix MTD (Berchtold 1995, 1996), which allowed modeling the MTD by using a different \(m \times m\) transition matrix for each lag, the Infinite-Lag MTD model that assumes an infinite lag order (\(l = \infty\)), which was first considered by Mehran (1989) and later developed by Le et al. (1996) in a more general context. Finally, the MTD with General State Spaces allowed modeling more general processes with an arbitrary space state (Martin and Raftery 1987; Adke and Deshmukh 1988; Wong and Li 2001). Although the MTD model presents a more parsimonious approach to model Markov chains with order higher than one, it has weaknesses. Namely, when considering more than one data sequence, one represents the MMC as a HOMC, by expanding the state-space. This approach could result in a more complex probability transition matrix. Consequently, this can make the estimation unfeasible as the order, states, and the number of data sequences increase. Additionally, the model assumes the same transition matrix for each lag. In this setting, Ching et al. (2002) determined an alternative to handle the unfeasibility of the conventional multivariate Markov chain (MMC) by proposing a model with fewer parameters. The model developed is essentially the same as the MTD. However, it considers a different \(m \times m\) transition matrix for each lag and considers more than one data sequence. 
In the proposed multivariate Markov chain model, Ching et al. (2002) assume the following relationship:

    +

    Let \(x_t^{(j)}\) be the state vector of the \(j\)th sequence at time \(t\). If the \(j\)th sequence is in state \(l\) at time \(t\) then

    +

    \[\begin{equation} +x_{t+1}^{(j)} = \sum_{k=1}^s \lambda_{jk}P^{(jk)}x_{t}^{(k)}, \text{for } j =1, 2, \dots, s +\tag{1} +\end{equation}\] +where \(0 \leq \lambda_{jk} \leq 1\) for \(j \leq s, k \leq s\) and \(\sum_{k=1}^s \lambda_{jk} =1\) for \(j=1, 2, \dots, s\). The \(\lambda_{jk}\) can be interpreted as the mixing probability of the \(j\)th state to the \(k\)th state.

    +

    The state probability distribution of the \(k\)th sequence at time \((t + 1)\) depends on the weighted average of \(P^{(jk)}x_{t}^{(k)}\) . Here \(P^{(jk)}\) is a transition probability matrix from the states in the \(k\)th sequence to the states in the \(j\)th sequence and \(x_t^{(k)}\) is the state probability distribution of the \(k\)th sequences at time \(t\). In matrix form:

    +

    \[\begin{equation} +\underline{x}_{t+1}^{(j)} \equiv +\left[ +\begin{array}{c} +x_{t+1}^{(1)} \\ +\vdots \\ +x_{t+1}^{(s)} +\end{array} \right ] += +\left[ +\begin{array}{ccc} +\lambda_{11}P^{(11)} & \dots & \lambda_{1s}P^{(1s)}\\ +\vdots & \ddots & \vdots\\ +\lambda_{s1}P^{(s1)}& \dots & \lambda_{ss}P^{(ss)} +\end{array} \right ] +\left[ +\begin{array}{c} +x_{t}^{(1)} \\ +\vdots \\ +x_{t}^{(s)} +\end{array} \right ] +\equiv +Q \underline{x}_{t} +\tag{2} +\end{equation}\] where \(Q\) is an \(ms \times ms\) block matrix (\(s \times s\) blocks of \(m \times m\) matrices) and \(x_t\) is a stacked \(ms\) column vector (\(s\) vectors, each one with \(m\) rows).

    +

    The matrices \(P^{(jk)}\) can be estimated for each data sequence by counting the transition frequency from the states in the \(k\)th sequence to those in the \(j\)th sequence, obtaining the transition frequency matrix for the data sequence. After normalization, the estimates of the transition probability matrices, i.e., \(\widehat{P}^{(jk)}\), are obtained. Regarding the \(\lambda_{jk}\) coefficients, the estimation method proposed by Ching et al. (2002) involves the following optimization problem:

    +

    \[\begin{equation} +min_{\lambda} max_{i} \vert [ \sum_{k=1}^m \lambda_{jk} \widehat{P}^{(jk)} \widehat{\boldsymbol{x}}^{(k)} - \widehat{\boldsymbol{x}}^{(j)} ] \vert +\tag{3} +\end{equation}\]

    +

    \[ \text{s.t. } \sum_{k=1}^s \lambda_{jk} \text{ and } \lambda_{jk} \geq 0 \] Besides this, different models have been proposed for multiple categorical data sequences. Kijima et al. (2002) proposed a parsimonious MMC model to simulate correlated credit risks. Siu et al. (2005) proposed an easy to implement model; however, its applicability was limited by the number of parameters involved. Ching et al. (2008) proposed a simplified model based on an assumption proposed in Zhang et al. (2006). Zhu and Ching (2010) proposed a method of estimation based on minimizing the prediction error with equality and inequality restrictions and Nicolau and Riedlinger (2014) proposed a new approach to estimate MMC which avoids imposing restrictions on the parameters, based on non-linear least squares estimation, facilitating the model estimation and the statistical inference. Berchtold (2003) proposed a MTD model for heteroscedastic time series. Lastly, Wang et al. (2014) proposed a new multivariate Markov chain model to reduce the number of parameters. Thus, generally, the models used in the published papers were developed by Ching et al. (2002) or were a consequent generalization of them and addressed the MMC as an end in itself. In Damásio (2013) and Damásio and Nicolau (2014), a different and innovative concept was proposed: the usage of MMC as regressors in a certain model. Hence, given that the MMC Granger causes a specific dependent variable, and taking advantage of the information about the past state interactions between the MMC categories, it was possible to forecast the current dependent variable more accurately. Other relevant contributions are related to the optimization algorithm, as in Lèbre and Bourguignon (2008) and Chen and Lio (2009), and to empirical applications (Ching et al. 2003; Ching and Ng 2006; Damásio 2018; Damásio and Mendonça 2019, 2020). 
Also, Damásio and Nicolau (2020) proposed a new methodology for detecting and testing the presence of multiple structural breaks in a Markov chain occurring at unknown dates. In the vast majority of MMC models’ studies, a positive correlation between the different data sequences is assumed due to the restrictions imposed. This aspect means it is always considered that at moment \(t\), an increase in a state probability for a data sequence has an increasing impact on another data sequence, for time \(t+1\). Thereupon, if one has a negative correlation between series, the parameter estimates are forced to be zero. The solution to this problem is very straightforward; one can relax the assumptions and not assume the constraints. However, that means the results produced by the model will no longer be probabilities. Raftery and Tavaré (1994) presented an alternative, by dropping the positivity condition and imposing another set of restrictions. Ching et al. (2008) also tackled this issue and proposed a method where one splits the \(Q\) matrix into the sum of two other matrices and one represents the positive correlations and another the negative correlations. Also, in Nicolau (2014), a specification completely free from constraints, inspired by the MTD model, was proposed, facilitating the estimation procedure and, at the same time, providing a more accurate specification for \(P_j(i_0 | i_1, \dots, i_s)\). The model was:

    +

    \[\begin{equation} +P_j(i_0 | i_1, \dots, i_s) = P_j^{\Phi}(i_0 | i_1, \dots, i_s) := +\\ +\frac{\Phi(\eta_{j0} + \eta_{j1}P(i_0|i_1) + \dots + \eta_{js}P(i_0|i_s))}{\sum_{k=1}^m \Phi(\eta_{j0} + \eta_{j1}P(k|i_1) + \dots + \eta_{js}P(k|i_s))} +\tag{4} +\end{equation}\] where \(n_{ji} \in \mathbb{R}(j = 1, \dots, s; i = 1, \dots, m)\) and \(\Phi\) is the (cumulative) standard normal distribution function.

    +

    This specification is denoted as the MTD-Probit model. The log-likelihood is given by: \[\begin{equation} +LL = \sum_{i_1, i_2, \dots, i_{i_s}, i_0} n_{i_1, i_2, \dots, i_{i_s}, i_0} log(P_j^{\Phi}(i_0 | i_1, \dots, i_s) ) \tag{5} +\end{equation}\] and the maximum likelihood estimator is defined, as usual, as \(\widehat{\eta} = \text{arg max}_{n_{j1}, \dots, n_{js}} LL\). The parameters \(P_{jk}(i_0|i_1)\), \(k\) =\(1, \dots, s\) can be estimated in advance, through the consistent and unbiased estimators proposed by Ching et al. (2002):

    +

    \[\begin{equation} +\widehat{P}_{jk}(i_0|i_1) = \frac{n_{i_1i_0}}{\sum_{i_0=1}^n n_{i_1 i_0}} \tag{6} +\end{equation}\] This specification can be superior to the MTD because the estimation procedure is easier, and the standard numerical optimization routines can be easily applied in the absence of constraints. However, similarly to the standard MTD, the likelihood is not a strictly concave function on the entire parameter state-space, thus the choice of starting values is still important. Additionally, the model describes a broader range of possible dependencies since the parameters are not constrained. Moreover, this proposed model is more accurate than the MTD model. For more details on this, see Nicolau (2014).

    +

    Overall, the published work on MMC models was mostly based on improving the estimation methods and/or making the model more parsimonious. In Damásio (2013) and Damásio and Nicolau (2014), a different approach was used, and the work developed focused on the usage of MMC as regressors in a certain model. Notably, it showed that an MMC can improve the forecast of a dependent variable. In a way, it demonstrated that an MMC can be an end in itself, but it can be an instrument to reach an end or a purpose. In this work, the opposite will be developed: instead of considering an MMC as regressors, a model in which a vector with pre-determined exogenous variables is part of \(\mathcal{F}_{t-1}\) is proposed.

    +

    3 Covariates in Markov chain models

    +

    Regarding the inclusion of covariates in Markov chains models, Regier (1968) proposed a two-state Markov chain model, where the transition matrix probabilities were a function of a parameter, \(q\), that described the tendency of the subject to move from state to state. Kalbfleisch and Lawless (1985) proposed a panel data analysis method under a continuous-time Markov model that could be generalized to handle covariate analysis and the fitting of certain non-homogeneous models. This work overcame the limitations of Bartholomew (1968), Spilerman and Singer (1976) and Wasserman (1980) methodologies, by developing a new algorithm that provided a very efficient way of obtaining maximum likelihood estimates. Also, Muenz and Rubinstein (1985) developed a Markov model for covariates dependence of binary sequences, where the transitions probabilities were estimated through two logistic regressions that depended on a set of covariates. Essentially, Muenz and Rubinstein (1985) modeled a non-homogeneous Markov chain through logistic regression, considering only two states. Islam et al. (2004) developed an extension of this model considering three states, and Islam and Chowdhury (2006) generalized this approach for HOMC. Additionally, Azzalini (1994) proposed a model to study the influence of time-dependent covariates on the marginal distribution of a binary response in serially correlated binary data, where Markov chains are expressed in terms of transitional probabilities. Jackson (2011) proposed a Markov model for panel data, which allowed for the transitions intensities to vary between individuals or constant time-dependent covariates. Specifically, this work allowed to account for different intensities throughout transitions of states and include individual-specific covariates. The time-inhomogeneos model proposed is restricted to piecewise-constant intensities. The implementation of this work is available in the package msm. 
More recently, Bolano (2020) proposed an MTD-based approach to handle categorical covariates, that considers each covariate separately and combines the effects of the lags of the MTD and the covariates employing a mixture model. Specifically, the model is given by:

    +

    \[\begin{equation} +P(X_t = k \mid X_{t-1} = i, C_1 = c_1, \dots, C_l = c_l) \approx \theta_0 a_{ik} + \sum_{h=1}^l \theta_h d_{c_{h}k} \tag{7} +\end{equation}\]

    +

    where \(a_{ik}\) is the transition probability from state \(i\) to state \(k\), as in a conventional Markov chains and \(d_{c_{h}k}\) is the probability of observing the states \(k\) given the modality \(c_h\) of the covariate \(h\). Lastly, \(\theta_0, \dots, \theta_l\) are the weights of the explanatory elements of the model.

    +

    According to the literature presented, several researchers have proposed methodologies or generalizations to include covariates in Markov chain models. Primarily for social sciences and health applications, where the transition probabilities were generally modeled through logistic regression. However, there has been an increased focus on categorical covariates, opposing continuous covariates and a lack of approaches to multivariate Markov chain models. Thus, with this work, we aim to tackle this research gap.

    +

    4 Multivariate Markov chains with covariates

    +

    4.1 Theoretical model

    +

    In this work, a new generalization of Ching et al. (2002) MMC model is presented: the GMMC model, that is, we will consider exogeneous or pre-determined covariates in the \(\sigma\) - algebra generated by the available information until \(t-1\) (\(\mathcal{F}_{t-1}\)). These variables can be deterministic or stochastic and do not necessarily need to be reported at time \(t\). Broadly, the model is given by:

    +

    \[\begin{equation} P(S_{jt} = k | \mathcal{F}_{t-1} ) = P(S_{jt} = k | S_{1t-1} = i_1, S_{2t-1} = i_2, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \tag{8} \end{equation}\] We can specify this model as proposed by Ching et al. (2002) with Raftery’s notation:

    +

    \[\begin{multline} P(S_{jt} = i_0 | S_{1t-1} = i_1,\dots, S_{st-1} = i_s, \boldsymbol{x}_t) \equiv \\ \lambda_{j1}P(S_{jt} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_t) + \dots + \lambda_{js}P(S_{jt} = i_0 | S_{st-1} = i_s, \boldsymbol{x}_t) \tag{9} \end{multline}\] subject to the usual constraints.

    +

    4.2 Estimation and inference

    +

    This proposed model is estimated through MLE, similar to the standard MTD model. The log-likelihood is given by:

    +

    \[\begin{equation} LL = \sum_{t = 1}^n \log P(S_{jt} = i_0 | S_{1t-1} = i_1, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \tag{10} \end{equation}\]

    +

    Additionally, the probabilities can be estimated through a multinomial logit model. The proof of consistency and asymptotic distribution is available in the Supplementary Material section.

    +

    4.3 Monte Carlo simulation study

    +

    A Monte Carlo simulation study was designed to evaluate the dimension and power of the test parameters of the proposed model. The R statistical environment was used for all computations. This simulation study was comprised of two parts.

    +

    Part I: Detect a non-homogeneous Markov chain

    +

    First, we considered two sequences with two and three states. The main goal was to assess if the model detected the presence of a non-homogeneous Markov chain correctly and if the estimate of the parameter would correspond to the expected value. So, given two sequences, one generated through a non-homogeneous Markov chain and the other generated through a homogeneous Markov chain, it would be expected that the parameter associated with the transition probabilities of the first sequence would be one and the parameter associated with the transition probabilities of the second sequence would be zero. With this in mind, the transition probabilities of the first sequence were estimated through a logistic regression, where the parameters of this regression were randomly generated in R, and the second sequence was generated through a first-order Markov chain. Hence, for both numbers of states considered, it was expected that the estimated regression would be:

    +

    \[\begin{multline} P(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) = \\ 1 \times P(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}) + 0 \times P(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) \tag{11} \end{multline}\]

    +

    To assess the test power and dimension, we used the Wald test with the following hypothesis:

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Table 1: Power and dimension of test assessment +
    + +Hypothesis + +Test +
    +Power + +\(H_0: \lambda_{11} = 0\) + +\(\frac{\widehat{\lambda}_{11}^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}\) +
    + +\(H_0: \lambda_{12} = 1\) + +\(\frac{(\widehat{\lambda}_{12}-1)^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}\) +
    +Dimension + +\(H_0: \lambda_{11} = 1\) + +\(\frac{(\widehat{\lambda}_{11}-1)^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}\) +
    + +\(H_0: \lambda_{12} = 0\) + +\(\frac{\widehat{\lambda}_{12}^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}\) +
    +
    +
    + +
    +

    The simulation procedure was performed as follows:

    +
      +
    1. Generate the values of the coefficients for the probability transition matrix of series \(S_{1t}\) randomly;
    2. +
    3. Generate the probability transition matrix of series \(S_{2t}\) randomly;
    4. +
    5. Set the initial value of \(S_{2t}\) to 1 and simulate the following from the defined probability transition matrix;
    6. +
    7. In each iteration (of 1000 repetitions), +
        +
      • Generate \(X_t \sim N(2,25)\);
      • +
      • Generate the time-varying probabilities of series \(S_{1t}\) through the values of the fixed coefficients and the lagged variable \(x_t\);
      • +
      • Set the initial values of the series \(S_{1t}\) as 1;
      • +
      • For each period \(t\), simulate the next state of \(S_{1t}\) from the probabilities simulated for that moment;
      • +
      • Estimate the model through the function mmcx;
      • +
      • Calculate the Wald test and add to the counter if it is rejected.
      • +
    8. +
    +
    +
    +Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. The proposed model detects the presence of non-homogenenous Markov Chain. +

    +Figure 1: Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. The proposed model detects the presence of non-homogenenous Markov Chain. +

    +
    +
    +

    Considering two states, the test dimension was at 5.7% with a sample size of 100 observations, slightly increased with 500 observations, and returned to the expected values in 1000 and 5000 observations. For a sample size of 100, 500, and 1000 observations, we have low test power. So, when considering two states, the sample must have at least 5000 observations, or, if that is not possible, consider a higher significance level when testing for individual significance.

    +
    +
    +Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. The proposed model detects the presence of non-homogenenous Markov Chain. +

    +Figure 2: Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. The proposed model detects the presence of non-homogenenous Markov Chain. +

    +
    +
    +

    Considering three states, the test dimension was 9.7% for a sample size of 100 observations, 0.2% for a sample size of 500 observations, and 0.3% for a sample size of 1000. Regarding the test power, we see similar behavior, for a sample of 100 observations, the test power was 90.5%, and from a sample of 500 observations, we reach a test power of 100%. Thus, when considering three states, one may consider a sample of 500 observations without compromising the test power and dimension.

    +
    +

    Part II: Detecting Parameters Assigned Values

    +

    Secondly, we performed a simulation study where we considered two non-homogeneous Markov chains with two states. Here, the main goal was to assess if the model correctly detected the assigned parameters. So, in this case, we started by generating the terms of the model proposed. These terms were estimated through logistic regression, and the parameters of this regression were randomly generated in R. Similarly to Part I, we considered a Wald test to assess the power and dimension of the test. The simulation procedure was performed as follows:

    +
      +
    1. Generate the values of the coefficients to calculate the probability transition matrices randomly;
    2. +
    3. In each iteration (of 1000 repetitions), +
        +
      • Generate \(\{x_t\} \sim N(2,25)\);
      • +
      • Generate the probabilities \(P \left(S_{jt}|S_{st-1}, x_{t-1} \right)\), with \(j=1,2\) and \(s=1,2\).
      • +
      • Set the initial values of the series \(S_{1t}\) and \(S_{2t}\) as 1;
      • +
      • For each period \(t\), calculate the probabilities \(P \left(S_{1t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) and \(P \left( S_{2t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) through the assigned values of the \(\lambda\)’s. Considering the calculated probabilities, simulate the next state for each series, \(S_{1t}\) and \(S_{2t}\).
      • +
      • Estimate the model through the function mmcx;
      • +
      • Calculate the Wald test and add to the counter if it is rejected.
      • +
    4. +
    +

    The probabilities \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) and \(P\left(S_{1t}|S_{2t-1}, x_{t-1}\right)\) presented some differences regarding its values’ distributions. Specifically, \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) had more extreme probabilities values, with the minimum value being close to 0 and the maximum value being close to 1. And, the probabilities \(P\left(S_{1t}|S_{2t-1}, x_{t-1} \right)\) had more moderate values, with the minimum value being, on average, 0.3 and the maximum value, 0.7. When the probabilities have values close to 1, one says that the states/regimes are persistent. We calculated the power and dimension of test for each value of \(\lambda\) when the estimated probabilities are moderate and when they are extreme. Hence, considering equation 1:

    +

    \[\begin{multline} P\left(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) = \\ \lambda_{11}P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}\right) + \lambda_{12}P\left(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) \tag{12} \end{multline}\]

    +

    The parameter \(\lambda_{11}\) will be associated with more extreme probabilities and \(\lambda_{12}\) will be associated with more moderate probabilities.

    +
    +
    +Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. The proposed model has low power of test when low parameter values are associated with persistent states. +

    +Figure 3: Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. The proposed model has low power of test when low parameter values are associated with persistent states. +

    +
    +
    +
    +
    +Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. The results point towards a low test power in this setting. +

    +Figure 4: Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. The results point towards a low test power in this setting. +

    +
    +
    +

    When the states are persistent and the parameter’s value is low (i.e., 0.2 and 0.4), we have low test power. By increasing this value, the power of test increases as well. When the states are not persistent, we do not have a clear pattern regarding the power of test, for a value of the parameter of 0.2, the power of test is still low (although not as low as the first scenario), increases when we have a value of 0.4, decreases when the value is 0.6 and increases again when the value is 0.8. Overall, the estimated standard errors seem high, leading to low test power. Regarding the test dimension, when we have a higher weight associated with the non-persistent states, the test dimension converges to 0. However, when this weight is associated with the persistent states, the test dimension increases with the sample size, reaching a value of 10% in some cases. Hence, one must use a 10% significance level to perform statistical inference on the parameters in this situation.

    +

    4.4 Software implementation

    +

    Regarding the software implementation for each function, for the multi.mtd function the estimation method was the one presented in Berchtold (2001), applied to the multivariate case. For multi.mtd_probit, a package for numerical maximization of the log-likelihood, maxLik (Henningsen and Toomet 2011), was used. This package performs Maximum Likelihood estimation through different optimization methods that the user can choose. The optimization methods available are Newton-Raphson, Broyden-Fletcher-Goldfarb-Shanno (BFGS), the BFGS algorithm implemented in R, Berndt-Hall-Hall-Hausman, Simulated ANNealing, Conjugate Gradients, and Nelder-Mead. Finally, for the mmcx function, a different approach was used. Unlike the MTD-Probit, the model proposed has equality and inequality restrictions in the parameters. The maxLik (Henningsen and Toomet 2011) package only allows one type of restriction for each Maximum Likelihood estimation, so it was not possible to use this package to estimate the proposed model with exogenous variables. Hence, the algorithm used was the Augmented Lagrangian method, available in the alabama (Varadhan 2015) package through the function auglag. This estimation method for the proposed model is not very common, however, it has been applied to Markov chain models (Rajarshi 2013). The GMMC model’s probabilities were estimated through a Multinomial Logit using multinom of the nnet package (Venables and Ripley 2002).

    +

    Additionally, the hessian matrices were also computed, which allowed performing statistical inference. The maxLik and auglag compute the Hessian matrices with the estimates. For the function multimtd, since the optimization procedure of Berchtold (2001) was used, the hessian was computed through the second partial derivatives. The function multi.mtd requires the following elements:

    +
      +
    • y, a matrix of the categorical data sequences.

    • +
    • deltaStop, the delta below which the optimization phases of the parameters stop.

    • +
    • is_constrained, flag indicating whether the function will consider the usual set of constraints (usual set: , new set of constraints: ).

    • +
    • delta, the amount of change to increase/decrease in the parameters for each iteration of the optimization algorithm.

    • +
    +

    The last three arguments concern the optimization procedure. For more details see Berchtold (2001). Considering two vectors of two categorical data sequences, s1 and s2, to estimate the model and obtain the results:

    +
    +
    +
    multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1)
    +
    +
    +

    The function multi.mtd_probit requires the following arguments:

    +
      +
    • y, a matrix of the categorical data sequences.
    • +
    • initial, a vector of the initial values of the parameters.
    • +
    • nummethod, the numerical maximization method, currently either “NR” (for Newton-Raphson), “BFGS” (for Broyden-Fletcher-Goldfarb-Shanno), “BFGSR” (for the BFGS algorithm implemented in R), “BHHH” (for Berndt-Hall-Hall-Hausman), “SANN” (for Simulated ANNealing), “CG” (for Conjugate Gradients), or “NM” (for Nelder-Mead). Lower-case letters (such as “nr” for Newton-Raphson) are allowed. The default method is “BFGS”. For more details see maxLik (Henningsen and Toomet 2011) package.
    • +
    +

    Considering two vectors of two categorical data sequences, s1 and s2 again, to estimate the model and obtain the results with the BFGS maximization method:

    +
    +
    +
    multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs')
    +
    +
    +

    Finally, the function mmcx requires the following elements:

    +
      +
    • y, a matrix of categorical data sequences.
    • +
    • x, a matrix of covariates (exogeneous variables).
    • +
    • initial, a vector of the initial values of the parameters.
    • +
    +

    Considering two vectors of two categorical data sequences, s1 and s2, and a vector of an exogenous variable, x, to estimate the model and obtain the results:

    +
    +
    +
    mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1))
    +
    +
    +

    These functions return a list with the parameter estimates, standard errors, z-statistics, p-values, and the log-likelihood function value for each equation.

    +

    The package offers an additional function that allows to obtain the transition probability matrices of mmcx considering a specific value of x defined by the user. The function is MMC_tpm and requires the following elements:

    +
      +
    • s, a matrix of categorical data sequences.
    • +
    • x, a matrix of covariates (exogeneous variables).
    • +
    • value, a single value of x, to condition the probability transition matrices.
    • +
    • result, a list returned by the function mmcx containing the model’s estimates.
    • +
    +

    Considering two vectors of two categorical data sequences, s1 and s2, a vector of an exogenous variable, x, and res, the list returned by the function mmcx, to obtain the transition probability matrices:

    +
    +
    +
    MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res)
    +
    +
    +

    The function returns an array containing the probability transition matrices, conditioned on a specific value of x, for each equation.

    +

    5 Illustration

    +

    Markov chain models are used in interdisciplinary areas, such as economics, business, biology, and engineering, with applications to predict long-term behavior from traffic flow to stock market movements, among others. Modeling and predicting stock market returns is particularly relevant for investors and policy makers. Since the stock market is a volatile environment, and the returns are difficult to predict, estimating the set of probabilities that describe these movements might provide relevant input. Additionally, incorporating the effect of key macroeconomic variables could provide a more accurate picture of this specific environment.

    +

    The following empirical illustration aims to model stock returns of two indexes as a function of the interest rate spread, specifically the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity.

    +

    The interest rate spread is a key macroeconomic variable and provides valuable information regarding the economy state. Specifically, it has been used to forecast recessions as in Estrella and Mishkin (1996), Dombrosky and Haubrich (1996), Chauvet and Senyuz (2016), Tian and Shen (2019) and McMillan (2021). Generically, short-term yields are lower than long-term yields when the economy is in expansion. On the other hand, short-term yields are higher than long-term yields when the economy is in recession. The difference between these yields (or, more specifically, the yield curve’s slope) can be used to forecast the state of the economy. Hence, this indicator might provide relevant input for investors.

    +

    We considered the 5-week-day daily stock returns (\(r_t=100 \times \log(P_t/P_{t-1})\), where \(P_t\) is the adjusted close price) of two indexes, S&P500 and DJIA, from November \(11^{th}\) 2011 to September \(1^{st}\) 2021 (2581 observations). Additionally, we considered the interest rate spread (\(spread_{t}\)), the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. The data was retrieved from FRED. Below, we have the descriptive statistics of these variables.

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Table 2: Summary statistics of \(stockreturns\) dataset +
    +Variable + +Minimum + +1\(^{st}\) Quantile + +Median + +Mean + +3\(^{rd}\) Quantile + +Maximum +
    +\(spread_{t}\) + +-0.52 + +0.92 + +1.54 + +1.454 + +2.03 + +2.97 +
    +\(r_{t;SP500}\) + +-12.765 + +-0.32 + +0.07 + +0.054 + +0.518 + +8.968 +
    +\(r_{t;DJIA}\) + +-13.842 + +-0.327 + +0.071 + +0.046 + +0.508 + +10.764 +
    +
    +
    + +
    +

    Moreover, to apply the model proposed, it is necessary to have a categorical time series, thus we applied the following procedure:

    +

    \[ S_{st}= \begin{cases} 1, & r_t \leq \widehat{q}_{s;0.25}\\ 2, & \widehat{q}_{s;0.25} < r_t < \widehat{q}_{s;0.75} \\ 3, & r_t \geq \widehat{q}_{s;0.75} \end{cases} \]

    +

    where \(\widehat{q}_{s;\alpha}\) is the estimated quantile of order \(\alpha\) of the marginal distribution of \(r_t\). Considering this illustration and the model proposed, we will have two equations:

    +

    \[\begin{multline} P(S_{sp500,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{11} P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{12} P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1}) \tag{13} \end{multline}\]

    +

    \[\begin{multline} P(S_{djia,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{21} P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{22} P(S_{djia,t} | S_{djia, t-1}, spread_{t-1}) \tag{14} \end{multline}\]

    +

    In Figures 5 to 8, generated using ggplot2 (Wickham 2016) and gridExtra (Auguie 2017), we have the smoothed conditional probabilities of both series, depending on \(spread_{t-1}\). The number of observations is high, and the probabilities varied abruptly in a small time frame, making the plots hard to read. To simplify, a moving average model (from pracma (Borchers 2022)) of order 5, due to the frequency of the data, was adjusted to these probabilities to illustrate how they evolve throughout time. These plots represent the probabilities associated with the parameters of the general model proposed, showcasing how these vary throughout time and the main advantage of this generalization. Instead of having fixed matrices of transition probabilities, we allow for these to vary throughout time, depending on the values of \(spread_{t-1}\). Specifically, Figures 5 and 6 correspond to the non-homogeneous Markov chain used to build the S&P500’s equation and Figures 7 and 8 correspond to the non-homogeneous Markov chain used to build DJIA’s equation. We see a similar behavior within each series regardless of whether it depends on the previous states of \(S_{1t}\) or \(S_{2t}\). Additionally, the scales of the graphs are small, indicating that these probabilities vary around the same set of values.

    +
    + +
    +
    +
    +Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +Figure 5: Estimated conditional probabilities of series 1 (SP500) depending on \(spread_{t-1}\) and on series 1 (SP500) previous state: \(P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})\). This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +
    +
    +
    +
    +Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +Figure 6: Estimated conditional probabilities of series 1 (SP500) depending on \(spread_{t-1}\) and on series 2 (DJIA) previous state: \(P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})\). This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +
    +
    +
    +
    +Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +Figure 7: Estimated conditional probabilities of series 2 (DJIA) depending on \(spread_{t-1}\) and on series 1 (SP500) previous state: \(P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})\). This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +
    +
    +
    +
    +Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +Figure 8: Estimated conditional probabilities of series 2 (DJIA) depending on \(spread_{t-1}\) and on series 2 (DJIA) previous state: \(P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})\). This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function. +

    +
    +
    +
    +

    The model can be estimated through the mmcx function:

    +
    +
    +
    attach(stockreturns)
    +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1))
    +
    +
    --------------------------------------------
    +Equation 1 
    +  Estimate Std. Error t value Pr(>|t|)    
    +1 0.685660   0.171346   4.002    0.000 ***
    +2 0.314340   0.171346   1.835    0.067 *  
    +
    +Log-Likelihood: -2636.355 
    +--------------------------------------------
    +--------------------------------------------
    +Equation 2 
    +  Estimate Std. Error t value Pr(>|t|)    
    +1 0.629993   0.176383   3.572    0.000 ***
    +2 0.370007   0.176383   2.098    0.036 ** 
    +
    +Log-Likelihood: -2636.622 
    +--------------------------------------------
    +
    +

    Considering the first equation, the effect of the probabilities depending on S&P500’s previous state and the interest rate spread has a higher weight on the overall probability. Also, this estimate is highly significant, presenting a \(p\)-value close to zero. The effect of DJIA’s previous state in S&P500 is lower but it is also significant for a 10% significance level. In the second equation, the effect of S&P500’s previous state is higher than DJIA’s and both estimates are highly significant.

    +

    One of the advantages of this approach is the possibility to assess the transition probabilities for specific values of \(x_t\), in this case, the interest rate spread. For both series, we calculated the transition probabilities for this variable’s minimum and maximum value in the sample, which are -0.52 and 2.97, respectively. To obtain the probability transition matrices for these two cases, the code is the following:

    +
    +
    +
    tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, 
    +                   value = max(spread_1), result = res)
    +
    +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, 
    +                   value = min(spread_1), result = res)
    +
    +
    +
    +
    +
    library(markovchain)
    +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9
    +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10
    +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11
    +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12
    +
    +
    +

    In Figures 10 and 9, we have the transition probabilities network for S&P500, corresponding to the minimum and maximum value of the spread. The most noticeable difference between these two networks is regarding the transition probability from the second state to the third state. For the maximum value of \(spread_{t-1}\), the transition probability from the second state to the third state is 0.6. So, when the economy is strong, one might expect to have higher returns, when \(t-1\) was in the second state. However, this scenario shifts when considering the minimum value of \(spread_{t-1}\). The probability of obtaining higher returns, that is, being in state three, becomes almost evenly distributed, regardless of the state in \(t-1\). This indicates the instability of the stock market, when the economy is weaker. Another difference in these networks, is regarding the transition probability from the third state to the first state. For the maximum value of \(spread_{t-1}\), this probability is 0.27 and for the minimum value increases to 0.44. This is also expected, since when the economy is weaker, the probability of having lower returns is greater.

    +
    +
    +Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3. +

    +Figure 9: Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread\(_{t-1}\). The highest probability of 0.6 refers to the transition from state 2 to state 3. +

    +
    +
    +
    +
    +Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. The highest probability of 0.56 refers to the transition from state 2 to state 2. +

    +Figure 10: Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread\(_{t-1}\). The highest probability of 0.56 refers to the transition from state 2 to state 2. +

    +
    +
    +

    Considering the second equation (Figures 11 and 12), corresponding to the DJIA’s returns, we see a similar behavior as in S&P500’s networks. The transition probability from the second state to the third state is higher for the maximum value of \(spread_{t-1}\) and the transition probability from the third state to the first state is higher when we consider the minimum value of \(spread_{t-1}\). However, the difference in this last probability between the minimum and maximum value of \(spread_{t-1}\) is not as big as in S&P500. Overall, the rest of the probability structure remains the same.

    +
    +
    +Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. The probability of 0.58 refers to the transition from state 2 to state 3. +

    +Figure 11: Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread\(_{t-1}\). The probability of 0.58 refers to the transition from state 2 to state 3. +

    +
    +
    +
    +
    +Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. The highest probability of 0.51 refers to the transition from state 2 to state 2. +

    +Figure 12: Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread\(_{t-1}\). The highest probability of 0.51 refers to the transition from state 2 to state 2. +

    +
    +
    +

    6 Conclusions, limitations and further research

    +

    Several proposals for including of exogenous variables in MMC models have been presented. The main limitations were associated with the high complexity of the models to be developed and estimated. Additionally, most models considered only categorical exogenous variables, existing a lack of focus on continuous exogenous variables. This work proposes a new approach to include continuous exogenous variables in Ching et al. (2002) model for multivariate Markov chain. This is relevant because it allows studying the effect of previous series and exogenous variables on the transition probabilities. The model is based on Ching et al. (2002) MMC model but considers non-homogeneous Markov chains. Thus, the probabilities that compose the model are dependent on exogenous variables. These probabilities are estimated as a usual non-homogeneous Markov chain through a multinomial logit model. The model parameters are then estimated through MLE, as well as the standard errors. We developed a package with the estimation function of the model proposed. In this, we considered the Augmented Lagrangian optimization method for estimating the parameters through MLE. Additionally, we designed a Monte Carlo simulation study to assess this model’s test power and dimension. The results showed that the model detected a non-homogeneous Markov chain. Moreover, an empirical illustration demonstrated the relevance of this new model by estimating the probability transition matrix for different exogenous variable values. Ignoring the effect of exogenous variables in MMC means that we would not detect the probabilities’ changes according to the covariates’ values. In this setting, one would have a limited view of the studied process. Hence, this approach allows us to understand how a specific variable influences a specific process. 
The main contributions of this work are the development of a package with functions for multivariate Markov chains, addressing the statistical inference in these models and the inclusion of covariates. The limitations are related to the implementation in R, specifically the optimization algorithm applied is not common for MMC models, in that sense, it would be beneficial to study new approaches to optimizing the maximum likelihood function as further research. Additionally, extending this generalization to the MTD-probit model proposed by Nicolau (2014) would also be relevant, which removes the constraints of the model’s parameters and allows the model to detect negative effects.

    +
    +

    6.1 Supplementary materials

    +

    Supplementary materials are available in addition to this article. They can be downloaded at RJ-2024-006.zip

    +

    6.2 CRAN packages used

    +

    march, markovchain, DTMCPack, GenMarkov, msm, maxLik, alabama, nnet, ggplot2, gridExtra, pracma

    +

    6.3 CRAN Task Views implied by cited packages

    +

    ChemPhys, DifferentialEquations, Distributions, Econometrics, Finance, MachineLearning, NumericalMathematics, Optimization, Phylogenetics, Spatial, Survival, TeachingStatistics

    +
    +
    +S. R. Adke and S. R. Deshmukh. Limit Distribution of a High Order Markov Chain. Journal of the Royal Statistical Society. Series B (Methodological), 50(1): 105–108, 1988. URL https://www.jstor.org/stable/2345812. +
    +
    +B. Auguie. gridExtra: Miscellaneous functions for "grid" graphics. 2017. URL https://CRAN.R-project.org/package=gridExtra. R package version 2.3. +
    +
    +A. Azzalini. Logistic regression for autocorrelated data with application to repeated measures. Biometrika, 81(4): 767–775, 1994. DOI 10.1093/biomet/81.4.767. +
    +
    +J. Bartholomew. Stochastic Models for Social Processes. The Australian and New Zealand Journal of Sociology, 4(2): 171–172, 1968. DOI https://doi.org/10.1177/144078336800400215. +
    +
    +A. Berchtold. Autoregressive Modelling of Markov Chains. Proc. 10th International Workshop on Statistical Modelling, 104: 19–26, 1995. DOI 10.1007/978-1-4612-0789-4_3. +
    +
    +A. Berchtold. Estimation in the mixture transition distribution model. Journal of Time Series Analysis, 22(4): 379–397, 2001. DOI https://doi.org/10.1111/1467-9892.00231. +
    +
    +A. Berchtold. Mixture transition distribution (MTD) modeling of heteroscedastic time series. Computational Statistics and Data Analysis, 41(3-4): 399–411, 2003. DOI 10.1016/S0167-9473(02)00191-3. +
    +
    +A. Berchtold. Modélisation autorégressive des chaines de Markov : utilisation d’une matrice différente pour chaque retard. Revue de Statistique Appliquée, 44(3): 5–25, 1996. URL http://www.numdam.org/item/RSA_1996__44_3_5_0/. +
    +
    +A. Berchtold, O. Maitre and K. Emery. Optimization of the mixture transition distribution model using the march package for R. Symmetry, 12(12): 1–14, 2020. DOI 10.3390/sym12122031. +
    +
    +D. Bolano. Handling covariates in markovian models with a mixture transition distribution based approach. Symmetry, 12(4): 2020. DOI 10.3390/SYM12040558. +
    +
    +H. W. Borchers. Pracma: Practical numerical math functions. 2022. URL https://CRAN.R-project.org/package=pracma. R package version 2.4.2. +
    +
    +M. Chauvet and Z. Senyuz. A dynamic factor model of the yield curve components as a predictor of the economy. International Journal of Forecasting, 32(2): 324–343, 2016. DOI https://doi.org/10.1016/j.ijforecast.2015.05.007. +
    +
    +D. G. Chen and Y. L. Lio. A Novel Estimation Approach for Mixture Transition Distribution Model in High-Order Markov Chains. Communications in Statistics - Simulation and Computation, 38(5): 990–1003, 2009. DOI 10.1080/03610910802715009. +
    +
    +W. K. Ching, E. S. Fung and M. K. Ng. A higher-order markov model for the newsboy’s problem. The Journal of the Operational Research Society, 54(3): 291–298, 2003. +
    +
    +W. K. Ching, E. S. Fung and M. K. Ng. A multivariate markov chain model for categorical data sequences and its applications in demand predictions. IMA Journal of Management Mathematics, 13(3): 187–199, 2002. DOI 10.1093/imaman/13.3.187. +
    +
    +W. K. Ching and M. K. Ng. Markov chains: Models, algorithms and applications. Springer, 2006. DOI 10.1007/0-387-29337-X. +
    +
    +W. K. Ching, M. K. Ng and E. S. Fung. Higher-order multivariate Markov chains and their applications. Linear Algebra and its Applications, 428(2-3): 492–507, 2008. DOI 10.1016/j.laa.2007.05.021. +
    +
    +B. Damásio. Essays on Econometrics: Multivariate Markov Chains. 2018. URL https://www.repository.utl.pt/bitstream/10400.5/18128/1/TD-BD-2019.pdf. +
    +
    +B. Damásio. Multivariate Markov Chains - Estimation, Inference and Forecast. A New Approach: What If We Use Them As Stochastic Covariates? 2013. URL http://hdl.handle.net/10400.5/6397. +
    +
    +B. Damásio and S. Mendonça. Leader-follower dynamics in real historical time: A markovian test of non-linear causality between sail and steam (co-)development, mimeo. 2020. +
    +
    +B. Damásio and S. Mendonça. Modelling insurgent-incumbent dynamics: Vector autoregressions, multivariate Markov chains, and the nature of technological competition. Applied Economics Letters, 26(10): 843–849, 2019. DOI 10.1080/13504851.2018.1502863. +
    +
    +B. Damásio and J. Nicolau. Combining a regression model with a multivariate Markov chain in a forecasting problem. Statistics & Probability Letters, 90: 108–113, 2014. DOI https://doi.org/10.1016/j.spl.2014.03.026. +
    +
    +B. Damásio and J. Nicolau. Time inhomogeneous multivariate Markov chains : detecting and testing multiple structural breaks occurring at unknown dates. REM working papers 0136–2020. Instituto Superior de Economia e Gestão. 2020. URL http://hdl.handle.net/10400.5/20164. +
    +
    +A. M. Dombrosky and J. Haubrich. Predicting real growth using the yield curve. Economic Review, I(Q): 26–35, 1996. URL https://EconPapers.repec.org/RePEc:fip:fedcer:y:1996:i:qi:p:26-35. +
    +
    +A. Estrella and F. S. Mishkin. The yield curve as a predictor of U.S. recessions. Current Issues in Economics and Finance, 2(Jun): 1996. URL https://www.newyorkfed.org/research/current_issues/ci2-7.html. +
    +
    +A. Henningsen and O. Toomet. maxLik: A package for maximum likelihood estimation in R. Computational Statistics, 26(3): 443–458, 2011. URL http://dx.doi.org/10.1007/s00180-010-0217-1. +
    +
    +M. A. Islam, S. Arabia and R. I. Chowdhury. A Three State Markov Model for Analyzing Covariate Dependence. International Journal of Statistical Sciences, 3(i): 241–249, 2004. URL http://www.ru.ac.bd/stat/wp-content/uploads/sites/25/2019/01/P21.V3s.pdf. +
    +
    +M. A. Islam and R. I. Chowdhury. A higher order Markov model for analyzing covariate dependence. Applied Mathematical Modelling, 30(6): 477–488, 2006. DOI 10.1016/j.apm.2005.05.006. +
    +
    +C. Jackson. Multi-state models for panel data: The msm package for R. Journal of statistical software, 38: 1–28, 2011. DOI 10.18637/jss.v038.i08. +
    +
    +P. A. Jacobs and A. W. Lewis. Discrete Time Series Generated by Mixtures II : Asymptotic Properties. Journal of the Royal Statistical Society: Series B (Methodological), 40(2): 222–228, 1978. URL https://www.jstor.org/stable/2984759. +
    +
    +J. D. Kalbfleisch and J. F. Lawless. The analysis of panel data under a Markov assumption. Journal of the American Statistical Association, 80(392): 863–871, 1985. DOI 10.1080/01621459.1985.10478195. +
    +
    +M. Kijima, K. Komoribayashi and E. Suzuki. A multivariate Markov model for simulating correlated defaults. Journal of Risk, 4: 2002. DOI 10.21314/JOR.2002.066. +
    +
    +N. D. Le, R. D. Martin and A. Raftery. Modeling Flat Stretches, Bursts, and Outliers in Time Series Using Mixture Transition Distribution Models. Journal of the American Statistical Association, 91(436): 1504–1515, 1996. DOI 10.1111/j.2517-6161.1985.tb01383.x. +
    +
    +S. Lèbre and P. Y. Bourguignon. An EM algorithm for estimation in the mixture transition distribution model. Journal of Statistical Computation and Simulation, 78(8): 713–729, 2008. DOI 10.1080/00949650701266666. +
    +
    +J. Logan. A structural model of the higher‐order Markov process incorporating reversion effects. The Journal of Mathematical Sociology, 8(1): 75–89, 1981. DOI 10.1080/0022250X.1981.9989916. +
    +
    +O. Maitre and K. Emery. March: Markov chains. 2020. URL https://CRAN.R-project.org/package=march. R package version 3.3.2. +
    +
    +R. D. Martin and A. Raftery. Non-Gaussian State-Space Modeling of Nonstationary Time Series: Comment: Robustness, Computation, and Non-Euclidean Models. Journal of the American Statistical Association, 82(400): 1044–1050, 1987. DOI 10.2307/2289377. +
    +
    +D. G. McMillan. Predicting GDP growth with stock and bond markets: Do they contain different information? International Journal of Finance & Economics, 26(3): 3651–3675, 2021. DOI https://doi.org/10.1002/ijfe.1980. +
    +
    +F. Mehran. Analysis of Discrete Longitudinal Data: Infinite-Lag Markov Models. In Statistical data analysis and inference, pages. 533–541 1989. Amsterdam: North-Holland. ISBN 978-0-444-88029-1. DOI https://doi.org/10.1016/B978-0-444-88029-1.50053-8. +
    +
    +L. R. Muenz and L. V. Rubinstein. Markov Models for Covariate Dependence of Binary Sequences. Biometrics, 41(1): 91–101, 1985. URL http://www.jstor.org/stable/2530646. +
    +
    +W. Nicholson. DTMCPack: Suite of functions related to discrete-time discrete-state markov chains. 2013. URL https://CRAN.R-project.org/package=DTMCPack. R package version 0.1-2. +
    +
    +J. Nicolau. A new model for multivariate markov chains. Scandinavian Journal of Statistics, 41(4): 1124–1135, 2014. DOI 10.1111/sjos.12087. +
    +
    +J. Nicolau and F. I. Riedlinger. Estimation and inference in multivariate Markov chains. Statistical Papers, 56(4): 1163–1173, 2014. DOI 10.1007/s00362-014-0630-6. +
    +
    +G. Pegram. An Autoregressive Model for Multilag Markov Chains. Journal of Applied Probability, 17(2): 350–362, 1980. DOI 10.2307/3213025. +
    +
    +A. Raftery. A Model for High-Order Markov Chains. Journal of the Royal Statistical Society: Series B (Methodological), 47(3): 528–539, 1985. DOI 10.1111/j.2517-6161.1985.tb01383.x. +
    +
    +A. Raftery and S. Tavaré. Estimation and Modelling Repeated Patterns in High Order Markov Chains with the Mixture Transition Distribution Model. Applied Statistics, 43(1): 179–199, 1994. DOI 10.2307/2986120. +
    +
    +M. B. Rajarshi. Statistical inference for discrete time stochastic processes. SpringerBriefs in Statistics, 2013. URL http://www.springer.com/978-81-322-0762-7. +
    +
    +M. H. Regier. A Two-State Markov Model for Behavioral Change. Journal of the American Statistical Association, 63(323): 993–999, 1968. DOI 10.1080/01621459.1968.11009325. +
    +
    +T. K. Siu, W. K. Ching, E. S. Fung and M. K. Ng. On a multivariate Markov chain model for credit risk measurement. Quantitative Finance, 5(6): 543–556, 2005. DOI 10.1080/14697680500383714. +
    +
    +G. A. Spedicato. Discrete Time Markov Chains with R. The R Journal, 2017. URL https://journal.r-project.org/archive/2017/RJ-2017-036/index.html. R package version 0.6.9.7. +
    +
    +S. Spilerman and B. Singer. The Representation of Social Processes by Markov Models. American Journal of Sociology, 82(1): 1–54, 1976. URL https://www.jstor.org/stable/2777460. +
    +
    +R. Tian and G. Shen. Predictive power of markovian models: Evidence from US recession forecasting. Journal of Forecasting, 38(6): 525–551, 2019. DOI https://doi.org/10.1002/for.2579. +
    +
    +R. Varadhan. Alabama: Constrained nonlinear optimization. 2015. URL https://CRAN.R-project.org/package=alabama. R package version 2015.3-1. +
    +
    +W. N. Venables and B. D. Ripley. Modern Applied Statistics with S. Fourth edition. New York: Springer, 2002. URL https://www.stats.ox.ac.uk/pub/MASS4/. ISBN 0-387-95457-0. +
    +
    +C. Wang, T. Z. Huang and W. K. Ching. A new multivariate Markov chain model for adding a new categorical data sequence. Mathematical Problems in Engineering, 2014: 2014. DOI 10.1155/2014/502808. +
    +
    +S. Wasserman. Analyzing social networks as stochastic processes. Journal of the American Statistical Association, 75(370): 280–294, 1980. DOI 10.1080/01621459.1980.10477465. +
    +
    +H. Wickham. ggplot2: Elegant graphics for data analysis. Springer-Verlag New York, 2016. URL https://ggplot2.tidyverse.org. +
    +
    +C. S. Wong and W. K. Li. On a mixture autoregressive conditional heteroscedastic model. Journal of the American Statistical Association, 96(455): 982–995, 2001. DOI 10.1198/016214501753208645. +
    +
    +X. Zhang, M. L. King and R. J. Hyndman. A Bayesian approach to bandwidth selection for multivariate kernel density estimation. Computational Statistics and Data Analysis, 50(11): 3009–3031, 2006. DOI 10.1016/j.csda.2005.06.019. +
    +
    +D. M. Zhu and W. K. Ching. A new estimation method for multivariate Markov chain model with application in demand predictions. Proceedings - 3rd International Conference on Business Intelligence and Financial Engineering, BIFE 2010, 126–130, 2010. DOI 10.1109/BIFE.2010.39. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Vasconcelos & Damásio, "GenMarkov:  Modeling Generalized Multivariate Markov Chains in R", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-006,
    +  author = {Vasconcelos, Carolina and Damásio, Bruno},
    +  title = {GenMarkov:  Modeling Generalized Multivariate Markov Chains in R},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-006},
    +  doi = {10.32614/RJ-2024-006},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {96-113}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-006/RJ-2024-006.pdf b/_articles/RJ-2024-006/RJ-2024-006.pdf new file mode 100644 index 0000000000..25ee47b8e2 Binary files /dev/null and b/_articles/RJ-2024-006/RJ-2024-006.pdf differ diff --git a/_articles/RJ-2024-006/RJ-2024-006.tex b/_articles/RJ-2024-006/RJ-2024-006.tex new file mode 100644 index 0000000000..4e1c43c5d8 --- /dev/null +++ b/_articles/RJ-2024-006/RJ-2024-006.tex @@ -0,0 +1,548 @@ +% !TeX root = RJwrapper.tex +\title{GenMarkov: Modeling Generalized Multivariate Markov Chains in R} + + +\author{by Carolina Vasconcelos and Bruno Damásio} + +\maketitle + +\abstract{% +This article proposes a new generalization of the Multivariate Markov Chains (MMC) model. The future values of a Markov chain commonly depend on only the past values of the chain in an autoregressive fashion. The generalization proposed in this work also considers exogenous variables that can be deterministic or stochastic. Furthermore, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. The Monte Carlo simulation study findings showed that our model consistently detected a non-homogeneous Markov chain. Besides, an empirical illustration demonstrated the relevance of this new model by estimating probability transition matrices over the space state of the exogenous variable. An additional and practical contribution of this work is the development of a novel R package with this generalization. +} + +\section{Introduction}\label{introduction} + +Multivariate Markov chains (MMC) have a wide range of applications, in various fields. Hence, several studies and generalizations of the MMC models have been made. 
However, the availability of packages that allow the estimation and application of these models are scarce, and most of these methods use algorithms and software that are not broadly available or can only be applied in particular situations. In the last few years, R software has been gaining importance in the field of statistical computing. This phenomenon might be because it is free and open-source software, which compiles and runs on a wide variety of operating systems. Specifically, in R software, there are some available packages related to Markov chains (MC) and MMC. For example, the \CRANpkg{march} package \citep{march, Berchtold2020} allows the computation of various Markovian models for categorical data, including homogeneous Markov chains of any order, MTD models, Hidden Markov models, and Double Chain Markov Models. Ogier Maitre developed this package with contributions from Andre Berchtold, Kevin Emery, Oliver Buschor, and Andre Berchtold maintains it. All the models computed by this package are for univariate categorical data. The \CRANpkg{markovchain} package \citep{markovchains} contains functions and methods to create and manage discrete-time Markov chains. In addition, it includes functions to perform statistical and probabilistic analysis (analysis of their structural proprieties). Finally, the \CRANpkg{DTMCPack} package \citep{DTMCPack} contains a series of functions that aid in both simulating and determining the properties of finite, discrete-time, discrete-state Markov chains. There are two main functions: \texttt{DTMC} and \texttt{MultDTMC}, which produce \(n\) iterations of a Markov Chain(s) based on transition probabilities and an initial distribution given by the user, for the univariate and multivariate case, respectively. This last package is the only one available in R for MMC. In general, the work on MMC models is mostly based on improving the estimation methods and/or making the model more parsimonious. 
In this work, we aim to develop a new generalization that considers exogenous variables. Specifically, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. Additionally, we address statistical inference and implement these methods in an R package. The R package includes three functions: \texttt{multimtd}, \texttt{multimtd\_probit} and \texttt{mmcx}. The first two functions estimate the MTD model for multivariate categorical data, with Chings's specification \citep{Ching2002} and with the Probit specification \citep{Nicolau2014}, respectively. The last function allows the estimation of our proposed model, the Generalized Multivariate Markov Chain (GMMC) model. The R package, \CRANpkg{GenMarkov}, with these three functions is available in the Comprehensive R Archive Network (CRAN) at \url{https://CRAN.R-project.org/package=GenMarkov}. + +\section{Multivariate Markov chains}\label{multivariate-markov-chains} + +Markov chains can be appropriate for representing dependencies between successive observations of a random variable. However, when the order of the chain or the number of possible values increases, Markov chains have lack parsimony. In this context, \citet{JacobLewis1978}, \citet{Pegram1980} and \citet{Logan1981} proposed several models for HOMC. Notwithstanding these developments, the Mixture Transition Distribution model \citep{Raftery1985} proved to be more suitable to model HOMC, which overshadowed the previously proposed models. Several relevant extensions of the MTD model emerged: the Multimatrix MTD \citep{Berchtold1995, Berchtold1996}, which allowed modeling the MTD by using a different \(m \times m\) transition matrix for each lag, the Infinite-Lag MTD model that assumes an infinite lag order (\(l = \infty\)), which was first considered by \citet{Mehran1989} and later developed by \citet{Le1996} in a more general context. 
Finally, the MTD with General State Spaces allowed modeling more general processes with an arbitrary space state \citep{Martin1987, Adke1988, Wong2001}. Although the MTD model presents a more parsimonious approach to model Markov chains with order higher than one, it has weaknesses. Namely, when considering more than one data sequence, one represents the MMC as a HOMC, by expanding the state-space. This approach could result in a more complex probability transition matrix. Consequently, this can make the estimation unfeasible as the order, states, and the number of data sequences increase. Additionally, the model assumes the same transition matrix for each lag. In this setting, \citet{Ching2002} determined an alternative to handle the unfeasibility of the conventional multivariate Markov chain (MMC) by proposing a model with fewer parameters. The model developed is essentially the same as the MTD. However, it considers a different \(m \times m\) transition matrix for each lag and considers more than one data sequence. In the proposed multivariate Markov chain model, \citet{Ching2002} assume the following relationship: + +Let \(x_t^{(j)}\) be the state vector of the \(j\)th sequence at time \(t\). If the \(j\)th sequence is in state \(l\) at time \(t\) then + +\begin{equation} +x_{t+1}^{(j)} = \sum_{k=1}^s \lambda_{jk}P^{(jk)}x_{t}^{(k)}, \text{for } j =1, 2, \dots, s +\label{eq:eq1} +\end{equation} +where \(0 \leq \lambda_{jk} \leq 1\) for \(j \leq s, k \leq s\) and \(\sum_{k=1}^s \lambda_{jk} =1\) for \(j=1, 2, \dots, s\). The \(\lambda_{jk}\) can be interpreted as the mixing probability of the \(j\)th state to the \(k\)th state. + +The state probability distribution of the \(k\)th sequence at time \((t + 1)\) depends on the weighted average of \(P^{(jk)}x_{t}^{(k)}\) . 
Here \(P^{(jk)}\) is a transition probability matrix from the states in the \(k\)th sequence to the states in the \(j\)th sequence and \(x_t^{(k)}\) is the state probability distribution of the \(k\)th sequences at time \(t\). In matrix form: + +\begin{equation} +\underline{x}_{t+1}^{(j)} \equiv +\left[ +\begin{array}{c} + x_{t+1}^{(1)} \\ + \vdots \\ + x_{t+1}^{(s)} +\end{array} \right ] += +\left[ +\begin{array}{ccc} +\lambda_{11}P^{(11)} & \dots & \lambda_{1s}P^{(1s)}\\ +\vdots & \ddots & \vdots\\ +\lambda_{s1}P^{(s1)}& \dots & \lambda_{ss}P^{(ss)} +\end{array} \right ] +\left[ +\begin{array}{c} + x_{t}^{(1)} \\ + \vdots \\ + x_{t}^{(s)} +\end{array} \right ] +\equiv +Q \underline{x}_{t} +\label{eq:eq2} +\end{equation} where \(Q\) is an \(ms \times ms\) block matrix (\(s \times s\) blocks of \(m \times m\) matrices) and \(x_t\) is a stacked \(ms\) column vector (\(s\) vectors, each one with \(m\) rows). + +The matrices \(P^{(jk)}\) can be estimated for each data sequence by counting the transition frequency from the states in the \(k\)th sequence to those in the \(j\)th sequence, obtaining the transition frequency matrix for the data sequence. After normalization, the estimates of the transition probability matrices, i.e., \(\widehat{P}^{(jk)}\), are obtained. Regarding the \(\lambda_{jk}\) coefficients, the estimation method proposed by \citet{Ching2002} involves the following optimization problem: + +\begin{equation} +min_{\lambda} max_{i} \vert [ \sum_{k=1}^m \lambda_{jk} \widehat{P}^{(jk)} \widehat{\boldsymbol{x}}^{(k)} - \widehat{\boldsymbol{x}}^{(j)} ] \vert +\label{eq:eq3} +\end{equation} + +\[ \text{s.t. } \sum_{k=1}^s \lambda_{jk} \text{ and } \lambda_{jk} \geq 0 \] Besides this, different models have been proposed for multiple categorical data sequences. \citet{Kijima2002} proposed a parsimonious MMC model to simulate correlated credit risks. 
\citet{Siu2005} proposed an easy to implement model; however, its applicability was limited by the number of parameters involved. \citet{Ching2008} proposed a simplified model based on an assumption proposed in \citet{Zhang2006}. \citet{Zhu2010} proposed a method of estimation based on minimizing the prediction error with equality and inequality restrictions and \citet{Nicolau_2014} proposed a new approach to estimate MMC which avoids imposing restrictions on the parameters, based on non-linear least squares estimation, facilitating the model estimation and the statistical inference. \citet{Berchtold2003} proposed a MTD model for heteroscedastic time series. Lastly, \citet{Wang2014} proposed a new multivariate Markov chain model to reduce the number of parameters. Thus, generally, the models used in the published papers were developed by \citet{Ching2002} or were a consequent generalization of them and addressed the MMC as an end in itself. In \citet{Damasio2013} and \citet{DAMASIO2014}, a different and innovative concept was proposed: the usage of MMC as regressors in a certain model. Hence, given that the MMC Granger causes a specific dependent variable, and taking advantage of the information about the past state interactions between the MMC categories, it was possible to forecast the current dependent variable more accurately. Other relevant contributions are related to the optimization algorithm, as in \citet{Lebre2008} and \citet{ChenLio2009}, and to empirical applications \citep{Ching2003, Ching2006, Damasio2018, Damasio2019, DamasioM2020}. Also, \citet{Damasio2020} proposed a new methodology for detecting and testing the presence multiple structural breaks in a Markov chain occurring at unknown dates. In the vast majority of MMC models' studies, a positive correlation between the different data sequences is assumed due to the restrictions imposed. 
This aspect means it is always considered that at moment \(t\), an increase in a state probability for a data sequence has an increasing impact on another data sequence, for time \(t+1\). Thereupon, if one has a negative correlation between series, the parameter estimates are forced to be zero. The solution to this problem is very straightforward; one can relax the assumptions and not assume the constraints. However, that means the results produced by the model will no longer be probabilities. \citet{Tavare1994} presented an alternative, by dropping the positivity condition and imposing another set of restrictions. \citet{Ching2008} also tackled this issue and proposed a method where one splits the \(Q\) matrix into the sum of two other matrices and one represents the positive correlations and another the negative correlations. Also, in \citet{Nicolau2014}, a specification completely free from constraints, inspired by the MTD model, was proposed, facilitating the estimation procedure and, at the same time, providing a more accurate specification for \(P_j(i_0 | i_1, \dots, i_s)\). The model was: + +\begin{equation} +P_j(i_0 | i_1, \dots, i_s) = P_j^{\Phi}(i_0 | i_1, \dots, i_s) := +\\ + \frac{\Phi(\eta_{j0} + \eta_{j1}P(i_0|i_1) + \dots + \eta_{js}P(i_0|i_s))}{\sum_{k=1}^m \Phi(\eta_{j0} + \eta_{j1}P(k|i_1) + \dots + \eta_{js}P(k|i_s))} + \label{eq:eq4} +\end{equation} where \(n_{ji} \in \mathbb{R}(j = 1, \dots, s; i = 1, \dots, m)\) and \(\Phi\) is the (cumulative) standard normal distribution function. + +This specification is denoted as and MTD-Probit model. The log-likelihood is given by: \begin{equation} +LL = \sum_{i_1, i_2, \dots, i_{i_s}, i_0} n_{i_1, i_2, \dots, i_{i_s}, i_0} log(P_j^{\Phi}(i_0 | i_1, \dots, i_s) ) \label{eq:eq5} +\end{equation} and the maximum likelihood estimator is defined, as usual, as \(\widehat{\eta} = \text{arg max}_{n_{j1}, \dots, n_{js}} LL\). 
The parameters \(P_{jk}(i_0|i_1)\), \(k\) =\(1, \dots, s\) can be estimated in advance, through the consistent and unbiased estimators proposed by \citet{Ching2002}: + +\begin{equation} +\widehat{P}_{jk}(i_0|i_1) = \frac{n_{i_1i_0}}{\sum_{i_0=1}^n n_{i_1 i_0}} \label{eq:eq6} +\end{equation} This specification can be superior to the MTD because the estimation procedure is easier, and the standard numerical optimization routines can be easily applied in the absence of constraints. However, similarly to the standard MTD, the likelihood is not a strictly concave function on the entire parameter state-space, thus the choice of starting values is still important. Additionally, the model describes a broader range of possible dependencies since the parameters are not constrained. Moreover, this proposed model is more accurate than the MTD model. For more details on this, see \citet{Nicolau2014}. + +Overall, the published work on MMC models was mostly based on improving the estimation methods and/or making the model more parsimonious. In \citet{Damasio2013} and \citet{DAMASIO2014}, a different approach was used, and the work developed focused on the usage of MMC as regressors in a certain model. Notably, it showed that an MMC can improve the forecast of a dependent variable. In a way, it demonstrated that an MMC can be an end in itself, but it can be an instrument to reach an end or a purpose. In this work, the opposite will be developed: instead of considering an MMC as regressors, a model in which a vector with pre-determined exogenous variables is part of \(\mathcal{F}_{t-1}\) is proposed. + +\section{Covariates in Markov chain models}\label{covariates-in-markov-chain-models} + +Regarding the inclusion of covariates in Markov chains models, \citet{Regier1968} proposed a two-state Markov chain model, where the transition matrix probabilities were a function of a parameter, \(q\), that described the tendency of the subject to move from state to state. 
\citet{Kalbfleisch1985} proposed a panel data analysis method under a continuous-time Markov model that could be generalized to handle covariate analysis and the fitting of certain non-homogeneous models. This work overcame the limitations of \citet{Bart1968}, \citet{Spilerman1976} and \citet{Wasserman1980} methodologies, by developing a new algorithm that provided a very efficient way of obtaining maximum likelihood estimates. Also, \citet{Muenz1985} developed a Markov model for covariates dependence of binary sequences, where the transitions probabilities were estimated through two logistic regressions that depended on a set of covariates. Essentially, \citet{Muenz1985} modeled a non-homogeneous Markov chain through logistic regression, considering only two states. \citet{Islam2004} developed an extension of this model considering three states, and \citet{IslamAtaharul2006} generalized this approach for HOMC. Additionally, \citet{Azzalini1994} proposed a model to study the influence of time-dependent covariates on the marginal distribution of a binary response in serially correlated binary data, where Markov chains are expressed in terms of transitional probabilities. \citet{jackson2011multi} proposed a Markov model for panel data, which allowed for the transitions intensities to vary between individuals or constant time-dependent covariates. Specifically, this work allowed to account for different intensities throughout transitions of states and include individual-specific covariates. The time-inhomogeneos model proposed is restricted to piecewise-constant intensities. The implementation of this work is available in the package \CRANpkg{msm}. More recently, \citet{Bolano2020} proposed an MTD-based approach to handle categorical covariates, that considers each covariate separately and combines the effects of the lags of the MTD and the covariates employing a mixture model. 
Specifically, the model is given by: + +\begin{equation} +P(X_t = k \mid X_{t-1} = i, C_1 = c_1, \dots, C_l = c_l) \approx \theta_0 a_{ik} + \sum_{h=1}^l \theta_h d_{c_{h}k} \label{eq:eq7} +\end{equation} + +where \(a_{ik}\) is the transition probability from state \(i\) to state \(k\), as in a conventional Markov chains and \(d_{c_{h}k}\) is the probability of observing the states \(k\) given the modality \(c_h\) of the covariate \(h\). Lastly, \(\theta_0, \dots, \theta_l\) are the weights of the explanatory elements of the model. + +According to the literature presented, several researchers have proposed methodologies or generalizations to include covariates in Markov chain models. Primarily for social sciences and health applications, where the transition probabilities were generally modeled through logistic regression. However, there has been an increased focus on categorical covariates, opposing continuous covariates and a lack of approaches to multivariate Markov chain models. Thus, with this work, we aim to tackle this research gap. + +\section{Multivariate Markov chains with covariates}\label{multivariate-markov-chains-with-covariates} + +\subsection{Theoretical model}\label{theoretical-model} + +In this work, a new generalization of \citet{Ching2002} MMC model is presented: the GMMC model, that is, we will consider exogeneous or pre-determined covariates in the \(\sigma\) - algebra generated by the available information until \(t-1\) (\(\mathcal{F}_{t-1}\)). These variables can be deterministic or stochastic and do not necessarily need to be reported at time \(t\). 
Broadly, the model is given by: + +\begin{equation} +P(S_{jt} = k | \mathcal{ F}_{t-1} ) = P(S_{jt} = k | S_{1t-1} = i_1, S_{2t-1} = i_2, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq8} +\end{equation} We can specify this model as proposed by \citet{Ching2002} with Raftery's notation: + +\begin{multline} +P(S_{jt} = i_0 | S_{1t-1} = i_1,\dots, S_{st-1} = i_s, \boldsymbol{x}_t) \equiv \\ +\lambda_{j1}P(S_{jt} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_t) + \dots + \lambda_{js}P(S_{jt} = i_0 | S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq9} +\end{multline} subject to the usual constraints. + +\subsection{Estimation and inference}\label{estimation-and-inference} + +This proposed model is estimated through MLE, similar to the standard MTD model. The log-likelihood is given by: + +\begin{equation} +LL = \sum_{t = 1}^n log P(S_{jt} = i_0 | S_{1t-1} = i_1, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq10} +\end{equation} + +Additionally, the probabilities can be estimated through an multinomial logit model. The proof for consistency and asymptotic distribution is available in the Supplementary Material section. + +\subsection{Monte Carlo simulation study}\label{monte-carlo-simulation-study} + +A Monte Carlo simulation study was designed to evaluate the dimension and power of the test parameters of the proposed model. The R statistical environment was used for all computations. This simulation study was comprised of two parts. + +\subsubsection{Part I: Detect a non-homogeneous Markov chain}\label{part-i-detect-a-non-homogeneous-markov-chain} + +First, we considered two sequences with two and three states. The main goal was to assess if the model detected the presence of a non-homogeneous Markov chain correctly and if the estimate of the parameter would correspond to the expected. 
So, given two sequences, one generated through a non-homogeneous Markov chain and the other generated through a homogeneous Markov chain, it would be expected that the parameter associated with the transition probabilities of the first sequence would be one and the parameter associated with the transition probabilities of the second sequence would be zero. With this in mind, the transitions probabilities of the first sequence were estimated through a logistic regression, where parameters of this regression were randomly generated in R, and the second sequence was generated through a first-order Markov chain. Hence, for both states cases considered, it was expected that the estimated regression would be: + +\begin{multline} +P(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) = \\ +1 \times P(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}) + 0 \times P(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) \label{eq:eq11} +\end{multline} + +To assess the test power and dimension, we used the Wald test with the following hypothesis: + +\begin{table} + +\caption{\label{tab:dim-pow-tex}Power and dimension of test assessment} +\centering +\begin{tabular}[t]{l|l|l} +\hline + & Hypothesis & Test\\ +\hline +Power & $H_0: \lambda_{11} = 0$ & $\frac{\widehat{\lambda}_{11}^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}$\\ +\hline + & $H_0: \lambda_{12} = 1$ & $\frac{(\widehat{\lambda}_{12}-1)^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}$\\ +\hline +Dimension & $H_0: \lambda_{11} = 1$ & $\frac{(\widehat{\lambda}_{11}-1)^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}$\\ +\hline + & $H_0: \lambda_{12} = 0$ & $\frac{\widehat{\lambda}_{12}^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}$\\ +\hline +\end{tabular} +\end{table} + +The simulation procedure was performed as follows: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + Generate the values of the coefficients for the probability transition matrix of series 
\(S_{1t}\) randomly; +\item + Generate the probability transition matrix of series \(S_{2t}\) randomly; +\item + Set the initial value of \(S_{2t}\) to 1 and simulate the following from the defined probability transition matrix; +\item + In each iteration (of 1000 repetitions), + + \begin{itemize} + \tightlist + \item + Generate \(X_t \sim N(2,25)\); + \item + Generate the time-varying probabilities of series \(S_{1t}\) through the values of the fixed coefficients and the lagged variable \(x_t\); + \item + Set the initial values of the series \(S_{1t}\) as 1; + \item + For each period \(t\), simulate the next state of \(S_{1t}\) from the probabilities simulated for that moment; + \item + Estimate the model through the function \texttt{mmcx}; + \item + Calculate the Wald test and add to the counter if it is rejected. + \end{itemize} +\end{enumerate} + +\begin{figure} + +{\centering \includegraphics{RJ-2024-006_files/figure-latex/figure-2states-1} + +} + +\caption{Simulation study results for two states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless of sample size. Power of test increases with sample size. The proposed model detects the presence of a non-homogeneous Markov chain.}\label{fig:figure-2states} +\end{figure} + +Considering two states, the test dimension was 5.7\% with a sample size of 100 observations, slightly increased with 500 observations, and returned to the expected values in 1000 and 5000 observations. For a sample size of 100, 500, and 1000 observations, we have low test power. So, when considering two states, the sample must have at least 5000 observations, or, if that is not possible, consider a higher significance level when testing for individual significance. 
+ +\begin{figure} + +{\centering \includegraphics{RJ-2024-006_files/figure-latex/figure-3states-1} + +} + +\caption{Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. The proposed model detects the presence of non-homogenenous Markov Chain.}\label{fig:figure-3states} +\end{figure} + +Considering three states, the test dimension was 9.7\% for a sample size of 100 observations, 0.2\% for a sample size of 500 observations, and 0.3\% for a sample size of 1000. Regarding the test power, we see similar behavior, for a sample of 100 observations, the test power was 90.5\%, and from a sample of 500 observations, we reach a test power of 100\%. Thus, when considering three states, one may consider a sample of 500 observations without compromising the test power and dimension. + +\newpage + +\subsubsection{Part II: Detecting Parameters Assigned Values}\label{part-ii-detecting-parameters-assigned-values} + +Secondly, we performed a simulation study where we considered two non-homogeneous Markov chain with two states. Here, the main goal was to assess if the model correctly detected the parameters assigned. So, in this case, we started by generating the terms of the model proposed. These terms were estimated through logistic regression, and the parameters of this regression were randomly generated in R. Similarly to Part I, we considered a Wald test to assess the power and dimension of the test. 
The simulation procedure was performed as follows: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + Generate the values of the coefficients to calculate the probability transition matrices randomly; +\item + In each iteration (of 1000 repetitions), + + \begin{itemize} + \tightlist + \item + Generate \(\{x_t\} \sim N(2,25)\); + \item + Generate the probabilities \(P \left(S_{jt}|S_{st-1}, x_{t-1} \right)\), with \(j=1,2\) and \(s=1,2\). + \item + Set the initial values of the series \(S_{1t}\) and \(S_{2t}\) as 1; + \item + For each period \(t\), calculate the probabilities \(P \left(S_{1t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) and \(P \left( S_{2t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) through the assigned values of the \(\lambda\)'s. Considering the calculated probabilities, simulate the next state for each series, \(S_{1t}\) and \(S_{2t}\). + \item + Estimate the model through the function \texttt{mmcx}; + \item + Calculate the Wald test and add to the counter if it is rejected. + \end{itemize} +\end{enumerate} + +The probabilities \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) and \(P\left(S_{1t}|S_{2t-1}, x_{t-1}\right)\) presented some differences regarding its values' distributions. Specifically, \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) had more extreme probabilities values, with the minimum value being close to 0 and the maximum value being close to 1. And, the probabilities \(P\left(S_{1t}|S_{2t-1}, x_{t-1} \right)\) had more moderate values, with the minimum value being, on average, 0.3 and the maximum value, 0.7. When the probabilities have values close to 1, one says that the states/regimes are persistent. We calculated the power and dimension of test for each value of \(\lambda\) when the estimated probabilities are moderate and when they are extreme. 
Hence, considering equation 1: + +\begin{multline} +P\left(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) = \\ +\lambda_{11}P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}\right) + \lambda_{12}P\left(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) \label{eq:eq12} +\end{multline} + +The parameter \(\lambda_{11}\) will be associated with more extreme probabilities and \(\lambda_{12}\) will be associated with more moderate probabilities. + +\begin{figure} + +{\centering \includegraphics{RJ-2024-006_files/figure-latex/figure-persistent-1-1} + +} + +\caption{Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. The proposed model has low power of test when low parameter values are associated with persistent states.}\label{fig:figure-persistent-1} +\end{figure} + +\begin{figure} + +{\centering \includegraphics{RJ-2024-006_files/figure-latex/figure-persistent-2-1} + +} + +\caption{Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. The results point towards a low test power in this setting.}\label{fig:figure-persistent-2} +\end{figure} + +When the states are persistent and the parameter's value is low (i.e., 0.2 and 0.4), we have low test power. By increasing this value, the power of test increases as well. 
When the states are not persistent, we do not have a clear pattern regarding the power of test: for a value of the parameter of 0.2, the power of test is still low (although not as low as the first scenario), increases when we have a value of 0.4, decreases when the value is 0.6 and increases again when the value is 0.8. Overall, the estimated standard errors seem high, leading to low test power. Regarding the test dimension, when we have a higher weight associated with the non-persistent states, the test dimension converges to 0. However, when this weight is associated with the persistent states, the test dimension increases with the sample size, reaching a value of 10\% in some cases. Hence, one must use a 10\% significance level to perform statistical inference on the parameters in this situation. + +\subsection{Software implementation}\label{software-implementation} + +Regarding the software implementation for each function, for the \texttt{multi.mtd} function the estimation method was presented in \citet{Berchtold2001} applied to the multivariate case. For \texttt{multi.mtd\_probit}, a package for numerical maximization of the log-likelihood, \CRANpkg{maxLik} \citep{maxLik}, was used. This package performs Maximum Likelihood estimation through different optimization methods that the user can choose. The optimization methods available are Newton-Raphson, Broyden-Fletcher-Goldfarb-Shanno (BFGS), the BFGS algorithm implemented in R (BFGSR), Berndt-Hall-Hall-Hausman, Simulated ANNealing, Conjugate Gradients, and Nelder-Mead. Finally, for the \texttt{mmcx} function, a different approach was used. Unlike the MTD-Probit, the model proposed has equality and inequality restrictions in the parameters. The \CRANpkg{maxLik} \citep{maxLik} package only allows one type of restriction for each Maximum Likelihood estimation, so it was not possible to use this package to estimate the proposed model with exogenous variables. 
Hence, the algorithm used was the Augmented Lagrangian method, available in the \CRANpkg{alabama} \citep{alabama} package through the function \texttt{auglag}. This estimation method for the proposed model is not very common, however, it has been applied to Markov chain models \citep{Rajarshi2013}. The GMMC model's probabilities were estimated through a Multinomial Logit using \texttt{multinom} of the \CRANpkg{nnet} package \citep{nnet}. + +Additionally, the Hessian matrices were also computed, which allowed performing statistical inference. The \texttt{maxLik} and \texttt{auglag} functions compute the Hessian matrices with the estimates. For the function \texttt{multi.mtd}, since the optimization procedure of \citet{Berchtold2001} was used, the Hessian was computed through the second partial derivatives. The function \texttt{multi.mtd} requires the following elements: + +\begin{itemize} +\item + \texttt{y}, a matrix of the categorical data sequences. +\item + \texttt{deltaStop}, the delta below which the optimization phases of the parameters stop. +\item + \texttt{is\_constrained}, flag indicating whether the function will consider the usual set of constraints (usual set: \textit{TRUE}, new set of constraints: \textit{FALSE}). +\item + \texttt{delta}, the amount of change to increase/decrease in the parameters for each iteration of the optimization algorithm. +\end{itemize} + +The last three arguments concern the optimization procedure. For more details see \citet{Berchtold2001}. Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, to estimate the model and obtain the results: + +\begin{verbatim} +multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1) +\end{verbatim} + +The function \texttt{multi.mtd\_probit} requires the following arguments: + +\begin{itemize} +\tightlist +\item + \texttt{y}, a matrix of the categorical data sequences. +\item + \texttt{initial}, a vector of the initial values of the parameters. 
+\item + \texttt{nummethod}, the numerical maximization method, currently either ``NR'' (for Newton-Raphson), ``BFGS'' (for Broyden-Fletcher-Goldfarb-Shanno), ``BFGSR'' (for the BFGS algorithm implemented in R), ``BHHH'' (for Berndt-Hall-Hall-Hausman), ``SANN'' (for Simulated ANNealing), ``CG'' (for Conjugate Gradients), or ``NM'' (for Nelder-Mead). Lower-case letters (such as ``nr'' for Newton-Raphson) are allowed. The default method is ``BFGS''. For more details see the \CRANpkg{maxLik} \citep{maxLik} package. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2} again, to estimate the model and obtain the results with the BFGS maximization method: + +\begin{verbatim} +multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs') +\end{verbatim} + +Finally, the function \texttt{mmcx} requires the following elements: + +\begin{itemize} +\tightlist +\item + \texttt{y}, a matrix of categorical data sequences. +\item + \texttt{x}, a matrix of covariates (exogenous variables). +\item + \texttt{initial}, a vector of the initial values of the parameters. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, and a vector of an exogenous variable, \texttt{x}, to estimate the model and obtain the results: + +\begin{verbatim} +mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1)) +\end{verbatim} + +These functions return a list with the parameter estimates, standard errors, z-statistics, p-values, and the log-likelihood function value for each equation. + +The package offers an additional function that allows the user to obtain the transition probability matrices of \texttt{mmcx} considering a specific value of \texttt{x} defined by the user. The function is \texttt{MMC\_tpm} and requires the following elements: + +\begin{itemize} +\tightlist +\item + \texttt{s}, a matrix of categorical data sequences. +\item + \texttt{x}, a matrix of covariates (exogenous variables). 
+\item + \texttt{value}, a single value of \texttt{x}, to condition the probability transition matrices. +\item + \texttt{result}, a list returned by the function \texttt{mmcx} containing the model's estimates. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, a vector of an exogeneous variables, \texttt{x} and \texttt{res} the list returned by the function \texttt{mmcx}, to obtain the transition probability matrices: + +\begin{verbatim} +MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res) +\end{verbatim} + +The function returns an array containing the probability transition matrices, conditioned on a specific value of \texttt{x}, for each equation. + +\section{Illustration}\label{illustration} + +Markov chain models are used in interdisciplinary areas, such as economics, business, biology, and engineering, with applications to predict long-term behavior from traffic flow to stock market movements, among others. Modeling and predicting stock markets returns is particularly relevant for investors and policy makers. Since the stock market is a volatile environment, and the returns are difficult to predict, estimating the set of probabilities that describe these movements, might provide relevant input. Additionally, incorporating the effect of key macroeconomic variables could provide a more accurate picture of this specific environment. + +The following empirical illustration aims to model stock returns of two indexes as a function of the interest rate spread, specifically the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. + +The interest rate spread is a key macroeconomic variable and provides valuable information regarding the economy state. Specifically, it has been used to forecast recessions as in \citet{Estrella1996}, \citet{Dombrosky1996}, \citet{Chauvet2016}, \citet{Tian2019} and \citet{McMillan2021}. 
Generically, short-term yields are lower than long-term yields when the economy is in expansion. On the other hand, short-term yields are higher than long-term yields when the economy is in recession. The difference between these yields (or, more specifically, the yield curve's slope) can be used to forecast the state of the economy. Hence, this indicator might provide relevant input for investors. + +We considered the 5-week-day daily stock returns (\(r_t=100 \times \log(P_t/P_{t-1})\), where \(P_t\) is the adjusted close price) of two indexes, S\&P500 and DJIA, from November \(11^{th}\) 2011 to September \(1^{st}\) 2021 (2581 observations). Additionally, we considered the interest rate spread (\(spread_{t}\)), the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. The data was retrieved from FRED. Below, we have the descriptive statistics of these variables. + +\begin{table} + +\caption{\label{tab:summary-stat-tex}Summary statistics of $stockreturns$ dataset} +\centering +\begin{tabular}[t]{l|l|l|l|l|l|l} +\hline +Variable & Minimum & 1$^{st}$ Quantile & Median & Mean & 3$^{rd}$ Quantile & Maximum\\ +\hline +$spread_{t}$ & -0.52 & 0.92 & 1.54 & 1.454 & 2.03 & 2.97\\ +\hline +$r_{t;SP500}$ & -12.765 & -0.32 & 0.07 & 0.054 & 0.518 & 8.968\\ +\hline +$r_{t;DJIA}$ & -13.842 & -0.327 & 0.071 & 0.046 & 0.508 & 10.764\\ +\hline +\end{tabular} +\end{table} + +Moreover, to apply the model proposed, it is necessary to have a categorical time series, thus we applied the following procedure: + +\[ +S_{st}= +\begin{cases} +1, r_t \leq \widehat{q}_{s;0.25}\\ +2, \widehat{q}_{s;0.25} < r_t < \widehat{q}_{s;0.75} \\ +3, r_t \geq \widehat{q}_{s;0.75}\\ +\end{cases} +\] + +where \(\widehat{q}_{s;\alpha}\) is the estimated quantile of order \(\alpha\) of the marginal distribution of \(r_t\). 
Considering this illustration and the model proposed, we will have two equations: + +\begin{multline} +P(S_{sp500,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{11} P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{12} P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1}) \label{eq:eq13} +\end{multline} + +\begin{multline} +P(S_{djia,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{21} P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{22} P(S_{djia,t} | S_{djia, t-1}, spread_{t-1}) \label{eq:eq14} +\end{multline} + +In Figures \ref{fig:fig11} to \ref{fig:fig22}, generated using \CRANpkg{ggplot2} \citep{ggplot2} and \CRANpkg{gridExtra} \citep{gridextra}, we have the smoothed conditional probabilities of both series, depending on \(spread_{t-1}\). The number of observations is high, and the probabilities varied abruptly in a small time frame, making the plots hard to read. To simplify, a moving average model (from \CRANpkg{pracma} \citep{pracma}) of order 5, due to the frequency of the data, was adjusted to these probabilities to illustrate how they evolve throughout time. These plots represent the probabilities associated with the parameters of the general model proposed, showcasing how these vary throughout time and the main advantage of this generalization. Instead of having fixed matrices of transition probabilities, we allow for these to vary throughout time, depending on the values of \(spread_{t-1}\). Specifically, Figures \ref{fig:fig11} and \ref{fig:fig12} correspond to the non-homogeneous Markov chain to build the S\&P500's equation and Figures \ref{fig:fig21} and \ref{fig:fig22} correspond to the non-homogeneous Markov chain to build DJIA's equation. We see a similar behavior within each series regardless of whether it depends on the previous states of \(S_{1t}\) or \(S_{2t}\). Additionally, the scales of the graphs are small, indicating that these probabilities vary around the same set of values. 
+ +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{RJ-2024-006_files/figure-latex/fig11-1} + +} + +\caption{Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.}\label{fig:fig11} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{RJ-2024-006_files/figure-latex/fig12-1} + +} + +\caption{Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.}\label{fig:fig12} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{RJ-2024-006_files/figure-latex/fig21-1} + +} + +\caption{Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.}\label{fig:fig21} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{RJ-2024-006_files/figure-latex/fig22-1} + +} + +\caption{Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.}\label{fig:fig22} +\end{figure} + +\newpage + +The model can be estimated through the \texttt{mmcx} function: + +\begin{verbatim} +attach(stockreturns) +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1)) +\end{verbatim} + +\begin{verbatim} +#> -------------------------------------------- +#> Equation 1 +#> Estimate Std. Error t value Pr(>|t|) +#> 1 0.685660 0.171346 4.002 0.000 *** +#> 2 0.314340 0.171346 1.835 0.067 * +#> +#> Log-Likelihood: -2636.355 +#> -------------------------------------------- +#> -------------------------------------------- +#> Equation 2 +#> Estimate Std. Error t value Pr(>|t|) +#> 1 0.629993 0.176383 3.572 0.000 *** +#> 2 0.370007 0.176383 2.098 0.036 ** +#> +#> Log-Likelihood: -2636.622 +#> -------------------------------------------- +\end{verbatim} + +Considering the first equation, the effect of the probabilities depending on S\&P500's previous state and the interest rate spread has a higher weight on the overall probability. Also, this estimate is highly significant, presenting a \(p\)-value close to zero. The effect of DJIA's previous state in S\&P500 is lower but it is also significant for a 10\% significance level. In the second equation, the effect of S\&P500's previous state is higher than DJIA's and both estimates are highly significant. + +One of the advantages of this approach is the possibility to assess the transition probabilities for specific values of \(x_t\), in this case, the interest rate spread. For both series, we calculated the transition probabilities for this variable's minimum and maximum value in the sample, which are -0.52 and 2.97, respectively. 
To obtain the probability transition matrices for these two cases, the code is the following: + +\begin{verbatim} +tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, + value = max(spread_1), result = res) + +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, + value = min(spread_1), result = res) +\end{verbatim} + +\begin{verbatim} +library(markovchain) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9 +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10 +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11 +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12 +\end{verbatim} + +In Figures \ref{fig:fig-sp500-min} and \ref{fig:fig-sp500-max}, we have the transition probabilities network for S\&P500, corresponding to the minimum and maximum value of the spread. The most noticeable difference between these two networks is regarding the transition probability from the second state to the third state. For the maximum value of \(spread_{t-1}\), the transition probability from the second state to the third state is 0.6. So, when the economy is strong, one might expect to have higher returns, when \(t-1\) was in the second state. However, this scenario shifts when considering the minimum value of \(spread_{t-1}\). The probability of obtaining higher returns, that is, being in state three, becomes almost evenly distributed, regardless of the state in \(t-1\). This indicates the instability of the stock market, when the economy is weaker. Another difference in these networks, is regarding the transition probability from the third state to the first state. For the maximum value of \(spread_{t-1}\), this probability is 0.27 and for the minimum value increases to 0.44. This is also expected, since when the economy is weaker, the probability of having lower returns is greater. 
+ +\begin{figure} + +{\centering \includegraphics[width=0.6\linewidth]{RJ-2024-006_files/figure-latex/fig-sp500-max-1} + +} + +\caption{Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3.}\label{fig:fig-sp500-max} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.6\linewidth]{RJ-2024-006_files/figure-latex/fig-sp500-min-1} + +} + +\caption{Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. The highest probability of 0.56 refers to the transition from state 2 to state 2.}\label{fig:fig-sp500-min} +\end{figure} + +Considering the second equation (Figures \ref{fig:fig-djia-max} and \ref{fig:fig-djia-min}), corresponding to the DJIA's returns, we see a similar behaviour as in S\&P500's networks. The transition probability from the second state to the third state is higher for the maximum value of \(spread_{t-1}\) and the transition probability from the third state to the first state is higher when we consider the minimum value of \(spread_{t-1}\). Although, the difference of this last probability between the minimum and maximum value of \(spread_{t-1}\) is not as big as in S\&P500. Overall, the rest of the probabilities structure, remains the same. + +\begin{figure} + +{\centering \includegraphics[width=0.6\linewidth]{RJ-2024-006_files/figure-latex/fig-djia-max-1} + +} + +\caption{Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. 
The probability of 0.58 refers to the transition from state 2 to state 3.}\label{fig:fig-djia-max} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.6\linewidth]{RJ-2024-006_files/figure-latex/fig-djia-min-1} + +} + +\caption{Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. The highest probability of 0.51 refers to the transition from state 2 to state 2.}\label{fig:fig-djia-min} +\end{figure} + +\section{Conclusions, limitations and further research}\label{conclusions-limitations-and-further-research} + +Several proposals for including exogenous variables in MMC models have been presented. The main limitations were associated with the high complexity of the models to be developed and estimated. Additionally, most models considered only categorical exogenous variables, and there was a lack of focus on continuous exogenous variables. This work proposes a new approach to include continuous exogenous variables in \citet{Ching2002} model for multivariate Markov chains. This is relevant because it allows studying the effect of previous series and exogenous variables on the transition probabilities. The model is based on \citet{Ching2002} MMC model but considers non-homogeneous Markov chains. Thus, the probabilities that compose the model are dependent on exogenous variables. These probabilities are estimated as a usual non-homogeneous Markov chain through a multinomial logit model. The model parameters are then estimated through MLE, as well as the standard errors. We developed a package with the estimation function of the model proposed. In this, we considered the Augmented Lagrangian optimization method for estimating the parameters through MLE. Additionally, we designed a Monte Carlo simulation study to assess this model's test power and dimension. The results showed that the model detected a non-homogeneous Markov chain. 
Moreover, an empirical illustration demonstrated the relevance of this new model by estimating the probability transition matrix for different exogenous variable values. Ignoring the effect of exogenous variables in MMC means that we would not detect the probabilities' changes according to the covariates' values. In this setting, one would have a limited view of the studied process. Hence, this approach allows us to understand how a specific variable influences a specific process. The main contributions of this work are the development of a package with functions for multivariate Markov chains, addressing the statistical inference in these models and the inclusion of covariates. The limitations are related to the implementation in R, specifically the optimization algorithm applied is not common for MMC models, in that sense, it would be beneficial to study new approaches to optimizing the maximum likelihood function as further research. Additionally, extending this generalization to the MTD-probit model proposed by \citet{Nicolau2014} would also be relevant, which removes the constraints of the model's parameters and allows the model to detect negative effects. 
+ +\bibliography{genmarkov.bib} + +\address{% +Carolina Vasconcelos\\ +NOVA Information Management School (NOVA IMS)\\% +Campus de Campolide, 1070-312 Lisboa, Portugal\\ +% +% +% +\href{mailto:cvasconcelos@novaims.unl.pt}{\nolinkurl{cvasconcelos@novaims.unl.pt}}% +} + +\address{% +Bruno Damásio\\ +NOVA Information Management School (NOVA IMS)\\% +Campus de Campolide, 1070-312 Lisboa, Portugal\\ +% +% +% +\href{mailto:bdamasio@novaims.unl.pt}{\nolinkurl{bdamasio@novaims.unl.pt}}% +} diff --git a/_articles/RJ-2024-006/RJ-2024-006.zip b/_articles/RJ-2024-006/RJ-2024-006.zip new file mode 100644 index 0000000000..390641cdeb Binary files /dev/null and b/_articles/RJ-2024-006/RJ-2024-006.zip differ diff --git a/_articles/RJ-2024-006/RJournal.sty b/_articles/RJ-2024-006/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-006/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. 
+ +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. \RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. 
+ +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-006/RJwrapper.tex b/_articles/RJ-2024-006/RJwrapper.tex new file mode 100644 index 0000000000..c0055abdf7 --- /dev/null +++ 
b/_articles/RJ-2024-006/RJwrapper.tex @@ -0,0 +1,70 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + + +% tightlist command for lists without linebreak +\providecommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} + +\usepackage{longtable} + +% Always define CSL refs as bib entries are contained in separate doc +% Pandoc citation processing +%From Pandoc 3.1.8 +% definitions for citeproc citations +\NewDocumentCommand\citeproctext{}{} +\NewDocumentCommand\citeproc{mm}{% + \begingroup\def\citeproctext{#2}\cite{#1}\endgroup} +\makeatletter + % allow citations to break across lines + \let\@cite@ofmt\@firstofone + % avoid brackets around text for \cite: + \def\@biblabel#1{} + \def\@cite#1#2{{#1\if@tempswa , #2\fi}} +\makeatother +\newlength{\cslhangindent} +\setlength{\cslhangindent}{1.5em} +\newlength{\csllabelwidth} +\setlength{\csllabelwidth}{3em} +\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing + {\begin{list}{}{% + \setlength{\itemindent}{0pt} + \setlength{\leftmargin}{0pt} + \setlength{\parsep}{0pt} + % turn on hanging indent if param 1 is 1 + \ifodd #1 + \setlength{\leftmargin}{\cslhangindent} + \setlength{\itemindent}{-1\cslhangindent} + \fi + % set entry spacing + \setlength{\itemsep}{#2\baselineskip}}} + {\end{list}} +\usepackage{calc} +\newcommand{\CSLBlock}[1]{#1\hfill\break} +\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} +\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} +\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} + + + +\begin{document} + + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{96} + +\begin{article} + \input{RJ-2024-006} +\end{article} + + +\end{document} diff --git a/_articles/RJ-2024-006/Rlogo-5.png 
b/_articles/RJ-2024-006/Rlogo-5.png new file mode 100644 index 0000000000..077505788a Binary files /dev/null and b/_articles/RJ-2024-006/Rlogo-5.png differ diff --git a/_articles/RJ-2024-006/genmarkov.R b/_articles/RJ-2024-006/genmarkov.R new file mode 100644 index 0000000000..45a4ce30ee --- /dev/null +++ b/_articles/RJ-2024-006/genmarkov.R @@ -0,0 +1,810 @@ +# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand +# Please edit genmarkov.Rmd to modify this file + +## ----dim-pow-html, eval = knitr::is_html_output(), echo=FALSE----------------- +#> +#> data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', +#> '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', +#> '', '$H_0: \\lambda_{12} = 1$', +#> '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', +#> 'Dimension', '$H_0: \\lambda_{11} = 1$', +#> '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', +#> '', '$H_0: \\lambda_{12} = 0$', +#> '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +#> colnames(data) = c('', 'Hypothesis', 'Test') +#> +#> knitr::kable(data, format = "html", caption = "Power and dimension of test assessment") + + +## ----dim-pow-tex, eval = knitr::is_latex_output(), echo=FALSE----------------- + +data = matrix(c('Power', '$H_0: \\lambda_{11} = 0$', + '$\\frac{\\widehat{\\lambda}_{11}^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 1$', + '$\\frac{(\\widehat{\\lambda}_{12}-1)^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$', + 'Dimension', '$H_0: \\lambda_{11} = 1$', + '$\\frac{(\\widehat{\\lambda}_{11}-1)^2}{se(\\widehat{\\lambda}_{11})^2} \\sim \\chi^2_{(1)}$', + '', '$H_0: \\lambda_{12} = 0$', + '$\\frac{\\widehat{\\lambda}_{12}^2}{se(\\widehat{\\lambda}_{12})^2} \\sim \\chi^2_{(1)}$'), ncol=3, nrow=4, byrow=T) +colnames(data) = 
c('', 'Hypothesis', 'Test') + +knitr::kable(data, format = "latex", caption = "Power and dimension of test assessment", escape = FALSE) + + +## ----figure-2states, echo=FALSE, fig.cap="Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +library(ggplot2) +df <- structure( + list( + States = c(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3), + Parameter = c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 100, + 500, + 1000 + ), + Power = c( + 0.082, + 0.252, + 0.466, + 0.994, + 0.082, + 0.252, + 0.466, + 0.994, + 0.905, + 1, + 1, + 0.905, + 1, + 1 + ), + Dimension = c( + 0.057, + 0.076, + 0.058, + 0.06, + 0.057, + 0.076, + 0.058, + 0.06, + 0.097, + 0.002, + 0.003, + 0.097, + 0.002, + 0.003 + ) + ), + row.names = c(NA, 14L), + class = "data.frame" +) + +df$Sample = as.factor(df$Sample) +df$Parameter = as.factor(df$Parameter) + +df1 <- df[df$States == 2, ] +p1 <- + ggplot(df1, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.08)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title 
= element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p2 <- + ggplot(df1, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p1, p2, ncol=2) + + + +## ----figure-3states, echo=FALSE, fig.cap="Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. 
The proposed model detects the presence of non-homogenenous Markov Chain.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +df2 <- df[df$States == 3, ] +p3 <- + ggplot(df2, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p4 <- + ggplot(df2, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.05)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p3, p4, ncol=2) + + +## ----figure-persistent-1, 
echo=FALSE, fig.cap="Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. The proposed model has low power of test when low parameter values are associated with persistent states.", warning=FALSE, message=FALSE, fig.height=3, fig.width=6, fig.align='center'---- +df3 <- + structure( + list( + Case = c( + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2 + ), + Parameter = c( + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8, + 0.2, + 0.2, + 0.2, + 0.2, + 0.4, + 0.4, + 0.4, + 0.4, + 0.6, + 0.6, + 0.6, + 0.6, + 0.8, + 0.8, + 0.8, + 0.8 + ), + Sample = c( + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000, + 100, + 500, + 1000, + 5000 + ), + Power = c( + 0.073, + 0.065, + 0.046, + 0.019, + 0.092, + 0.096, + 0.092, + 0.097, + 0.126, + 0.282, + 0.261, + 0.276, + 0.139, + 0.435, + 0.695, + 0.999, + 0.057, + 0.076, + 0.15, + 0.256, + 0.085, + 0.14, + 0.209, + 0.715, + 0.071, + 0.087, + 0.142, + 0.315, + 0.053, + 0.103, + 0.362, + 0.599 + ), + Dimension = c( + 0.018, + 0.014, + 0.009, + 0, + 0.005, + 0.004, + 0.002, + 0.002, + 0.005, + 0.004, + 0.002, + 0.002, + 0.018, + 0.014, + 0.009, + 0, + 0.018, + 0.025, + 0.038, + 0.064, + 0.002, + 0.003, + 0.07, + 0.103, + 0.002, + 0.003, + 0.07, + 0.103, + 0.018, + 0.025, + 0.038, + 0.064 + ) + ), + row.names = c(NA, + 32L), + class = "data.frame" + ) +df3$Sample = as.factor(df3$Sample) +df3$Parameter = as.factor(df3$Parameter) + +df4 <- df3[df3$Case == 1, ] + +p5 <- + ggplot(df4, aes( + x = Sample, + y 
= Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 1.07)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +p6 <- + ggplot(df4, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.02)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p6, p5, ncol=2) + + + +## ----figure-persistent-2, echo=FALSE, fig.cap="Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. 
Dimension and power of test increase as sample size increases. The results point towards a low test power in this setting.", fig.height=3, fig.width=6, warning=FALSE, fig.align='center'---- + df5 <- df3[df3$Case == 2, ] +p7 <- + ggplot(df5, aes( + x = Sample, + y = Power, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.8)) + + geom_text( + aes(label = Power * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Power of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 7, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + + +p8 <- + ggplot(df5, + aes( + x = Sample, + y = Dimension, + group = Parameter, + fill = Parameter + )) + + geom_col(position = "dodge") + + scale_y_continuous(labels = scales::percent, limits = c(0, 0.11)) + + geom_text( + aes(label = Dimension * 100), + size = 2.5, + hjust = -0.15, + position = position_dodge(.9), + angle = 90 + ) + + scale_fill_brewer(palette = 'Reds') + + theme_minimal() + + labs(title='Dimension of test', + x='Sample size', + y = 'Proportion of rejections of the null hypothesis', + fill = expression(lambda[jk])) + + theme( + axis.text.x = element_text(face = "bold", color = 'black', + size = 7), + axis.text.y = element_text(face = "bold", color = 'black', + size = 7), + axis.title = element_text(size = 7), + legend.title = element_text(size = 8, color = 'black'), + legend.text = element_text(size = 7), + title = element_text(size=7) + ) + +gridExtra::grid.arrange(p8, p7, ncol=2) + + + 
+## ----multi.mtd, eval=FALSE---------------------------------------------------- +#> multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1) + + +## ----multi.mtd_probit, eval=FALSE--------------------------------------------- +#> multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs') + + +## ----mmcx-examp, eval=FALSE--------------------------------------------------- +#> mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1)) + + +## ----mmc_tpm, eval=FALSE------------------------------------------------------ +#> MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res) + + +## ----summary-stat-html, eval = knitr::is_html_output(), echo=FALSE------------ +#> library(GenMarkov) +#> data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), +#> c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), +#> c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) +#> +#> colnames(data) = c('Variable', 'Minimum', +#> '1$^{st}$ Quantile', 'Median', +#> 'Mean', '3$^{rd}$ Quantile', 'Maximum') +#> knitr::kable(data, format = "html", caption = "Summary statistics of $stockreturns$ dataset") + + +## ----summary-stat-tex, eval = knitr::is_latex_output(), echo=FALSE------------ +library(GenMarkov) +data = rbind(c('$spread_{t}$', round(summary(stockreturns$spread_1), 3)), + c('$r_{t;SP500}$', round(summary(stockreturns$returns_sp500), 3)), + c('$r_{t;DJIA}$', round(summary(stockreturns$returns_djia), 3))) + +colnames(data) = c('Variable', 'Minimum', + '1$^{st}$ Quantile', 'Median', + 'Mean', '3$^{rd}$ Quantile', 'Maximum') + +knitr::kable(data, format = "latex", caption = "Summary statistics of $stockreturns$ dataset", escape=FALSE) + + +## ----generate-plots, echo=FALSE, warning=FALSE, message=FALSE----------------- +library(GenMarkov) +library(ggplot2) +library(gridExtra) +#Define data and variables +s = cbind(stockreturns$sp500, stockreturns$djia) +m1 = max(s) +x = stockreturns$spread_1 + 
+########################################################### +### Code retrieved from ProbValuesXDependent() function ### +########################################################### + +# Create matrix with dummies for each state +dummies_list <- + apply(s, 2, function(x) { + fastDummies::dummy_cols(x, remove_selected_columns = TRUE) + }) +dummies <- matrix(unlist(dummies_list), + ncol = m1 * ncol(s), + nrow = nrow(s) +) + +# Create all possible combinations of column indices +combinations <- expand.grid(1:ncol(s), 1:ncol(dummies)) +# Order by the first variable +combinations <- combinations[order(combinations$Var1), ] + +# Extract columns from S and S_L based on the combinations +combined_list <- lapply(1:nrow(combinations), function(i, x) { + cbind(s[, combinations[i, 1]], x, dummies[, combinations[i, 2]]) +}, x = x) + +estimate_condprobs <- sapply(combined_list, function(data) { + # Define dependent variable + y <- factor(data[, 1], levels = 1:max(data[, 1])) + + # Define lagged St + s_l <- Hmisc::Lag(data[, 3]) + + # Estimate multinomial logistic regression + res <- suppressWarnings(nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE)) + + warn <- tryCatch( + { + nnet::multinom(y[s_l == 1] ~ data[, "x"][s_l == 1], trace = FALSE) + + if (length(warnings()) == 0) { + NULL # Return NULL if no warning occurs + } + + }, + warning = function(w) { + # Extracting the warning message without printing + warning_message <- conditionMessage(w) + return(warning_message) + } + ) + + + if(is.null(warn)){ + # Extract fitted values + px1 <- res$fitted.values + + }else if(length(warn) == 1){ + + if( (grepl("\\bgroup\\b.*\\bempty\\b", warn, ignore.case = TRUE) || grepl("\\bgroups\\b.*\\bempty\\b", warn, ignore.case = TRUE)) ){ + extracted_number <- as.numeric(regmatches(warn, gregexpr("\\d+", warn))[[1]]) + + # Extract fitted values + px1 <- res$fitted.values + + ##Add missing groups + px1 <- cbind(px1, matrix(rep(0, nrow(px1)*length(extracted_number)), + 
ncol=length(extracted_number), + nrow = nrow(px1), + dimnames = list(NULL, extracted_number))) + + #Re-order columns + px1 <- px1[, match(1:m1, colnames(px1))] + }else{ + warning(warn) + } + + }else{ + warning(warn) + } + + state = data[data[,3]==1,1][1] + colnames(px1) = rep(paste('From state ', state), 3) + + return(as.matrix(px1)) +}, simplify = "array") + +##### +#Subset each conditional probabilities + +##S1t, S1t-1 +estim_prob_11 = estimate_condprobs[1:3] + +##S1t, S2t-1 +estim_prob_12 = estimate_condprobs[4:6] + +##S2t, S1t-1 +estim_prob_21 = estimate_condprobs[7:9] + +##S2t, S2t-1 +estim_prob_22 = estimate_condprobs[10:12] + + +#Function to create plots +plots_estimprobs = function(df){ + plots_list <- list() + j = colnames(df)[1] + for(i in 1:3){ + ma = pracma::movavg(df[,i], n = 5, type = "s") + df_ma = data.frame(ma = ma, Time = seq(1, nrow(df))) + + plot = ggplot(df_ma, aes(x = Time, + y = ma)) + + geom_line(color = 'black') + + ylab(label = paste(j, 'to state ', i)) + + theme_minimal() + + theme(axis.title = element_text(size = 8)) + + plots_list[[i]] <- plot + } + + plots_res = arrangeGrob(grobs = plots_list, ncol = 3) + + return(plots_res) +} + +#Save plots list +plots11 = lapply(estim_prob_11, function(x) plots_estimprobs(x)) + +plots12 = lapply(estim_prob_12, function(x) plots_estimprobs(x)) + +plots21 = lapply(estim_prob_21, function(x) plots_estimprobs(x)) + +plots22 = lapply(estim_prob_22, function(x) plots_estimprobs(x)) + + + +## ----fig11, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.", message=FALSE, warning=FALSE, out.width='70%'---- +grid.arrange(grobs = plots11, nrow=3) + + +## ----fig12, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.", out.width='70%'---- +grid.arrange(grobs = plots12, nrow=3) + + +## ----fig21, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.", out.width='70%'---- +grid.arrange(grobs = plots21, nrow=3) + + +## ----fig22, echo=FALSE, fig.align='center', fig.cap="Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. 
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.", out.width='70%'---- +grid.arrange(grobs = plots22, nrow=3) + + +## ----mmcx--------------------------------------------------------------------- +attach(stockreturns) +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1)) + + +## ----tpm---------------------------------------------------------------------- +tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, + value = max(spread_1), result = res) + +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, + value = min(spread_1), result = res) + +## ----tpm-figs, eval=FALSE----------------------------------------------------- +#> library(markovchain) +#> plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9 +#> plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10 +#> plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11 +#> plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12 + + +## ----fig-sp500-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3.", out.width='60%', warning=FALSE, message=FALSE, echo=FALSE---- +library(markovchain) +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) + + +## ----fig-sp500-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. 
The highest probability of 0.56 refers to the transition from state 2 to state 2.", out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) + + +## ----fig-djia-max, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. The probability of 0.58 refers to the transition from state 2 to state 3.", out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) + + +## ----fig-djia-min, fig.align='center', fig.cap="Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. The highest probability of 0.51 refers to the transition from state 2 to state 2.",out.width='60%', echo=FALSE---- +set.seed(123) +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) + diff --git a/_articles/RJ-2024-006/genmarkov.bib b/_articles/RJ-2024-006/genmarkov.bib new file mode 100644 index 0000000000..0f525d4870 --- /dev/null +++ b/_articles/RJ-2024-006/genmarkov.bib @@ -0,0 +1,747 @@ +@article{Adke1988, +author = {Adke, S.R. and Deshmukh, S.R.}, +journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, +number = {1}, +pages = {105--108}, +title = {{Limit Distribution of a High Order Markov Chain}}, +volume = {50}, +publisher = {Wiley for the Royal Statistical Society}, +url = {https://www.jstor.org/stable/2345812}, +year = {1988} +} + +@article{Nicolau_2014, +author = {Nicolau, J. and Riedlinger, F. I.}, +doi = {10.1007/s00362-014-0630-6}, +issn = {09325026}, +journal = {Statistical Papers}, +number = {4}, +pages = {1163--1173}, +publisher = {Springer Berlin Heidelberg}, +title = {{Estimation and inference in multivariate Markov chains}}, +volume = {56}, +year = {2014} +} + +@techreport{Damasio2020, +author = {Dam\'asio, B. 
and Nicolau, J.}, +institution = {Instituto Superior de Economia e Gestão}, +title = {{Time inhomogeneous multivariate Markov chains : detecting and testing multiple structural breaks occurring at unknown dates}}, +series = {REM Working Papers}, +number = {0136--2020}, +type = {REM Working Papers }, +url = {http://hdl.handle.net/10400.5/20164}, +year = {2020} +} +@article{Bolano2020, +author = {Bolano, D.}, +doi = {10.3390/SYM12040558}, +issn = {20738994}, +journal = {Symmetry}, +keywords = {Covariates in markovian modeling,Hidden markov model,MTD model,Social sciences}, +number = {4}, +title = {{Handling covariates in markovian models with a mixture transition distribution based approach}}, +volume = {12}, +year = {2020} +} + +@article{Spilerman1976, +author = {Spilerman, S. and Singer, B.}, +file = {:C$\backslash$:/Users/carol/OneDrive/Documents/Tese/Ref/Singer1976.pdf:pdf}, +number = {1}, +pages = {1--54}, +title = {{The Representation of Social Processes by Markov Models}}, +journal = {American Journal of Sociology}, +volume = {82}, +year = {1976}, +url ={https://www.jstor.org/stable/2777460} +} + +@article{Wang2014, +author = {Wang, C. and Huang, T. Z. and Ching, W. K.}, +doi = {10.1155/2014/502808}, +journal = {Mathematical Problems in Engineering}, +title = {{A new multivariate Markov chain model for adding a new categorical data sequence}}, +volume = {2014}, +year = {2014} +} + +@article{Muenz1985, +author = {Muenz, L. R. and Rubinstein, L. V.}, +number = {1}, +pages = {91--101}, +title = {{Markov Models for Covariate Dependence of Binary Sequences }}, +volume = {41}, +year = {1985}, +journal = {Biometrics}, +publisher = {International Biometric Society}, +url ={http://www.jstor.org/stable/2530646} +} +@article{Zhu2010, +author = {Zhu, D. M. and Ching, W. 
K.}, +doi = {10.1109/BIFE.2010.39}, +journal = {Proceedings - 3rd International Conference on Business Intelligence and Financial Engineering, BIFE 2010}, +keywords = {Demand prediction,Multivariate Markov chain model}, +pages = {126--130}, +title = {{A new estimation method for multivariate Markov chain model with application in demand predictions}}, +year = {2010} +} +@article{Nicolau2014, +author = {Nicolau, J.}, +doi = {10.1111/sjos.12087}, +issn = {14679469}, +journal = {Scandinavian Journal of Statistics}, +keywords = {High-order markov chains,Maximum likelihood method,Mixture transition distribution,Multivariate markov chains}, +number = {4}, +pages = {1124--1135}, +title = {{A new model for multivariate markov chains}}, +volume = {41}, +year = {2014} +} +@article{Martin1987, +author = {Martin, R. D. and Raftery, A.}, +journal = {Journal of the American Statistical Association}, +doi = {10.2307/2289377}, +number = {400}, +pages = {1044-1050}, +title = {{Non-Gaussian State-Space Modeling of Nonstationary Time Series: Comment: Robustness, Computation, and Non-Euclidean Models}}, +volume = {82}, +year = {1987} +} +@book{Taylor1984, +author = {Taylor, H. M. and Karlin, S.}, +title = {An Introduction to Stochastic Modeling - 3rd ed}, +doi = {10.1016/C2013-0-11589-9}, +isbn = {9780126848878}, +publisher = {Academic Press}, +year = {1984} +} +@incollection{Newey1994, +author = {Newey, W.K and Mcfadden, D.}, +title = {Chapter 36 Large sample estimation and hypothesis testing}, +booktitle= {Handbook of Econometrics}, +publisher = {Elsevier}, +volume = {4}, +pages = {2111-2245}, +year = {1994}, +issn = {1573-4412}, +doi = {https://doi.org/10.1016/S1573-4412(05)80005-4} +} +@article{JacobLewis1978, +author = {Jacobs, P.A. 
and Lewis, A.W.}, +journal = {Journal of the Royal Statistical Society: Series B (Methodological)}, +number = {2}, +pages = {222-228}, +title = {{Discrete Time Series Generated by Mixtures II : Asymptotic Properties}}, +volume = {40}, +year = {1978}, +url = {https://www.jstor.org/stable/2984759} +} +@article{Berchtold2020, +author = {Berchtold, A. and Maitre, O. and Emery, K.}, +doi = {10.3390/sym12122031}, +issn = {20738994}, +journal = {Symmetry}, +keywords = {Double chain Markov model,Evolutionary algorithm,General EM algorithm,Hidden Markov model,Hill-climbing algorithm,MTD model,Markov chain,Optimization}, +number = {12}, +pages = {1--14}, +title = {{Optimization of the mixture transition distribution model using the march package for R}}, +volume = {12}, +year = {2020} +} +@book{Ching2006, +author = {Ching, W. K. and Ng, M. K.}, +title = {Markov Chains: Models, Algorithms and Applications}, +isbn = {9780387293370}, +year = {2006}, +publisher = {Springer}, +doi = {10.1007/0-387-29337-X} +} +@article{Berchtold2003, +author = {Berchtold, A.}, +doi = {10.1016/S0167-9473(02)00191-3}, +issn = {01679473}, +journal = {Computational Statistics and Data Analysis}, +number = {3-4}, +pages = {399--411}, +title = {{Mixture transition distribution (MTD) modeling of heteroscedastic time series}}, +volume = {41}, +year = {2003} +} +@book{Rajarshi2013, +author = {Rajarshi, M.B.}, +isbn = {9783642179792}, +title ={Statistical Inference for Discrete Time Stochastic Processes}, +publisher = {{SpringerBriefs in Statistics}}, +url = {http://www.springer.com/978-81-322-0762-7}, +year = {2013} +} +@article{IslamAtaharul2006, +author = {Islam, M. A. and Chowdhury, R. 
I.}, +doi = {10.1016/j.apm.2005.05.006}, +issn = {0307904X}, +journal = {Applied Mathematical Modelling}, +keywords = {Binary outcome,Higher order Markov chain,Logistic regression,Markov model,Repeated measures}, +number = {6}, +pages = {477--488}, +title = {{A higher order Markov model for analyzing covariate dependence}}, +volume = {30}, +year = {2006} +} +@article{Azzalini1994, +author = {Azzalini, A.}, +doi = {10.1093/biomet/81.4.767}, +issn = {00063444}, +journal = {Biometrika}, +keywords = {Correlated binary data,Discrete time series,Logistic regression,Longitudinal data,Markov chain,Missing data,Odds ratio,Repeated measures,Serial dependence}, +number = {4}, +pages = {767--775}, +title = {{Logistic regression for autocorrelated data with application to repeated measures}}, +volume = {81}, +year = {1994} +} +@article{Berchtold2001, +author = {Berchtold, A.}, +title = {Estimation in the Mixture Transition Distribution Model}, +journal = {Journal of Time Series Analysis}, +volume = {22}, +number = {4}, +pages = {379-397}, +doi = {https://doi.org/10.1111/1467-9892.00231}, +year = {2001} +} +@article{Pegram1980, +author = {Pegram, G.}, +journal = {Journal of Applied Probability}, +doi = {10.2307/3213025}, +number = {2}, +pages = {350--362}, +title = {{An Autoregressive Model for Multilag Markov Chains}}, +volume = {17}, +year = {1980} +} +@article{Siu2005, +author = {Siu, T. K. and Ching, W. K. and Fung, E. S. and Ng, M. K.}, +doi = {10.1080/14697680500383714}, +issn = {14697688}, +journal = {Quantitative Finance}, +number = {6}, +pages = {543--556}, +title = {{On a multivariate Markov chain model for credit risk measurement}}, +volume = {5}, +year = {2005} +} +@article{Islam2004, +author = {Islam, M. A. and Arabia, S. and Chowdhury, R. 
I.}, +file = {:C$\backslash$:/Users/carol/OneDrive/Documents/Tese/Ref/Islam2004.pdf:pdf}, +journal = {International Journal of Statistical Sciences}, +keywords = {60j20,ams classification,and phrases,covariate dependence,higher order,logistic,markov model,regression,three states}, +number = {i}, +pages = {241--249}, +title = {{A Three State Markov Model for Analyzing Covariate Dependence}}, +volume = {3}, +year = {2004}, +url = {http://www.ru.ac.bd/stat/wp-content/uploads/sites/25/2019/01/P21.V3s.pdf} +} +@article{Zhang2006, +author = {Zhang, X. and King, M. L. and Hyndman, R. J.}, +doi = {10.1016/j.csda.2005.06.019}, +journal = {Computational Statistics and Data Analysis}, +number = {11}, +pages = {3009--3031}, +title = {{A Bayesian approach to bandwidth selection for multivariate kernel density estimation}}, +volume = {50}, +year = {2006} +} +@phdthesis{Damasio2018, +author = {Dam\'asio, B.}, +title = {{Essays on Econometrics: Multivariate Markov Chains}}, +school = {Universidade de Lisboa, Instituto Superior de Economia e Gest\~ao}, +type = {{PhD} dissertation}, +url = {https://www.repository.utl.pt/bitstream/10400.5/18128/1/TD-BD-2019.pdf}, +year = {2018} +} +@article{Raftery1985, +author = {Raftery, A.}, +journal = {Journal of the Royal Statistical Society: Series B (Methodological)}, +doi = {10.1111/j.2517-6161.1985.tb01383.x}, +issn = {0035-9246}, +number = {3}, +pages = {528--539}, +title = {{A Model for High-Order Markov Chains}}, +volume = {47}, +year = {1985} +} +@article{Kalbfleisch1985, +author = {Kalbfleisch, J. D. and Lawless, J. F.}, +doi = {10.1080/01621459.1985.10478195}, +issn = {1537274X}, +journal = {Journal of the American Statistical Association}, +number = {392}, +pages = {863--871}, +title = {{The analysis of panel data under a Markov assumption}}, +volume = {80}, +year = {1985} +} +@article{Wong2001, +author = {Wong, C. S. and Li, W. 
K.}, +doi = {10.1198/016214501753208645}, +journal = {Journal of the American Statistical Association}, +keywords = {Autocorrelation,EM algorithm,Model selection,Predictive distributions,Stationarity}, +number = {455}, +pages = {982--995}, +title = {{On a mixture autoregressive conditional heteroscedastic model}}, +volume = {96}, +year = {2001} +} +@article{Wasserman1980, +author = {Wasserman, S.}, +doi = {10.1080/01621459.1980.10477465}, +journal = {Journal of the American Statistical Association}, +number = {370}, +pages = {280--294}, +title = {{Analyzing social networks as stochastic processes}}, +volume = {75}, +year = {1980} +} +@phdthesis{Damasio2013, +author = {Dam\'asio, B.}, +title = {{Multivariate Markov Chains - Estimation, Inference and Forecast. A New Approach: What If We Use Them As Stochastic Covariates?}}, +school = {Universidade de Lisboa, Instituto Superior de Economia e Gest\~ao}, +type = {Master dissertation}, +url = {http://hdl.handle.net/10400.5/6397}, +year = {2013} +} +@article{Berchtold2002, +author = {{Berchtold}, A. and {Raftery }, A.}, +doi = {10.1214/ss/1042727943}, +issn = {08834237}, +journal = {Statistical Science}, +keywords = {DNA,EM algorithm,Financial time series,GMTD model,High-order dependences,Markov chains,Mixture transition distribution (MTD) model,Social behavior,Spatial statistics,Time series,Wind}, +number = {3}, +pages = {328--356}, +title = {{The mixture transition distribution model for high-order Markov chains and non-Gaussian time series}}, +volume = {17}, +year = {2002} +} +@article{Tavare1994, +author = {Raftery, A. and Tavaré, S.}, +journal = {Applied Statistics}, +doi = {10.2307/2986120}, +title = {{Estimation and Modelling Repeated Patterns in High Order Markov Chains with the Mixture Transition Distribution Model}}, +volume = {43}, +number = {1}, +pages = {179--199}, +year = {1994} +} +@article{Le1996, +author = {Le, N. D. and Martin, R. D. 
and Raftery, A.}, +journal = {Journal of the American Statistical Association}, +doi = {10.1111/j.2517-6161.1985.tb01383.x}, +number = {436}, +pages = {1504--1515}, +title = {{Modeling Flat Stretches, Brusts, and Outliers in Time Series Using Mixture Transition Distribution Models}}, +volume = {91}, +year = {1996} +} +@article{Ching2002, + author = {Ching, W. K. and Fung, E. S. and Ng, M. K.}, + journal={IMA Journal of Management Mathematics}, + title={A multivariate Markov chain model for categorical data sequences and its applications in demand predictions}, + year={2002}, + volume={13}, + number={3}, + pages={187-199}, + doi={10.1093/imaman/13.3.187} + } +@article{Logan1981, +author = { Logan, J. }, +title = {{A structural model of the higher‐order Markov process incorporating reversion effects}}, +journal = {The Journal of Mathematical Sociology}, +volume = {8}, +number = {1}, +pages = {75-89}, +year = {1981}, +publisher = {Routledge}, +doi = {10.1080/0022250X.1981.9989916} +} +@article{Lebre2008, +author = {Lèbre, S. and Bourguignon, P. Y. }, +title = {{An EM algorithm for estimation in the mixture transition distribution model}}, +journal = {Journal of Statistical Computation and Simulation}, +volume = {78}, +number = {8}, +pages = {713-729}, +year = {2008}, +publisher = {Taylor \& Francis}, +doi = {10.1080/00949650701266666} +} +@article{ChenLio2009, +author = {Chen, D. G. and Lio, Y. L.}, +title = {{A Novel Estimation Approach for Mixture Transition Distribution Model in High-Order Markov Chains}}, +journal = {Communications in Statistics - Simulation and Computation}, +volume = {38}, +number = {5}, +pages = {990-1003}, +year = {2009}, +publisher = {Taylor \& Francis}, +doi = {10.1080/03610910802715009} +} +@article{Berchtold1995, +author = {Berchtold, A.}, +title = {{Autoregressive Modelling of Markov Chains}}, +journal = {Proc. 
10th International Workshop on Statistical Modelling}, +volume = {104}, +number = {}, +pages = {19-26}, +year = {1995}, +publisher = { Springer, New York, NY}, +doi = {10.1007/978-1-4612-0789-4_3} +} +@article{Berchtold1996, + author = {Berchtold, A.}, + title = {{Mod\'elisation autor\'egressive des chaines de Markov : utilisation d'une matrice diff\'erente pour chaque retard}}, + journal = {Revue de Statistique Appliqu\'ee}, + pages = {5--25}, + publisher = {Soci\'et\'e de Statistique de France}, + volume = {44}, + number = {3}, + year = {1996}, + language = {fr}, + url = {http://www.numdam.org/item/RSA_1996__44_3_5_0/} +} +@incollection{Mehran1989, +title = {{Analysis of Discrete Longitudinal Data: Infinite-Lag Markov Models}}, +booktitle = {Statistical Data Analysis and Inference}, +publisher = {North-Holland}, +address = {Amsterdam}, +pages = {533-541}, +year = {1989}, +isbn = {978-0-444-88029-1}, +doi = {https://doi.org/10.1016/B978-0-444-88029-1.50053-8}, +author = {Mehran, F.}, +} +@article{Kijima2002, +author = {Kijima, M. and Komoribayashi, K. and Suzuki, E.}, +year = {2002}, +month = {07}, +pages = {}, +title = {{A multivariate Markov model for simulating correlated defaults}}, +volume = {4}, +journal = {Journal of Risk}, +doi = {10.21314/JOR.2002.066} +} +@article{Regier1968, +author = {Regier, M. H.}, +title = {{A Two-State Markov Model for Behavioral Change}}, +journal = {Journal of the American Statistical Association}, +volume = {63}, +number = {323}, +pages = {993-999}, +year = {1968}, +publisher = {Taylor & Francis}, +doi = {10.1080/01621459.1968.11009325} +} +@article{Bart1968, +author = {Bartholomew, J.}, +title ={{Stochastic Models for Social Processes}}, +journal = {The Australian and New Zealand Journal of Sociology}, +publisher = {J. Wiley}, +volume = {4}, +number = {2}, +pages = {171-172}, +year = {1968}, +doi = {https://doi.org/10.1177/144078336800400215} +} +@Manual{march, + title = {march: Markov Chains}, + author = {Maitre, O. 
and Emery, K.}, + year = {2020}, + note = {R package version 3.3.2}, + url = {https://CRAN.R-project.org/package=march}, + } +@Article{markovchains, + title = {Discrete Time Markov Chains with R}, + author = {Spedicato, G. A.}, + month = {07}, + year = {2017}, + journal = {The R Journal}, + url = {https://journal.r-project.org/archive/2017/RJ-2017-036/index.html}, + note = {R package version 0.6.9.7}, + } + @Manual{alabama, + title = {alabama: Constrained Nonlinear Optimization}, + author = {Varadhan, R.}, + year = {2015}, + note = {R package version 2015.3-1}, + url = {https://CRAN.R-project.org/package=alabama}, + } + @Manual{msm, + title = {msm: Multi-State Markov and Hidden Markov Models in Continuous Time}, + author = {Jackson, C}, + year = {2023}, + url = {https://cran.r-project.org/web/packages/msm/index.html}, + } + @Manual{matrixcalc, + title = {matrixcalc: Collection of functions for matrix calculations}, + author = {Novomestky, F.}, + year = {2012}, + note = {R package version 1.0-3}, + url = {https://CRAN.R-project.org/package=matrixcalc}, + } +@Manual{Hmisc, + title = {Hmisc: Harrell Miscellaneous}, + author = {Harrell Jr, F. E.}, + year = {2021}, + note = {R package version 4.5-0}, + url = {https://CRAN.R-project.org/package=Hmisc}, + } + @Article{maxLik, + title = {maxLik: A package for maximum likelihood estimation in {R}}, + author = {Henningsen, A. and Toomet, O.}, + journal = {Computational Statistics}, + year = {2011}, + volume = {26}, + number = {3}, + pages = {443-458}, + doi = {10.1007/s00180-010-0217-1}, + url = {http://dx.doi.org/10.1007/s00180-010-0217-1}, + } +@Book{nnet, + title = {Modern Applied Statistics with S}, + author = {Venables, W. N. and Ripley, B. 
D.}, + publisher = {Springer}, + edition = {Fourth}, + address = {New York}, + year = {2002}, + note = {ISBN 0-387-95457-0}, + url = {https://www.stats.ox.ac.uk/pub/MASS4/}, + } +@Manual{fastDummies, + title = {fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from +Categorical Variables}, + author = {Kaplan, J.}, + year = {2020}, + note = {R package version 1.6.3}, + url = {https://CRAN.R-project.org/package=fastDummies}, + } +@Manual{DTMCPack, + title = {DTMCPack: Suite of functions related to discrete-time discrete-state +Markov Chains}, + author = {Nicholson, W.}, + year = {2013}, + note = {R package version 0.1-2}, + url = {https://CRAN.R-project.org/package=DTMCPack}, + } +@article{DAMASIO2014, +title = {{Combining a regression model with a multivariate Markov chain in a forecasting problem}}, +journal = {Statistics \& Probability Letters}, +volume = {90}, +pages = {108-113}, +year = {2014}, +issn = {0167-7152}, +doi = {https://doi.org/10.1016/j.spl.2014.03.026}, +author = {Dam\'asio, B. and Nicolau, J.}, +keywords = {Multivariate Markov chain, Higher-order Markov chain, Forecasting}, +} +@article{Damasio2019, +author = {Dam\'asio, B. and Mendon\c{c}a, S.}, +title = {{Modelling insurgent-incumbent dynamics: Vector autoregressions, multivariate Markov chains, and the nature of technological competition}}, +journal = {Applied Economics Letters}, +volume = {26}, +number = {10}, +pages = {843-849}, +year = {2019}, +publisher = {Routledge}, +doi = {10.1080/13504851.2018.1502863} +} +@article{hajnal1956, +title={The ergodic properties of non-homogeneous finite Markov chains}, +volume={52}, +DOI={10.1017/S0305004100030991}, +number={1}, +journal={Mathematical Proceedings of the Cambridge Philosophical Society}, +publisher={Cambridge University Press}, +author={Hajnal, J. and Bartlett, M. S.}, +year={1956}, +pages={67–77} +} +@article{Ching2008, +author = {Ching, W. K. and Ng, M. K. and Fung, E. 
S.}, +journal = {Linear Algebra and its Applications}, +doi = {10.1016/j.laa.2007.05.021}, +volume = {428}, +number = {2-3}, +pages = {492--507}, +title = {{Higher-order multivariate Markov chains and their applications}}, +year = {2008} +} +@article{Ching2003, + author = {Ching, W. K. and Fung, E. S. and Ng, M. K.}, + journal = {The Journal of the Operational Research Society}, + number = {3}, + pages = {291--298}, + publisher = {Palgrave Macmillan Journals}, + title = {A Higher-Order Markov Model for the Newsboy's Problem}, + volume = {54}, + year = {2003} +} +@article{Ching2004, + author = {Ching, W. K. and Fung, E. S. and Ng, M. K.}, + journal = {International Naval Research Logistics}, + pages = {557--574}, + publisher = {Wiley Periodicals, Inc}, + title = {Higher-Order Markov Chain Models for Categorical Data +Sequences}, + volume = {51}, + year = {2004}, + doi = {10.1002/nav.20017} +} +@book{Hayashi, +publisher = {Princeton University Press}, +booktitle = {Econometrics}, +isbn = {0691010188}, +year = {2000}, +title = {Econometrics / Fumio Hayashi.}, +language = {eng}, +address = {Princeton, N.J.}, +author = {Hayashi, F.}, +keywords = {Econometrics}, +lccn = {00034665}, +} +@article{Billingsey1961, + URL = {http://www.jstor.org/stable/2034876}, + author = {Billingsley, P.}, + journal = {Proceedings of the American Mathematical Society}, + number = {5}, + pages = {788--792}, + publisher = {American Mathematical Society}, + title = {The Lindeberg-Lévy Theorem for Martingales}, + volume = {12}, + year = {1961} +} +@misc{DamasioM2020, + author = {Dam\'asio, B. and Mendonça, S.}, + title = {Leader-follower dynamics in real historical time: A Markovian test of +non-linear causality between sail and steam (co-)development, mimeo}, + year = {2020} +} +@article{Estrella1996, + author={Estrella, A. and Mishkin, F. S.}, + title={{The yield curve as a predictor of U.S. 
recessions}}, + journal={Current Issues in Economics and Finance}, + year=1996, + volume={2}, + number={Jun}, + pages={}, + month={}, + keywords={Forecasting; Recessions; Treasury bills}, + url={https://www.newyorkfed.org/research/current_issues/ci2-7.html} +} +@article{Dombrosky1996, +title = {Predicting real growth using the yield curve}, +author = {Dombrosky, A. M. and Haubrich, J.}, +year = {1996}, +journal = {Economic Review}, +number = {Q}, +volume = {I}, +pages = {26-35}, +url = {https://EconPapers.repec.org/RePEc:fip:fedcer:y:1996:i:qi:p:26-35} +} +@misc{data1, +author={FRED, Federal Reserve Bank of St. Louis}, +title={10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity [T10Y3M]}, +publisher={FRED, Federal Reserve Bank of St. Louis}, +year={2021}, +url={https://fred.stlouisfed.org/series/T10Y3M} +} +@misc{data2, +author={FRED, Federal Reserve Bank of St. Louis}, +title={S\&P Dow Jones Indices LLC, Dow Jones Industrial Average [DJIA]}, +publisher={FRED, Federal Reserve Bank of St. Louis}, +year={2021}, +url={https://fred.stlouisfed.org/series/DJIA} +} +@misc{data3, +author={FRED, Federal Reserve Bank of St. Louis}, +title={S\&P Dow Jones Indices LLC, S\&P 500 [SP500]}, +publisher={FRED, Federal Reserve Bank of St. Louis}, +year={2021}, +url= {https://fred.stlouisfed.org/series/SP500} +} +@article{Chauvet2016, +title = {A dynamic factor model of the yield curve components as a predictor of the economy}, +journal = {International Journal of Forecasting}, +volume = {32}, +number = {2}, +pages = {324-343}, +year = {2016}, +issn = {0169-2070}, +doi = {https://doi.org/10.1016/j.ijforecast.2015.05.007}, +author = {Chauvet, M. and Senyuz, Z.}, +} +@article{Tian2019, +author = {Tian, R. 
and Shen, G.}, +title = {Predictive power of Markovian models: Evidence from US recession forecasting}, +journal = {Journal of Forecasting}, +volume = {38}, +number = {6}, +pages = {525-551}, +doi = {https://doi.org/10.1002/for.2579}, +year = {2019} +} +@article{McMillan2021, +author = {McMillan, D. G.}, +title = {Predicting GDP growth with stock and bond markets: Do they contain different information?}, +journal = {International Journal of Finance \& Economics}, +volume = {26}, +number = {3}, +pages = {3651-3675}, +doi = {https://doi.org/10.1002/ijfe.1980}, +year = {2021} +} +@article{Damasio20182, +title = {The changing economic regimes and expected time to recover of the peripheral countries under the euro: A nonparametric approach}, +journal = {Physica A: Statistical Mechanics and its Applications}, +volume = {507}, +pages = {524-533}, +year = {2018}, +issn = {0378-4371}, +doi = {https://doi.org/10.1016/j.physa.2018.05.089}, +author = {Damásio, B. and Louçã, F. and Nicolau, J.} +} + +@article{jackson2011multi, + title={Multi-state models for panel data: the msm package for R}, + author={Jackson, Christopher}, + journal={Journal of statistical software}, + volume={38}, + pages={1--28}, + year={2011}, + doi = {10.18637/jss.v038.i0810.18637/jss.v038.i08} +} + +@Manual{pracma, + title = {pracma: Practical Numerical Math Functions}, + author = {Hans W. 
Borchers}, + year = {2022}, + note = {R package version 2.4.2}, + url = {https://CRAN.R-project.org/package=pracma}, + } + +@Book{ggplot2, + author = {Hadley Wickham}, + title = {ggplot2: Elegant Graphics for Data Analysis}, + publisher = {Springer-Verlag New York}, + year = {2016}, + isbn = {978-3-319-24277-4}, + url = {https://ggplot2.tidyverse.org}, +} + +@Manual{gridextra, + title = {gridExtra: Miscellaneous Functions for "Grid" Graphics}, + author = {Baptiste Auguie}, + year = {2017}, + note = {R package version 2.3}, + url = {https://CRAN.R-project.org/package=gridExtra}, +} \ No newline at end of file diff --git a/_articles/RJ-2024-006/genmarkov.tex b/_articles/RJ-2024-006/genmarkov.tex new file mode 100644 index 0000000000..b06592fe3d --- /dev/null +++ b/_articles/RJ-2024-006/genmarkov.tex @@ -0,0 +1,747 @@ +% !TeX root = RJwrapper.tex +\title{GenMarkov: Modeling Generalized Multivariate Markov Chains in R} + + +\author{by Carolina Vasconcelos and Bruno Damásio} + +\maketitle + +\abstract{% +This article proposes a new generalization of the Multivariate Markov Chains (MMC) model. The future values of a Markov chain commonly depend on only the past values of the chain in an autoregressive fashion. The generalization proposed in this work also considers exogenous variables that can be deterministic or stochastic. Furthermore, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. The Monte Carlo simulation study findings showed that our model consistently detected a non-homogeneous Markov chain. Besides, an empirical illustration demonstrated the relevance of this new model by estimating probability transition matrices over the space state of the exogenous variable. An additional and practical contribution of this work is the development of a novel R package with this generalization. 
+} + +\hypertarget{introduction}{% +\section{Introduction}\label{introduction}} + +Multivariate Markov chains (MMC) have a wide range of applications in various fields. Hence, several studies and generalizations of the MMC models have been made. However, the availability of packages that allow the estimation and application of these models is scarce, and most of these methods use algorithms and software that are not broadly available or can only be applied in particular situations. In the last few years, R software has been gaining importance in the field of statistical computing. This phenomenon might be because it is free and open-source software, which compiles and runs on a wide variety of operating systems. Specifically, in R software, there are some available packages related to Markov chains (MC) and MMC. For example, the \CRANpkg{march} package (Maitre and Emery 2020; Berchtold, Maitre, and Emery 2020) allows the computation of various Markovian models for categorical data, including homogeneous Markov chains of any order, MTD models, Hidden Markov models, and Double Chain Markov Models. Ogier Maitre developed this package with contributions from Andre Berchtold, Kevin Emery, Oliver Buschor, and Andre Berchtold maintains it. All the models computed by this package are for univariate categorical data. The \CRANpkg{markovchain} package (Spedicato 2017) contains functions and methods to create and manage discrete-time Markov chains. In addition, it includes functions to perform statistical and probabilistic analysis (analysis of their structural properties). Finally, the \CRANpkg{DTMCPack} package (Nicholson 2013) contains a series of functions that aid in both simulating and determining the properties of finite, discrete-time, discrete-state Markov chains.
There are two main functions: \texttt{DTMC} and \texttt{MultDTMC}, which produce \(n\) iterations of a Markov Chain(s) based on transition probabilities and an initial distribution given by the user, for the univariate and multivariate case, respectively. This last package is the only one available in R for MMC. In general, the work on MMC models is mostly based on improving the estimation methods and/or making the model more parsimonious. In this work, we aim to develop a new generalization that considers exogenous variables. Specifically, the effects of the MMC's past values and the effects of pre-determined or exogenous covariates are considered in our model by considering a non-homogeneous Markov chain. Additionally, we address statistical inference and implement these methods in an R package. The R package includes three functions: \texttt{multimtd}, \texttt{multimtd\_probit} and \texttt{mmcx}. The first two functions estimate the MTD model for multivariate categorical data, with Ching's specification (Ching, Fung, and Ng 2002) and with the Probit specification (Nicolau 2014), respectively. The last function allows the estimation of our proposed model, the Generalized Multivariate Markov Chain (GMMC) model. The R package, \CRANpkg{GenMarkov}, with these three functions is available in the Comprehensive R Archive Network (CRAN) at \url{https://CRAN.R-project.org/package=GenMarkov}. + +\hypertarget{multivariate-markov-chains}{% +\section{Multivariate Markov chains}\label{multivariate-markov-chains}} + +Markov chains can be appropriate for representing dependencies between successive observations of a random variable. However, when the order of the chain or the number of possible values increases, Markov chains lack parsimony. In this context, Jacobs and Lewis (1978), Pegram (1980) and Logan (1981) proposed several models for HOMC. 
Notwithstanding these developments, the Mixture Transition Distribution model (Raftery 1985) proved to be more suitable to model HOMC, which overshadowed the previously proposed models. Several relevant extensions of the MTD model emerged: the Multimatrix MTD (Berchtold 1995, 1996), which allowed modeling the MTD by using a different \(m \times m\) transition matrix for each lag, the Infinite-Lag MTD model that assumes an infinite lag order (\(l = \infty\)), which was first considered by Mehran (1989) and later developed by Le, Martin, and Raftery (1996) in a more general context. Finally, the MTD with General State Spaces allowed modeling more general processes with an arbitrary space state (Martin and Raftery 1987; Adke and Deshmukh 1988; Wong and Li 2001). Although the MTD model presents a more parsimonious approach to model Markov chains with order higher than one, it has weaknesses. Namely, when considering more than one data sequence, one represents the MMC as a HOMC, by expanding the state-space. This approach could result in a more complex probability transition matrix. Consequently, this can make the estimation unfeasible as the order, states, and the number of data sequences increase. Additionally, the model assumes the same transition matrix for each lag. In this setting, Ching, Fung, and Ng (2002) determined an alternative to handle the unfeasibility of the conventional multivariate Markov chain (MMC) by proposing a model with fewer parameters. The model developed is essentially the same as the MTD. However, it considers a different \(m \times m\) transition matrix for each lag and considers more than one data sequence. In the proposed multivariate Markov chain model, Ching, Fung, and Ng (2002) assume the following relationship: + +Let \(x_t^{(j)}\) be the state vector of the \(j\)th sequence at time \(t\). 
If the \(j\)th sequence is in state \(l\) at time \(t\) then + +\begin{equation} +x_{t+1}^{(j)} = \sum_{k=1}^s \lambda_{jk}P^{(jk)}x_{t}^{(k)}, \text{for } j =1, 2, \dots, s +\label{eq:eq1} +\end{equation} +where \(0 \leq \lambda_{jk} \leq 1\) for \(j \leq s, k \leq s\) and \(\sum_{k=1}^s \lambda_{jk} =1\) for \(j=1, 2, \dots, s\). The \(\lambda_{jk}\) can be interpreted as the mixing probability of the \(j\)th state to the \(k\)th state. + +The state probability distribution of the \(k\)th sequence at time \((t + 1)\) depends on the weighted average of \(P^{(jk)}x_{t}^{(k)}\) . Here \(P^{(jk)}\) is a transition probability matrix from the states in the \(k\)th sequence to the states in the \(j\)th sequence and \(x_t^{(k)}\) is the state probability distribution of the \(k\)th sequences at time \(t\). In matrix form: + +\begin{equation} +\underline{x}_{t+1}^{(j)} \equiv +\left[ +\begin{array}{c} + x_{t+1}^{(1)} \\ + \vdots \\ + x_{t+1}^{(s)} +\end{array} \right ] += +\left[ +\begin{array}{ccc} +\lambda_{11}P^{(11)} & \dots & \lambda_{1s}P^{(1s)}\\ +\vdots & \ddots & \vdots\\ +\lambda_{s1}P^{(s1)}& \dots & \lambda_{ss}P^{(ss)} +\end{array} \right ] +\left[ +\begin{array}{c} + x_{t}^{(1)} \\ + \vdots \\ + x_{t}^{(s)} +\end{array} \right ] +\equiv +Q \underline{x}_{t} +\label{eq:eq2} +\end{equation} where \(Q\) is an \(ms \times ms\) block matrix (\(s \times s\) blocks of \(m \times m\) matrices) and \(x_t\) is a stacked \(ms\) column vector (\(s\) vectors, each one with \(m\) rows). + +The matrices \(P^{(jk)}\) can be estimated for each data sequence by counting the transition frequency from the states in the \(k\)th sequence to those in the \(j\)th sequence, obtaining the transition frequency matrix for the data sequence. After normalization, the estimates of the transition probability matrices, i.e., \(\widehat{P}^{(jk)}\), are obtained. 
Regarding the \(\lambda_{jk}\) coefficients, the estimation method proposed by Ching, Fung, and Ng (2002) involves the following optimization problem: + +\begin{equation} +\min_{\lambda} \max_{i} \vert [ \sum_{k=1}^s \lambda_{jk} \widehat{P}^{(jk)} \widehat{\boldsymbol{x}}^{(k)} - \widehat{\boldsymbol{x}}^{(j)} ] \vert +\label{eq:eq3} +\end{equation} + +\[ \text{s.t. } \sum_{k=1}^s \lambda_{jk} = 1 \text{ and } \lambda_{jk} \geq 0 \] Besides this, different models have been proposed for multiple categorical data sequences. Kijima, Komoribayashi, and Suzuki (2002) proposed a parsimonious MMC model to simulate correlated credit risks. Siu et al. (2005) proposed an easy to implement model; however, its applicability was limited by the number of parameters involved. Ching, Ng, and Fung (2008) proposed a simplified model based on an assumption proposed in Zhang, King, and Hyndman (2006). Zhu and Ching (2010) proposed a method of estimation based on minimizing the prediction error with equality and inequality restrictions and Nicolau and Riedlinger (2014) proposed a new approach to estimate MMC which avoids imposing restrictions on the parameters, based on non-linear least squares estimation, facilitating the model estimation and the statistical inference. Berchtold (2003) proposed an MTD model for heteroscedastic time series. Lastly, Wang, Huang, and Ching (2014) proposed a new multivariate Markov chain model to reduce the number of parameters. Thus, generally, the models used in the published papers were developed by Ching, Fung, and Ng (2002) or were a consequent generalization of them and addressed the MMC as an end in itself. In Damásio (2013) and Damásio and Nicolau (2014), a different and innovative concept was proposed: the usage of MMC as regressors in a certain model. 
Hence, given that the MMC Granger causes a specific dependent variable, and taking advantage of the information about the past state interactions between the MMC categories, it was possible to forecast the current dependent variable more accurately. Other relevant contributions are related to the optimization algorithm, as in Lèbre and Bourguignon (2008) and Chen and Lio (2009), and to empirical applications (Ching, Fung, and Ng 2003; Ching and Ng 2006; Damásio 2018; Damásio and Mendonça 2019, 2020). Also, Damásio and Nicolau (2020) proposed a new methodology for detecting and testing the presence multiple structural breaks in a Markov chain occurring at unknown dates. In the vast majority of MMC models' studies, a positive correlation between the different data sequences is assumed due to the restrictions imposed. This aspect means it is always considered that at moment \(t\), an increase in a state probability for a data sequence has an increasing impact on another data sequence, for time \(t+1\). Thereupon, if one has a negative correlation between series, the parameter estimates are forced to be zero. The solution to this problem is very straightforward; one can relax the assumptions and not assume the constraints. However, that means the results produced by the model will no longer be probabilities. Raftery and Tavaré (1994) presented an alternative, by dropping the positivity condition and imposing another set of restrictions. Ching, Ng, and Fung (2008) also tackled this issue and proposed a method where one splits the \(Q\) matrix into the sum of two other matrices and one represents the positive correlations and another the negative correlations. Also, in Nicolau (2014), a specification completely free from constraints, inspired by the MTD model, was proposed, facilitating the estimation procedure and, at the same time, providing a more accurate specification for \(P_j(i_0 | i_1, \dots, i_s)\). 
The model was: + +\begin{equation} +P_j(i_0 | i_1, \dots, i_s) = P_j^{\Phi}(i_0 | i_1, \dots, i_s) := +\\ + \frac{\Phi(\eta_{j0} + \eta_{j1}P(i_0|i_1) + \dots + \eta_{js}P(i_0|i_s))}{\sum_{k=1}^m \Phi(\eta_{j0} + \eta_{j1}P(k|i_1) + \dots + \eta_{js}P(k|i_s))} + \label{eq:eq4} +\end{equation} where \(\eta_{ji} \in \mathbb{R}(j = 1, \dots, s; i = 1, \dots, m)\) and \(\Phi\) is the (cumulative) standard normal distribution function. + +This specification is denoted as an MTD-Probit model. The log-likelihood is given by: \begin{equation} +LL = \sum_{i_1, i_2, \dots, i_s, i_0} n_{i_1, i_2, \dots, i_s, i_0} \log(P_j^{\Phi}(i_0 | i_1, \dots, i_s) ) \label{eq:eq5} +\end{equation} and the maximum likelihood estimator is defined, as usual, as \(\widehat{\eta} = \text{arg max}_{\eta_{j1}, \dots, \eta_{js}} LL\). The parameters \(P_{jk}(i_0|i_1)\), \(k = 1, \dots, s\) can be estimated in advance, through the consistent and unbiased estimators proposed by Ching, Fung, and Ng (2002): + +\begin{equation} +\widehat{P}_{jk}(i_0|i_1) = \frac{n_{i_1i_0}}{\sum_{i_0=1}^n n_{i_1 i_0}} \label{eq:eq6} +\end{equation} This specification can be superior to the MTD because the estimation procedure is easier, and the standard numerical optimization routines can be easily applied in the absence of constraints. However, similarly to the standard MTD, the likelihood is not a strictly concave function on the entire parameter state-space, thus the choice of starting values is still important. Additionally, the model describes a broader range of possible dependencies since the parameters are not constrained. Moreover, this proposed model is more accurate than the MTD model. For more details on this, see Nicolau (2014). + +Overall, the published work on MMC models was mostly based on improving the estimation methods and/or making the model more parsimonious. 
In Damásio (2013) and Damásio and Nicolau (2014), a different approach was used, and the work developed focused on the usage of MMC as regressors in a certain model. Notably, it showed that an MMC can improve the forecast of a dependent variable. In a way, it demonstrated that an MMC can be an end in itself, but it can be an instrument to reach an end or a purpose. In this work, the opposite will be developed: instead of considering an MMC as regressors, a model in which a vector with pre-determined exogenous variables is part of \(\mathcal{F}_{t-1}\) is proposed. + +\hypertarget{covariates-in-markov-chain-models}{% +\section{Covariates in Markov chain models}\label{covariates-in-markov-chain-models}} + +Regarding the inclusion of covariates in Markov chains models, Regier (1968) proposed a two-state Markov chain model, where the transition matrix probabilities were a function of a parameter, \(q\), that described the tendency of the subject to move from state to state. Kalbfleisch and Lawless (1985) proposed a panel data analysis method under a continuous-time Markov model that could be generalized to handle covariate analysis and the fitting of certain non-homogeneous models. This work overcame the limitations of Bartholomew (1968), Spilerman and Singer (1976) and Wasserman (1980) methodologies, by developing a new algorithm that provided a very efficient way of obtaining maximum likelihood estimates. Also, Muenz and Rubinstein (1985) developed a Markov model for covariates dependence of binary sequences, where the transitions probabilities were estimated through two logistic regressions that depended on a set of covariates. Essentially, Muenz and Rubinstein (1985) modeled a non-homogeneous Markov chain through logistic regression, considering only two states. Islam, Arabia, and Chowdhury (2004) developed an extension of this model considering three states, and Islam and Chowdhury (2006) generalized this approach for HOMC. 
Additionally, Azzalini (1994) proposed a model to study the influence of time-dependent covariates on the marginal distribution of a binary response in serially correlated binary data, where Markov chains are expressed in terms of transitional probabilities. Jackson (2011) proposed a Markov model for panel data, which allowed for the transitions intensities to vary between individuals or constant time-dependent covariates. Specifically, this work allowed to account for different intensities throughout transitions of states and include individual-specific covariates. The time-inhomogeneos model proposed is restricted to piecewise-constant intensities. The implementation of this work is available in the package \CRANpkg{msm}. More recently, Bolano (2020) proposed an MTD-based approach to handle categorical covariates, that considers each covariate separately and combines the effects of the lags of the MTD and the covariates employing a mixture model. Specifically, the model is given by: + +\begin{equation} +P(X_t = k \mid X_{t-1} = i, C_1 = c_1, \dots, C_l = c_l) \approx \theta_0 a_{ik} + \sum_{h=1}^l \theta_h d_{c_{h}k} \label{eq:eq7} +\end{equation} + +where \(a_{ik}\) is the transition probability from state \(i\) to state \(k\), as in a conventional Markov chains and \(d_{c_{h}k}\) is the probability of observing the states \(k\) given the modality \(c_h\) of the covariate \(h\). Lastly, \(\theta_0, \dots, \theta_l\) are the weights of the explanatory elements of the model. + +According to the literature presented, several researchers have proposed methodologies or generalizations to include covariates in Markov chain models. Primarily for social sciences and health applications, where the transition probabilities were generally modeled through logistic regression. However, there has been an increased focus on categorical covariates, opposing continuous covariates and a lack of approaches to multivariate Markov chain models. 
Thus, with this work, we aim to tackle this research gap. + +\hypertarget{multivariate-markov-chains-with-covariates}{% +\section{Multivariate Markov chains with covariates}\label{multivariate-markov-chains-with-covariates}} + +\hypertarget{theoretical-model}{% +\subsection{Theoretical model}\label{theoretical-model}} + +In this work, a new generalization of Ching, Fung, and Ng (2002) MMC model is presented: the GMMC model, that is, we will consider exogeneous or pre-determined covariates in the \(\sigma\) - algebra generated by the available information until \(t-1\) (\(\mathcal{F}_{t-1}\)). These variables can be deterministic or stochastic and do not necessarily need to be reported at time \(t\). Broadly, the model is given by: + +\begin{equation} +P(S_{jt} = k | \mathcal{ F}_{t-1} ) = P(S_{jt} = k | S_{1t-1} = i_1, S_{2t-1} = i_2, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq8} +\end{equation} We can specify this model as proposed by Ching, Fung, and Ng (2002) with Raftery's notation: + +\begin{multline} +P(S_{jt} = i_0 | S_{1t-1} = i_1,\dots, S_{st-1} = i_s, \boldsymbol{x}_t) \equiv \\ +\lambda_{j1}P(S_{jt} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_t) + \dots + \lambda_{js}P(S_{jt} = i_0 | S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq9} +\end{multline} subject to the usual constraints. + +\hypertarget{estimation-and-inference}{% +\subsection{Estimation and inference}\label{estimation-and-inference}} + +This proposed model is estimated through MLE, similar to the standard MTD model. The log-likelihood is given by: + +\begin{equation} +LL = \sum_{t = 1}^n log P(S_{jt} = i_0 | S_{1t-1} = i_1, \dots, S_{st-1} = i_s, \boldsymbol{x}_t) \label{eq:eq10} +\end{equation} + +Additionally, the probabilities can be estimated through an multinomial logit model. The proof for consistency and asymptotic distribution is available in the Supplementary Material section. 
+ +\hypertarget{monte-carlo-simulation-study}{% +\subsection{Monte Carlo simulation study}\label{monte-carlo-simulation-study}} + +A Monte Carlo simulation study was designed to evaluate the dimension and power of the test parameters of the proposed model. The R statistical environment was used for all computations. This simulation study was comprised of two parts. + +\hypertarget{part-i-detect-a-non-homogeneous-markov-chain}{% +\subsubsection{Part I: Detect a non-homogeneous Markov chain}\label{part-i-detect-a-non-homogeneous-markov-chain}} + +First, we considered two sequences with two and three states. The main goal was to assess if the model detected the presence of a non-homogeneous Markov chain correctly and if the estimate of the parameter would correspond to the expected. So, given two sequences, one generated through a non-homogeneous Markov chain and the other generated through a homogeneous Markov chain, it would be expected that the parameter associated with the transition probabilities of the first sequence would be one and the parameter associated with the transition probabilities of the second sequence would be zero. With this in mind, the transitions probabilities of the first sequence were estimated through a logistic regression, where parameters of this regression were randomly generated in R, and the second sequence was generated through a first-order Markov chain. 
Hence, for both states cases considered, it was expected that the estimated regression would be: + +\begin{multline} +P(S_{1t} = i_0 | S_{1t-1} = i_1, S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) = \\ +1 \times P(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}) + 0 \times P(S_{1t} = i_0 | S_{2t-1} = i_2, \boldsymbol{x}_{t-1}) \label{eq:eq11} +\end{multline} + +To assess the test power and dimension, we used the Wald test with the following hypothesis: + +\begin{table} + +\caption{\label{tab:dim-pow-tex}Power and dimension of test assessment} +\centering +\begin{tabular}[t]{l|l|l} +\hline + & Hypothesis & Test\\ +\hline +Power & $H_0: \lambda_{11} = 0$ & $\frac{\widehat{\lambda}_{11}^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}$\\ +\hline + & $H_0: \lambda_{12} = 1$ & $\frac{(\widehat{\lambda}_{12}-1)^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}$\\ +\hline +Dimension & $H_0: \lambda_{11} = 1$ & $\frac{(\widehat{\lambda}_{11}-1)^2}{se(\widehat{\lambda}_{11})^2} \sim \chi^2_{(1)}$\\ +\hline + & $H_0: \lambda_{12} = 0$ & $\frac{\widehat{\lambda}_{12}^2}{se(\widehat{\lambda}_{12})^2} \sim \chi^2_{(1)}$\\ +\hline +\end{tabular} +\end{table} + +The simulation procedure was performed as follows: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + Generate the values of the coefficients for the probability transition matrix of series \(S_{1t}\) randomly; +\item + Generate the probability transition matrix of series \(S_{2t}\) randomly; +\item + Set the initial value of \(S_{2t}\) to 1 and simulate the following from the defined probability transition matrix; +\item + In each iteration (of 1000 repetitions), + + \begin{itemize} + \tightlist + \item + Generate \(X_t \sim N(2,25)\); + \item + Generate the time-varying probabilities of series \(S_{1t}\) through the values of the fixed coefficients and the lagged variable \(x_t\); + \item + Set the initial values of the series \(S_{1t}\) as 1; + \item + For each period \(t\), simulate the next 
state of \(S_{1t}\) from the probabilities simulated for that moment; + \item + Estimate the model through the function \texttt{mmcx}; + \item + Calculate the Wald test and add to the counter if it is rejected. + \end{itemize} +\end{enumerate} + +\begin{figure} + +{\centering \includegraphics{genmarkov_files/figure-latex/figure-2states-1} + +} + +\caption{Simulation study results for two-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test remains stable regardless sample size. Power of test increases with sample size. The proposed model detects the presence of non-homogenenous Markov Chain.}\label{fig:figure-2states} +\end{figure} + +Considering two states, the test dimension was at 5.7\% with a sample size of 100 observations, sightly increased with 500 observations, and returned to the expected values in 1000 and 5000 observations. For a sample size of 100, 500, and 1000 observations, we have low test power. So, when considering two states, the sample must have at least 5000 observations, or, if that is not possible, consider a higher significance level when testing for individual significance. + +\begin{figure} + +{\centering \includegraphics{genmarkov_files/figure-latex/figure-3states-1} + +} + +\caption{Simulation study results for three-states, displaying the proportion of rejections of the null hypothesis for two parameter values. Dimension of test decreases as sample size increases. Power of test is stable regardless of sample size. The proposed model detects the presence of non-homogenenous Markov Chain.}\label{fig:figure-3states} +\end{figure} + +Considering three states, the test dimension was 9.7\% for a sample size of 100 observations, 0.2\% for a sample size of 500 observations, and 0.3\% for a sample size of 1000. 
Regarding the test power, we see similar behavior, for a sample of 100 observations, the test power was 90.5\%, and from a sample of 500 observations, we reach a test power of 100\%. Thus, when considering three states, one may consider a sample of 500 observations without compromising the test power and dimension. + +\newpage + +\hypertarget{part-ii-detecting-parameters-assigned-values}{% +\subsubsection{Part II: Detecting Parameters Assigned Values}\label{part-ii-detecting-parameters-assigned-values}} + +Secondly, we performed a simulation study where we considered two non-homogeneous Markov chain with two states. Here, the main goal was to assess if the model correctly detected the parameters assigned. So, in this case, we started by generating the terms of the model proposed. These terms were estimated through logistic regression, and the parameters of this regression were randomly generated in R. Similarly to Part I, we considered a Wald test to assess the power and dimension of the test. The simulation procedure was performed as follows: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + Generate the values of the coefficients to calculate the probability transition matrices randomly; +\item + In each iteration (of 1000 repetitions), + + \begin{itemize} + \tightlist + \item + Generate \(\{x_t\} \sim N(2,25)\); + \item + Generate the probabilities \(P \left(S_{jt}|S_{st-1}, x_{t-1} \right)\), with \(j=1,2\) and \(s=1,2\). + \item + Set the initial values of the series \(S_{1t}\) and \(S_{2t}\) as 1; + \item + For each period \(t\), calculate the probabilities \(P \left(S_{1t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) and \(P \left( S_{2t}|S_{1t-1}, S_{2t-1}, x_{t-1} \right)\) through the assigned values of the \(\lambda\)'s. Considering the calculated probabilities, simulate the next state for each series, \(S_{1t}\) and \(S_{2t}\). 
+ \item + Estimate the model through the function \texttt{mmcx}; + \item + Calculate the Wald test and add to the counter if it is rejected. + \end{itemize} +\end{enumerate} + +The probabilities \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) and \(P\left(S_{1t}|S_{2t-1}, x_{t-1}\right)\) presented some differences regarding its values' distributions. Specifically, \(P\left(S_{1t}|S_{1t-1}, x_{t-1} \right)\) had more extreme probabilities values, with the minimum value being close to 0 and the maximum value being close to 1. And, the probabilities \(P\left(S_{1t}|S_{2t-1}, x_{t-1} \right)\) had more moderate values, with the minimum value being, on average, 0.3 and the maximum value, 0.7. When the probabilities have values close to 1, one says that the states/regimes are persistent. We calculated the power and dimension of test for each value of \(\lambda\) when the estimated probabilities are moderate and when they are extreme. Hence, considering equation 1: + +\begin{multline} +P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\dots, S_{2t-1} = i_2, \boldsymbol{x}_{t-1} \right) = \\ +\lambda_{11}P\left(S_{1t} = i_0 | S_{1t-1} = i_1,\boldsymbol{x}_{t-1}\right) + \lambda_{12}P\left(S_{1t} = i_0 | S_{2t-1} = i_s, \boldsymbol{x}_{t-1} \right) \label{eq:eq12} +\end{multline} + +The parameter \(\lambda_{11}\) will be associated with more extreme probabilities and \(\lambda_{12}\) will be associated with more moderate probabilities. + +\begin{figure} + +{\centering \includegraphics{genmarkov_files/figure-latex/figure-persistent-1-1} + +} + +\caption{Simulation study results for persistent states on low values of the parameters (case 1), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension decreases as sample size increases. Power of test increases with sample size. 
The proposed model has low power of test when low parameter values are associated with persistent states.}\label{fig:figure-persistent-1} +\end{figure} + +\begin{figure} + +{\centering \includegraphics{genmarkov_files/figure-latex/figure-persistent-2-1} + +} + +\caption{Simulation study results for persistent states on high values of the parameters (case 2), displaying the proportion of rejections of the null hypothesis for four parameter values. Dimension and power of test increase as sample size increases. The results point towards a low test power in this setting.}\label{fig:figure-persistent-2} +\end{figure} + +When the states are persistent and the parameter's value is low (i.e., 0.2 and 0.4), we have low test power. By increasing this value, the power of test increases as well. When the states are not persistent, we do not have a clear pattern regarding the power of test, for a value of the parameter of 0.2, the power of test is still low (although not as low as the first scenario), increases when we have a value of 0.4, decreases when the value is 0.6 and increases again when the value is 0.8. Overall, the estimated standard errors seem high, leading to low test power. Regarding the test dimension, when we have a higher weight associated with the non-persistent states, the test dimension converges to 0. However, when this weight is associated with the persistent states, the test dimension increases with the sample size, reaching a value of 10\% in some cases. Hence, one must use a 10\% significance level to perform statistical inference on the parameters in this situation. + +\hypertarget{software-implementation}{% +\subsection{Software implementation}\label{software-implementation}} + +Regarding the software implementation for each function, for the \texttt{multimtd} function the estimation method was presented in Berchtold (2001) applied to the multivariate case. 
For \texttt{multimtd\_probit}, a package for numerical maximization of the log-likelihood, \CRANpkg{maxLik} (Henningsen and Toomet 2011), was used. This package performs Maximum Likelihood estimation through different optimization methods that the user can choose. The optimization methods available are Newton-Raphson, Broyden-Fletcher-Goldfarb-Shanno (BFGS), the BFGS algorithm implemented in R, Berndt-Hall-Hall-Hausman, Simulated ANNealing, Conjugate Gradients, and Nelder-Mead. Finally, for the \texttt{mmcx} function, a different approach was used. Unlike the MTD-Probit, the model proposed has equality and inequality restrictions in the parameters. The \CRANpkg{maxLik} (Henningsen and Toomet 2011) package only allows one type of restriction for each Maximum Likelihood estimation, so it was not possible to use this package to estimate the proposed model with exogenous variables. Hence, the algorithm used was the Augmented Lagrangian method, available in the \CRANpkg{alabama} (Varadhan 2015) package through the function \texttt{auglag}. This estimation method for the proposed model is not very common, however, it has been applied to Markov chain models (Rajarshi 2013). The GMMC model's probabilities were estimated through a Multinomial Logit using \texttt{multinom} of the \CRANpkg{nnet} package (Venables and Ripley 2002). + +Additionally, the hessian matrices were also computed, which allowed performing statistical inference. The \texttt{maxLik} and \texttt{auglag} compute the Hessian matrices with the estimates. For the function \texttt{multimtd}, since the optimization procedure of Berchtold (2001) was used, the hessian was computed through the second partial derivatives. The function \texttt{multi.mtd} requires the following elements: + +\begin{itemize} +\item + \texttt{y}, a matrix of the categorical data sequences. +\item + \texttt{deltaStop}, the delta below which the optimization phases of the parameters stop. 
+\item + \texttt{is\_constrained}, flag indicating whether the function will consider the usual set of constraints (usual set: \textit{TRUE}, new set of constraints: \textit{FALSE}). +\item + \texttt{delta}, the amount of change to increase/decrease in the parameters for each iteration of the optimization algorithm. +\end{itemize} + +The last three arguments concern the optimization procedure. For more details see Berchtold (2001). Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, to estimate the model and obtain the results: + +\begin{verbatim} +multi.mtd(y=cbind(s1,s2), deltaStop=0.0001, is_constrained=TRUE, delta=0.1) +\end{verbatim} + +The function \texttt{multi.mtd\_probit} requires the following arguments: + +\begin{itemize} +\tightlist +\item + \texttt{y}, a matrix of the categorical data sequences. +\item + \texttt{initial}, a vector of the initial values of the parameters. +\item + \texttt{nummethod}, the numerical maximization method, currently either ``NR'' (for Newton-Raphson), ``BFGS'' (for Broyden-Fletcher-Goldfarb-Shanno), ``BFGSR'' (for the BFGS algorithm implemented in R), ``BHHH'' (for Berndt-Hall-Hall-Hausman), ``SANN'' (for Simulated ANNealing), ``CG'' (for Conjugate Gradients), or ``NM'' (for Nelder-Mead). Lower-case letters (such as ``nr'' for Newton-Raphson) are allowed. The default method is ``BFGS''. For more details see \CRANpkg{maxLik} (Henningsen and Toomet 2011) package. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2} again, to estimate the model an obtain the results with BFGS maximization method: + +\begin{verbatim} +multi.mtd_probit(y = cbind(s1,s2), initial=c(1,1,1), nummethod='bfgs') +\end{verbatim} + +Finally, the function \texttt{mmcx} requires the following elements: + +\begin{itemize} +\tightlist +\item + \texttt{y}, a matrix of categorical data sequences. +\item + \texttt{x}, a matrix of covariates (exogeneous variables). 
+\item + \texttt{initial}, a vector of the initial values of the parameters. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, and a vector of an exogeneous variables, \texttt{x}, to estimate the model and obtain the results: + +\begin{verbatim} +mmcx(y = cbind(s1,s2), x = cbind(x), initial=c(1,1)) +\end{verbatim} + +These functions return a list with the parameter estimates, standard errors, z-statistics, p- values, and the log-likelihood function value for each equation. + +The package offers an additional function that allows to obtain the transition probability matrices of \texttt{mmcx} considering a specific value of \texttt{x} defined by the user. The function is \texttt{MMC\_tpm} and requires the following elements: + +\begin{itemize} +\tightlist +\item + \texttt{s}, a matrix of categorical data sequences. +\item + \texttt{x}, a matrix of covariates (exogeneous variables). +\item + \texttt{value}, a single value of \texttt{x}, to condition the probability transition matrices. +\item + \texttt{result}, a list returned by the function \texttt{mmcx} containing the model's estimates. +\end{itemize} + +Considering two vectors of two categorical data sequences, \texttt{s1} and \texttt{s2}, a vector of an exogeneous variables, \texttt{x} and \texttt{res} the list returned by the function \texttt{mmcx}, to obtain the transition probability matrices: + +\begin{verbatim} +MMC_tpm(s = cbind(s1,s2), x = cbind(x), value = max(x), result = res) +\end{verbatim} + +The function returns an array containing the probability transition matrices, conditioned on a specific value of \texttt{x}, for each equation. + +\hypertarget{illustration}{% +\section{Illustration}\label{illustration}} + +Markov chain models are used in interdisciplinary areas, such as economics, business, biology, and engineering, with applications to predict long-term behavior from traffic flow to stock market movements, among others. 
Modeling and predicting stock market returns is particularly relevant for investors and policy makers. Since the stock market is a volatile environment, and the returns are difficult to predict, estimating the set of probabilities that describe these movements might provide relevant input. Additionally, incorporating the effect of key macroeconomic variables could provide a more accurate picture of this specific environment.
+
+The following empirical illustration aims to model stock returns of two indexes as a function of the interest rate spread, specifically the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity.
+
+The interest rate spread is a key macroeconomic variable and provides valuable information regarding the state of the economy. Specifically, it has been used to forecast recessions as in Estrella and Mishkin (1996), Dombrosky and Haubrich (1996), Chauvet and Senyuz (2016), Tian and Shen (2019) and McMillan (2021). Generically, short-term yields are lower than long-term yields when the economy is in expansion. On the other hand, short-term yields are higher than long-term yields when the economy is in recession. The difference between these yields (or, more specifically, the yield curve's slope) can be used to forecast the state of the economy. Hence, this indicator might provide relevant input for investors.
+
+We considered the 5-week-day daily stock returns (\(r_t=100 \times \log(P_t/P_{t-1})\), where \(P_t\) is the adjusted close price) of two indexes, S\&P500 and DJIA, from November \(11^{th}\) 2011 to September \(1^{st}\) 2021 (2581 observations). Additionally, we considered the interest rate spread (\(spread_{t}\)), the 10-Year Treasury Constant Maturity Minus 3-Month Treasury Constant Maturity. The data was retrieved from FRED. Below, we have the descriptive statistics of these variables.
+
+\begin{table}
+
+\caption{\label{tab:summary-stat-tex}Summary statistics of $stockreturns$ dataset}
+\centering
+\begin{tabular}[t]{l|l|l|l|l|l|l}
+\hline
+Variable & Minimum & 1$^{st}$ Quantile & Median & Mean & 3$^{rd}$ Quantile & Maximum\\
+\hline
+$spread_{t}$ & -0.52 & 0.92 & 1.54 & 1.454 & 2.03 & 2.97\\
+\hline
+$r_{t;SP500}$ & -12.765 & -0.32 & 0.07 & 0.054 & 0.518 & 8.968\\
+\hline
+$r_{t;DJIA}$ & -13.842 & -0.327 & 0.071 & 0.046 & 0.508 & 10.764\\
+\hline
+\end{tabular}
+\end{table}
+
+Moreover, to apply the model proposed, it is necessary to have a categorical time series, thus we applied the following procedure:
+
+\[
+S_{st}=
+\begin{cases}
+1, r_t \leq \widehat{q}_{s;0.25}\\
+2, \widehat{q}_{s;0.25} < r_t < \widehat{q}_{s;0.75} \\
+3, r_t \geq \widehat{q}_{s;0.75}\\
+\end{cases}
+\]
+
+where \(\widehat{q}_{s;\alpha}\) is the estimated quantile of order \(\alpha\) of the marginal distribution of \(r_t\). Considering this illustration and the model proposed, we will have two equations:
+
+\begin{multline}
+P(S_{sp500,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{11} P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{12} P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1}) \label{eq:eq13}
+\end{multline}
+
+\begin{multline}
+P(S_{djia,t} | S_{sp500, t-1}, S_{djia, t-1}, spread_{t-1}) = \\ \lambda_{21} P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1}) + \lambda_{22} P(S_{djia,t} | S_{djia, t-1}, spread_{t-1}) \label{eq:eq14}
+\end{multline}
+
+In Figures \ref{fig:fig11} to \ref{fig:fig22}, generated through \CRANpkg{ggplot2} (Wickham 2016) and \CRANpkg{gridExtra} (Auguie 2017), we have the smoothed conditional probabilities of both series, depending on \(spread_{t-1}\). The number of observations is high, and the probabilities varied abruptly in a small time frame, making the plots hard to read.
To simplify, a moving average model (from \CRANpkg{pracma} (Borchers 2022)) of order 5, due to the frequency of the data, was adjusted to these probabilities to illustrate how they evolve throughout time. These plots represent the probabilities associated with the parameters of the general model proposed, showcasing how these vary throughout time and the main advantage of this generalization. Instead of having fixed matrices of transition probabilities, we allow for these to vary throughout time, depending on the values of \(spread_{t-1}\). Specifically, Figures \ref{fig:fig11} and \ref{fig:fig12} correspond to the non-homogeneous Markov chain to build the S\&P500's equation and Figures \ref{fig:fig21} and \ref{fig:fig22} correspond to the non-homogeneous Markov chain to build DJIA's equation. We see a similar behavior within each series regardless of whether it depends on the previous states of \(S_{1t}\) or \(S_{2t}\). Additionally, the scales of the graphs are small, indicating that these probabilities vary around the same set of values.
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.7\linewidth]{genmarkov_files/figure-latex/fig11-1}
+
+}
+
+\caption{Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{sp500,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilities will be extracted to maximize the log-likelihood function.}\label{fig:fig11}
+\end{figure}
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.7\linewidth]{genmarkov_files/figure-latex/fig12-1}
+
+}
+
+\caption{Estimated conditional probabilities of series 1 (SP500) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{sp500,t} | S_{djia, t-1}, spread_{t-1})$.
This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.}\label{fig:fig12} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{genmarkov_files/figure-latex/fig21-1} + +} + +\caption{Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 1 (SP500) previous state: $P(S_{djia,t} | S_{sp500, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.}\label{fig:fig21} +\end{figure} + +\begin{figure} + +{\centering \includegraphics[width=0.7\linewidth]{genmarkov_files/figure-latex/fig22-1} + +} + +\caption{Estimated conditional probabilities of series 2 (DJIA) depending on $spread_{t-1}$ and on series 2 (DJIA) previous state: $P(S_{djia,t} | S_{djia, t-1}, spread_{t-1})$. This figure shows the estimated non-homogeneous Markov chain from which the realized probabilites will be extracted to maximize the log-likelihood function.}\label{fig:fig22} +\end{figure} + +\newpage + +The model can be estimated through the \texttt{mmcx} function: + +\begin{verbatim} +attach(stockreturns) +res <- mmcx(cbind(sp500, djia), spread_1, initial=c(1,1)) +\end{verbatim} + +\begin{verbatim} +#> -------------------------------------------- +#> Equation 1 +#> Estimate Std. Error t value Pr(>|t|) +#> 1 0.685660 0.171358 4.001 0.000 *** +#> 2 0.314340 0.171358 1.834 0.067 * +#> +#> Log-Likelihood: -2636.355 +#> -------------------------------------------- +#> -------------------------------------------- +#> Equation 2 +#> Estimate Std. 
Error t value Pr(>|t|) +#> 1 0.629992 0.176327 3.573 0.000 *** +#> 2 0.370008 0.176327 2.098 0.036 ** +#> +#> Log-Likelihood: -2636.622 +#> -------------------------------------------- +\end{verbatim} + +Considering the first equation, the effect of the probabilities depending on S\&P500's previous state and the interest rate spread has a higher weight on the overall probability. Also, this estimate is highly significant, presenting a \(p\)-value close to zero. The effect of DJIA's previous state in S\&P500 is lower but it is also significant for a 10\% significance level. In the second equation, the effect of S\&P500's previous state is higher than DJIA's and both estimates are highly significant. + +One of the advantages of this approach is the possibility to assess the transition probabilities for specific values of \(x_t\), in this case, the interest rate spread. For both series, we calculated the transition probabilities for this variable's minimum and maximum value in the sample, which are -0.52 and 2.97, respectively. To obtain the probability transition matrices for these two cases, the code is the following: + +\begin{verbatim} +tpm_max <- MMC_tpm(cbind(sp500, djia), spread_1, + value = max(spread_1), result = res) + +tpm_min <- MMC_tpm(cbind(sp500, djia), spread_1, + value = min(spread_1), result = res) +\end{verbatim} + +\begin{verbatim} +library(markovchain) +plot(new('markovchain', transitionMatrix = tpm_max[,,1])) # Generate figure 9 +plot(new('markovchain', transitionMatrix = tpm_min[,,1])) # Generate figure 10 +plot(new('markovchain', transitionMatrix = tpm_max[,,2])) # Generate figure 11 +plot(new('markovchain', transitionMatrix = tpm_min[,,2])) # Generate figure 12 +\end{verbatim} + +In Figures \ref{fig:fig-sp500-min} and \ref{fig:fig-sp500-max}, we have the transition probabilities network for S\&P500, corresponding to the minimum and maximum value of the spread. 
The most noticeable difference between these two networks is regarding the transition probability from the second state to the third state. For the maximum value of \(spread_{t-1}\), the transition probability from the second state to the third state is 0.6. So, when the economy is strong, one might expect to have higher returns, when \(t-1\) was in the second state. However, this scenario shifts when considering the minimum value of \(spread_{t-1}\). The probability of obtaining higher returns, that is, being in state three, becomes almost evenly distributed, regardless of the state in \(t-1\). This indicates the instability of the stock market when the economy is weaker. Another difference in these networks is regarding the transition probability from the third state to the first state. For the maximum value of \(spread_{t-1}\), this probability is 0.27 and for the minimum value it increases to 0.44. This is also expected, since when the economy is weaker, the probability of having lower returns is greater.
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.6\linewidth]{genmarkov_files/figure-latex/fig-sp500-max-1}
+
+}
+
+\caption{Graphical representation of the transition probability matrix of Series 1: SP500 for the maximum value of spread$_{t-1}$. The highest probability of 0.6 refers to the transition from state 2 to state 3.}\label{fig:fig-sp500-max}
+\end{figure}
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.6\linewidth]{genmarkov_files/figure-latex/fig-sp500-min-1}
+
+}
+
+\caption{Graphical representation of the transition probability matrix of Series 1: SP500 for the minimum value of spread$_{t-1}$. The highest probability of 0.56 refers to the transition from state 2 to state 2.}\label{fig:fig-sp500-min}
+\end{figure}
+
+Considering the second equation (Figures \ref{fig:fig-djia-max} and \ref{fig:fig-djia-min}), corresponding to the DJIA's returns, we see a similar behavior as in S\&P500's networks.
The transition probability from the second state to the third state is higher for the maximum value of \(spread_{t-1}\) and the transition probability from the third state to the first state is higher when we consider the minimum value of \(spread_{t-1}\). However, the difference of this last probability between the minimum and maximum value of \(spread_{t-1}\) is not as big as in S\&P500. Overall, the rest of the probability structure remains the same.
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.6\linewidth]{genmarkov_files/figure-latex/fig-djia-max-1}
+
+}
+
+\caption{Graphical representation of the transition probability matrix of Series 2: DJIA for the maximum value of spread$_{t-1}$. The probability of 0.58 refers to the transition from state 2 to state 3.}\label{fig:fig-djia-max}
+\end{figure}
+
+\begin{figure}
+
+{\centering \includegraphics[width=0.6\linewidth]{genmarkov_files/figure-latex/fig-djia-min-1}
+
+}
+
+\caption{Graphical representation of the transition probability matrix of Series 2: DJIA for the minimum value of spread$_{t-1}$. The highest probability of 0.51 refers to the transition from state 2 to state 2.}\label{fig:fig-djia-min}
+\end{figure}
+
+\hypertarget{conclusions-limitations-and-further-research}{%
+\section{Conclusions, limitations and further research}\label{conclusions-limitations-and-further-research}}
+
+Several proposals for including exogenous variables in MMC models have been presented. The main limitations were associated with the high complexity of the models to be developed and estimated. Additionally, most models considered only categorical exogenous variables, with a lack of focus on continuous exogenous variables. This work proposes a new approach to include continuous exogenous variables in the Ching, Fung, and Ng (2002) model for multivariate Markov chains. This is relevant because it allows studying the effect of previous series and exogenous variables on the transition probabilities.
The model is based on Ching, Fung, and Ng (2002) MMC model but considers non-homogeneous Markov chains. Thus, the probabilities that compose the model are dependent on exogenous variables. These probabilities are estimated as a usual non-homogeneous Markov chain through a multinomial logit model. The model parameters are then estimated through MLE, as well as the standard errors. We developed a package with the estimation function of the model proposed. In this, we considered the Augmented Lagrangian optimization method for estimating the parameters through MLE. Additionally, we designed a Monte Carlo simulation study to assess this model's test power and dimension. The results showed that the model detected a non-homogeneous Markov chain. Moreover, an empirical illustration demonstrated the relevance of this new model by estimating the probability transition matrix for different exogenous variable values. Ignoring the effect of exogenous variables in MMC means that we would not detect the probabilities' changes according to the covariates' values. In this setting, one would have a limited view of the studied process. Hence, this approach allows us to understand how a specific variable influences a specific process. The main contributions of this work are the development of a package with functions for multivariate Markov chains, addressing the statistical inference in these models and the inclusion of covariates. The limitations are related to the implementation in R, specifically the optimization algorithm applied is not common for MMC models, in that sense, it would be beneficial to study new approaches to optimizing the maximum likelihood function as further research. Additionally, extending this generalization to the MTD-probit model proposed by Nicolau (2014) would also be relevant, which removes the constraints of the model's parameters and allows the model to detect negative effects. 
+ +\hypertarget{references}{% +\section*{References}\label{references}} +\addcontentsline{toc}{section}{References} + +\hypertarget{refs}{} +\begin{CSLReferences}{1}{0} +\leavevmode\vadjust pre{\hypertarget{ref-Adke1988}{}}% +Adke, S. R., and S. R. Deshmukh. 1988. {``{Limit Distribution of a High Order Markov Chain}.''} \emph{Journal of the Royal Statistical Society. Series B (Methodological)} 50 (1): 105--8. \url{https://www.jstor.org/stable/2345812}. + +\leavevmode\vadjust pre{\hypertarget{ref-gridextra}{}}% +Auguie, Baptiste. 2017. \emph{gridExtra: Miscellaneous Functions for "Grid" Graphics}. \url{https://CRAN.R-project.org/package=gridExtra}. + +\leavevmode\vadjust pre{\hypertarget{ref-Azzalini1994}{}}% +Azzalini, A. 1994. {``{Logistic regression for autocorrelated data with application to repeated measures}.''} \emph{Biometrika} 81 (4): 767--75. \url{https://doi.org/10.1093/biomet/81.4.767}. + +\leavevmode\vadjust pre{\hypertarget{ref-Bart1968}{}}% +Bartholomew, J. 1968. {``{Stochastic Models for Social Processes}.''} \emph{The Australian and New Zealand Journal of Sociology} 4 (2): 171--72. https://doi.org/\url{https://doi.org/10.1177/144078336800400215}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berchtold1995}{}}% +Berchtold, A. 1995. {``{Autoregressive Modelling of Markov Chains}.''} \emph{Proc. 10th International Workshop on Statistical Modelling} 104: 19--26. \url{https://doi.org/10.1007/978-1-4612-0789-4_3}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berchtold1996}{}}% +---------. 1996. {``{Modélisation autorégressive des chaines de Markov : utilisation d'une matrice différente pour chaque retard}.''} \emph{Revue de Statistique Appliquée} 44 (3): 5--25. \url{http://www.numdam.org/item/RSA_1996__44_3_5_0/}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berchtold2001}{}}% +---------. 2001. {``Estimation in the Mixture Transition Distribution Model.''} \emph{Journal of Time Series Analysis} 22 (4): 379--97. 
https://doi.org/\url{https://doi.org/10.1111/1467-9892.00231}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berchtold2003}{}}% +---------. 2003. {``{Mixture transition distribution (MTD) modeling of heteroscedastic time series}.''} \emph{Computational Statistics and Data Analysis} 41 (3-4): 399--411. \url{https://doi.org/10.1016/S0167-9473(02)00191-3}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berchtold2020}{}}% +Berchtold, A., O. Maitre, and K. Emery. 2020. {``{Optimization of the mixture transition distribution model using the march package for R}.''} \emph{Symmetry} 12 (12): 1--14. \url{https://doi.org/10.3390/sym12122031}. + +\leavevmode\vadjust pre{\hypertarget{ref-Bolano2020}{}}% +Bolano, D. 2020. {``{Handling covariates in markovian models with a mixture transition distribution based approach}.''} \emph{Symmetry} 12 (4). \url{https://doi.org/10.3390/SYM12040558}. + +\leavevmode\vadjust pre{\hypertarget{ref-pracma}{}}% +Borchers, Hans W. 2022. \emph{Pracma: Practical Numerical Math Functions}. \url{https://CRAN.R-project.org/package=pracma}. + +\leavevmode\vadjust pre{\hypertarget{ref-Chauvet2016}{}}% +Chauvet, M., and Z. Senyuz. 2016. {``A Dynamic Factor Model of the Yield Curve Components as a Predictor of the Economy.''} \emph{International Journal of Forecasting} 32 (2): 324--43. https://doi.org/\url{https://doi.org/10.1016/j.ijforecast.2015.05.007}. + +\leavevmode\vadjust pre{\hypertarget{ref-ChenLio2009}{}}% +Chen, D. G., and Y. L. Lio. 2009. {``{A Novel Estimation Approach for Mixture Transition Distribution Model in High-Order Markov Chains}.''} \emph{Communications in Statistics - Simulation and Computation} 38 (5): 990--1003. \url{https://doi.org/10.1080/03610910802715009}. + +\leavevmode\vadjust pre{\hypertarget{ref-Ching2002}{}}% +Ching, W. K., E. S. Fung, and M. K. Ng. 2002. 
{``A Multivariate Markov Chain Model for Categorical Data Sequences and Its Applications in Demand Predictions.''} \emph{IMA Journal of Management Mathematics} 13 (3): 187--99. \url{https://doi.org/10.1093/imaman/13.3.187}. + +\leavevmode\vadjust pre{\hypertarget{ref-Ching2003}{}}% +---------. 2003. {``A Higher-Order Markov Model for the Newsboy's Problem.''} \emph{The Journal of the Operational Research Society} 54 (3): 291--98. + +\leavevmode\vadjust pre{\hypertarget{ref-Ching2006}{}}% +Ching, W. K., and M. K. Ng. 2006. \emph{Markov Chains: Models, Algorithms and Applications}. Springer. \url{https://doi.org/10.1007/0-387-29337-X}. + +\leavevmode\vadjust pre{\hypertarget{ref-Ching2008}{}}% +Ching, W. K., M. K. Ng, and E. S. Fung. 2008. {``{Higher-order multivariate Markov chains and their applications}.''} \emph{Linear Algebra and Its Applications} 428 (2-3): 492--507. \url{https://doi.org/10.1016/j.laa.2007.05.021}. + +\leavevmode\vadjust pre{\hypertarget{ref-Damasio2013}{}}% +Damásio, B. 2013. {``{Multivariate Markov Chains - Estimation, Inference and Forecast. A New Approach: What If We Use Them As Stochastic Covariates?}''} Master dissertation, Universidade de Lisboa, Instituto Superior de Economia e Gestão. \url{http://hdl.handle.net/10400.5/6397}. + +\leavevmode\vadjust pre{\hypertarget{ref-Damasio2018}{}}% +---------. 2018. {``{Essays on Econometrics: Multivariate Markov Chains}.''} \{PhD\} dissertation, Universidade de Lisboa, Instituto Superior de Economia e Gestão. \url{https://www.repository.utl.pt/bitstream/10400.5/18128/1/TD-BD-2019.pdf}. + +\leavevmode\vadjust pre{\hypertarget{ref-Damasio2019}{}}% +Damásio, B., and S. Mendonça. 2019. {``{Modelling insurgent-incumbent dynamics: Vector autoregressions, multivariate Markov chains, and the nature of technological competition}.''} \emph{Applied Economics Letters} 26 (10): 843--49. \url{https://doi.org/10.1080/13504851.2018.1502863}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-DamasioM2020}{}}% +---------. 2020. {``Leader-Follower Dynamics in Real Historical Time: A Markovian Test of Non-Linear Causality Between Sail and Steam (Co-)development, Mimeo.''} + +\leavevmode\vadjust pre{\hypertarget{ref-DAMASIO2014}{}}% +Damásio, B., and J. Nicolau. 2014. {``{Combining a regression model with a multivariate Markov chain in a forecasting problem}.''} \emph{Statistics \& Probability Letters} 90: 108--13. https://doi.org/\url{https://doi.org/10.1016/j.spl.2014.03.026}. + +\leavevmode\vadjust pre{\hypertarget{ref-Damasio2020}{}}% +---------. 2020. {``{Time inhomogeneous multivariate Markov chains : detecting and testing multiple structural breaks occurring at unknown dates}.''} REM Working Papers 0136--2020. REM Working Papers. Instituto Superior de Economia e Gestão. \url{http://hdl.handle.net/10400.5/20164}. + +\leavevmode\vadjust pre{\hypertarget{ref-Dombrosky1996}{}}% +Dombrosky, A. M., and J. Haubrich. 1996. {``Predicting Real Growth Using the Yield Curve.''} \emph{Economic Review} I (Q): 26--35. \url{https://EconPapers.repec.org/RePEc:fip:fedcer:y:1996:i:qi:p:26-35}. + +\leavevmode\vadjust pre{\hypertarget{ref-Estrella1996}{}}% +Estrella, A., and F. S. Mishkin. 1996. {``{The yield curve as a predictor of U.S. recessions}.''} \emph{Current Issues in Economics and Finance} 2 (Jun). \url{https://www.newyorkfed.org/research/current_issues/ci2-7.html}. + +\leavevmode\vadjust pre{\hypertarget{ref-maxLik}{}}% +Henningsen, A., and O. Toomet. 2011. {``maxLik: A Package for Maximum Likelihood Estimation in {R}.''} \emph{Computational Statistics} 26 (3): 443--58. \url{https://doi.org/10.1007/s00180-010-0217-1}. + +\leavevmode\vadjust pre{\hypertarget{ref-Islam2004}{}}% +Islam, M. A., S. Arabia, and R. I. Chowdhury. 2004. {``{A Three State Markov Model for Analyzing Covariate Dependence}.''} \emph{International Journal of Statistical Sciences} 3 (i): 241--49. 
\url{http://www.ru.ac.bd/stat/wp-content/uploads/sites/25/2019/01/P21.V3s.pdf}. + +\leavevmode\vadjust pre{\hypertarget{ref-IslamAtaharul2006}{}}% +Islam, M. A., and R. I. Chowdhury. 2006. {``{A higher order Markov model for analyzing covariate dependence}.''} \emph{Applied Mathematical Modelling} 30 (6): 477--88. \url{https://doi.org/10.1016/j.apm.2005.05.006}. + +\leavevmode\vadjust pre{\hypertarget{ref-jackson2011multi}{}}% +Jackson, Christopher. 2011. {``Multi-State Models for Panel Data: The Msm Package for r.''} \emph{Journal of Statistical Software} 38: 1--28. \url{https://doi.org/10.18637/jss.v038.i0810.18637/jss.v038.i08}. + +\leavevmode\vadjust pre{\hypertarget{ref-JacobLewis1978}{}}% +Jacobs, P. A., and A. W. Lewis. 1978. {``{Discrete Time Series Generated by Mixtures II : Asymptotic Properties}.''} \emph{Journal of the Royal Statistical Society: Series B (Methodological)} 40 (2): 222--28. \url{https://www.jstor.org/stable/2984759}. + +\leavevmode\vadjust pre{\hypertarget{ref-Kalbfleisch1985}{}}% +Kalbfleisch, J. D., and J. F. Lawless. 1985. {``{The analysis of panel data under a Markov assumption}.''} \emph{Journal of the American Statistical Association} 80 (392): 863--71. \url{https://doi.org/10.1080/01621459.1985.10478195}. + +\leavevmode\vadjust pre{\hypertarget{ref-Kijima2002}{}}% +Kijima, M., K. Komoribayashi, and E. Suzuki. 2002. {``{A multivariate Markov model for simulating correlated defaults}.''} \emph{Journal of Risk} 4 (July). \url{https://doi.org/10.21314/JOR.2002.066}. + +\leavevmode\vadjust pre{\hypertarget{ref-Le1996}{}}% +Le, N. D., R. D. Martin, and A. Raftery. 1996. {``{Modeling Flat Stretches, Brusts, and Outliers in Time Series Using Mixture Transition Distribution Models}.''} \emph{Journal of the American Statistical Association} 91 (436): 1504--15. \url{https://doi.org/10.1111/j.2517-6161.1985.tb01383.x}. + +\leavevmode\vadjust pre{\hypertarget{ref-Lebre2008}{}}% +Lèbre, S., and P. Y. Bourguignon. 2008. 
{``{An EM algorithm for estimation in the mixture transition distribution model}.''} \emph{Journal of Statistical Computation and Simulation} 78 (8): 713--29. \url{https://doi.org/10.1080/00949650701266666}. + +\leavevmode\vadjust pre{\hypertarget{ref-Logan1981}{}}% +Logan, J. 1981. {``{A structural model of the higher‐order Markov process incorporating reversion effects}.''} \emph{The Journal of Mathematical Sociology} 8 (1): 75--89. \url{https://doi.org/10.1080/0022250X.1981.9989916}. + +\leavevmode\vadjust pre{\hypertarget{ref-march}{}}% +Maitre, O., and K. Emery. 2020. \emph{March: Markov Chains}. \url{https://CRAN.R-project.org/package=march}. + +\leavevmode\vadjust pre{\hypertarget{ref-Martin1987}{}}% +Martin, R. D., and A. Raftery. 1987. {``{Non-Gaussian State-Space Modeling of Nonstationary Time Series: Comment: Robustness, Computation, and Non-Euclidean Models}.''} \emph{Journal of the American Statistical Association} 82 (400): 1044--50. \url{https://doi.org/10.2307/2289377}. + +\leavevmode\vadjust pre{\hypertarget{ref-McMillan2021}{}}% +McMillan, D. G. 2021. {``Predicting GDP Growth with Stock and Bond Markets: Do They Contain Different Information?''} \emph{International Journal of Finance \& Economics} 26 (3): 3651--75. https://doi.org/\url{https://doi.org/10.1002/ijfe.1980}. + +\leavevmode\vadjust pre{\hypertarget{ref-Mehran1989}{}}% +Mehran, F. 1989. {``{Analysis of Discrete Longitudinal Data: Infinite-Lag Markov Models}.''} In \emph{Statistical Data Analysis and Inference}, 533--41. Amsterdam: North-Holland. https://doi.org/\url{https://doi.org/10.1016/B978-0-444-88029-1.50053-8}. + +\leavevmode\vadjust pre{\hypertarget{ref-Muenz1985}{}}% +Muenz, L. R., and L. V. Rubinstein. 1985. {``{Markov Models for Covariate Dependence of Binary Sequences }.''} \emph{Biometrics} 41 (1): 91--101. \url{http://www.jstor.org/stable/2530646}. + +\leavevmode\vadjust pre{\hypertarget{ref-DTMCPack}{}}% +Nicholson, W. 2013. 
\emph{DTMCPack: Suite of Functions Related to Discrete-Time Discrete-State Markov Chains}. \url{https://CRAN.R-project.org/package=DTMCPack}. + +\leavevmode\vadjust pre{\hypertarget{ref-Nicolau2014}{}}% +Nicolau, J. 2014. {``{A new model for multivariate markov chains}.''} \emph{Scandinavian Journal of Statistics} 41 (4): 1124--35. \url{https://doi.org/10.1111/sjos.12087}. + +\leavevmode\vadjust pre{\hypertarget{ref-Nicolau_2014}{}}% +Nicolau, J., and F. I. Riedlinger. 2014. {``{Estimation and inference in multivariate Markov chains}.''} \emph{Statistical Papers} 56 (4): 1163--73. \url{https://doi.org/10.1007/s00362-014-0630-6}. + +\leavevmode\vadjust pre{\hypertarget{ref-Pegram1980}{}}% +Pegram, G. 1980. {``{An Autoregressive Model for Multilag Markov Chains}.''} \emph{Journal of Applied Probability} 17 (2): 350--62. \url{https://doi.org/10.2307/3213025}. + +\leavevmode\vadjust pre{\hypertarget{ref-Raftery1985}{}}% +Raftery, A. 1985. {``{A Model for High-Order Markov Chains}.''} \emph{Journal of the Royal Statistical Society: Series B (Methodological)} 47 (3): 528--39. \url{https://doi.org/10.1111/j.2517-6161.1985.tb01383.x}. + +\leavevmode\vadjust pre{\hypertarget{ref-Tavare1994}{}}% +Raftery, A., and S. Tavaré. 1994. {``{Estimation and Modelling Repeated Patterns in High Order Markov Chains with the Mixture Transition Distribution Model}.''} \emph{Applied Statistics} 43 (1): 179--99. \url{https://doi.org/10.2307/2986120}. + +\leavevmode\vadjust pre{\hypertarget{ref-Rajarshi2013}{}}% +Rajarshi, M. B. 2013. \emph{Statistical Inference for Discrete Time Stochastic Processes}. {SpringerBriefs in Statistics}. \url{http://www.springer.com/978-81-322-0762-7}. + +\leavevmode\vadjust pre{\hypertarget{ref-Regier1968}{}}% +Regier, M. H. 1968. {``{A Two-State Markov Model for Behavioral Change}.''} \emph{Journal of the American Statistical Association} 63 (323): 993--99. \url{https://doi.org/10.1080/01621459.1968.11009325}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-Siu2005}{}}% +Siu, T. K., W. K. Ching, E. S. Fung, and M. K. Ng. 2005. {``{On a multivariate Markov chain model for credit risk measurement}.''} \emph{Quantitative Finance} 5 (6): 543--56. \url{https://doi.org/10.1080/14697680500383714}. + +\leavevmode\vadjust pre{\hypertarget{ref-markovchains}{}}% +Spedicato, G. A. 2017. {``Discrete Time Markov Chains with r.''} \emph{The R Journal}, July. \url{https://journal.r-project.org/archive/2017/RJ-2017-036/index.html}. + +\leavevmode\vadjust pre{\hypertarget{ref-Spilerman1976}{}}% +Spilerman, S., and B. Singer. 1976. {``{The Representation of Social Processes by Markov Models}.''} \emph{American Journal of Sociology} 82 (1): 1--54. \url{https://www.jstor.org/stable/2777460}. + +\leavevmode\vadjust pre{\hypertarget{ref-Tian2019}{}}% +Tian, R., and G. Shen. 2019. {``Predictive Power of Markovian Models: Evidence from US Recession Forecasting.''} \emph{Journal of Forecasting} 38 (6): 525--51. https://doi.org/\url{https://doi.org/10.1002/for.2579}. + +\leavevmode\vadjust pre{\hypertarget{ref-alabama}{}}% +Varadhan, R. 2015. \emph{Alabama: Constrained Nonlinear Optimization}. \url{https://CRAN.R-project.org/package=alabama}. + +\leavevmode\vadjust pre{\hypertarget{ref-nnet}{}}% +Venables, W. N., and B. D. Ripley. 2002. \emph{Modern Applied Statistics with s}. Fourth. New York: Springer. \url{https://www.stats.ox.ac.uk/pub/MASS4/}. + +\leavevmode\vadjust pre{\hypertarget{ref-Wang2014}{}}% +Wang, C., T. Z. Huang, and W. K. Ching. 2014. {``{A new multivariate Markov chain model for adding a new categorical data sequence}.''} \emph{Mathematical Problems in Engineering} 2014. \url{https://doi.org/10.1155/2014/502808}. + +\leavevmode\vadjust pre{\hypertarget{ref-Wasserman1980}{}}% +Wasserman, S. 1980. {``{Analyzing social networks as stochastic processes}.''} \emph{Journal of the American Statistical Association} 75 (370): 280--94. \url{https://doi.org/10.1080/01621459.1980.10477465}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-ggplot2}{}}% +Wickham, Hadley. 2016. \emph{Ggplot2: Elegant Graphics for Data Analysis}. Springer-Verlag New York. \url{https://ggplot2.tidyverse.org}. + +\leavevmode\vadjust pre{\hypertarget{ref-Wong2001}{}}% +Wong, C. S., and W. K. Li. 2001. {``{On a mixture autoregressive conditional heteroscedastic model}.''} \emph{Journal of the American Statistical Association} 96 (455): 982--95. \url{https://doi.org/10.1198/016214501753208645}. + +\leavevmode\vadjust pre{\hypertarget{ref-Zhang2006}{}}% +Zhang, X., M. L. King, and R. J. Hyndman. 2006. {``{A Bayesian approach to bandwidth selection for multivariate kernel density estimation}.''} \emph{Computational Statistics and Data Analysis} 50 (11): 3009--31. \url{https://doi.org/10.1016/j.csda.2005.06.019}. + +\leavevmode\vadjust pre{\hypertarget{ref-Zhu2010}{}}% +Zhu, D. M., and W. K. Ching. 2010. {``{A new estimation method for multivariate Markov chain model with application in demand predictions}.''} \emph{Proceedings - 3rd International Conference on Business Intelligence and Financial Engineering, BIFE 2010}, 126--30. \url{https://doi.org/10.1109/BIFE.2010.39}. 
+ +\end{CSLReferences} + + +\address{% +Carolina Vasconcelos\\ +NOVA Information Management School (NOVA IMS)\\% +Campus de Campolide, 1070-312 Lisboa, Portugal\\ +% +% +% +\href{mailto:cvasconcelos@novaims.unl.pt}{\nolinkurl{cvasconcelos@novaims.unl.pt}}% +} + +\address{% +Bruno Damásio\\ +NOVA Information Management School (NOVA IMS)\\% +Campus de Campolide, 1070-312 Lisboa, Portugal\\ +% +% +% +\href{mailto:bdamasio@novaims.unl.pt}{\nolinkurl{bdamasio@novaims.unl.pt}}% +} diff --git a/_articles/RJ-2024-006/motivation-letter.html b/_articles/RJ-2024-006/motivation-letter.html new file mode 100644 index 0000000000..a020894362 --- /dev/null +++ b/_articles/RJ-2024-006/motivation-letter.html @@ -0,0 +1,1692 @@ + + + + + + + + + + + + + +motivation-letter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + +

    +

    Editor
    +The R Journal
    +

    +

    Dear Professor van der Loo and Professor Cook,

    +

    Please consider our revised article titled “GenMarkov: Modeling +Generalized Multivariate Markov Chains in R” for publication in the R +Journal.

    +

    We have made all the minor revisions requested. In addition, given +that the point-by-point answer was apparently not received, we sent the +responses to the referees corresponding to the first round of revisions +once more.

    +

    We believe the article is suitable for publication.

    +

    Regards,

    +

    Carolina Vasconcelos
    +NOVA Information Management School Universidade Nova de Lisboa Lisbon, +Portugal

    +

    Bruno Damásio NOVA Information Management School Universidade Nova +de Lisboa Lisbon, Portugal

    + + + + +
    + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-006/motivation-letter.md b/_articles/RJ-2024-006/motivation-letter.md new file mode 100644 index 0000000000..6ad934ad51 --- /dev/null +++ b/_articles/RJ-2024-006/motivation-letter.md @@ -0,0 +1,45 @@ +--- +output: + html_document: + df_print: paged +fontsize: 12pt +--- + +\thispagestyle{empty} +\today + +Editor +The R Journal +\bigskip + +Dear Professor van der Loo and Professor Cook, +\bigskip + +Please consider our revised article titled "GenMarkov: Modeling Generalized Multivariate Markov Chains in R" for publication in the R Journal. + +We have made all the minor revisions requested. In addition, given that the point-by-point answer was apparently not received, we sent the responses to the referees corresponding to the first round of revisions once more. + +We believe the article is suitable for publication. + +\bigskip +\bigskip + +Regards, + + + + +Carolina Vasconcelos +NOVA Information Management School +Universidade Nova de Lisboa +Lisbon, Portugal +cvasconcelos@novaims.unl.pt + +Bruno DamC!sio +NOVA Information Management School +Universidade Nova de Lisboa +Lisbon, Portugal +bdamasio@novaims.unl.pt + +\bigskip + diff --git a/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.tar.gz b/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.tar.gz new file mode 100644 index 0000000000..2755f65a8c Binary files /dev/null and b/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.tar.gz differ diff --git a/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.zip b/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.zip new file mode 100644 index 0000000000..9e1bc699f5 Binary files /dev/null and b/_articles/RJ-2024-006/supplementary_materials/GenMarkov_0.2.0.zip differ diff --git a/_articles/RJ-2024-006/supplementary_materials/proofs.pdf b/_articles/RJ-2024-006/supplementary_materials/proofs.pdf new file mode 100644 index 0000000000..63eed41c1a Binary 
files /dev/null and b/_articles/RJ-2024-006/supplementary_materials/proofs.pdf differ diff --git a/_articles/RJ-2024-007/2022-185_R3.bib b/_articles/RJ-2024-007/2022-185_R3.bib new file mode 100644 index 0000000000..e534a07d32 --- /dev/null +++ b/_articles/RJ-2024-007/2022-185_R3.bib @@ -0,0 +1,799 @@ +@preamble{ " \newcommand{\noop}[1]{} " } % a do-nothing command that serves a purpose + +@article{choi2018smoothed, + title={Smoothed quantile regression analysis of competing risks}, + author={Choi, Sangbum and Kang, Sangwook and Huang, Xuelin}, + journal={Biometrical Journal}, + volume={60}, + number={5}, + pages={934--946}, + year={2018}, + url={https://doi.org/10.1002/bimj.201700104}, + publisher={Wiley Online Library} +} + +@article{fan2018quantile, + title={Quantile regression for competing risks analysis under case-cohort design}, + author={Fan, Caiyun and Ma, Huijuan and Zhou, Yong}, + journal={Journal of Statistical Computation and Simulation}, + volume={88}, + number={6}, + pages={1060--1080}, + year={2018}, + url={https://doi.org/10.1080/00949655.2017.1419352}, + publisher={Taylor \& Francis} +} + +@article{jung2009regression, + title={Regression on quantile residual life}, + author={Jung, Sin-Ho and Jeong, Jong-Hyeon and Bandos, Hanna}, + journal={Biometrics}, + volume= 65, + number= 4, + pages={1203--1212}, + year=2009, + url={https://doi.org/10.1111/j.1541-0420.2009.01196.x}, + publisher={Wiley Online Library} +} + +@article{kim2012censored, + title={Censored quantile regression for residual lifetimes}, + author={Kim, Mi-Ok and Zhou, Mai and Jeong, Jong-Hyeon}, + journal={Lifetime Data Analysis}, + volume=18, + number=2, + pages={177--194}, + year=2012, + url={https://doi.org/10.1007/s10985-011-9212-2}, + publisher={Springer} +} + +@article{pang2012variance, + title={Variance estimation in censored quantile regression via induced smoothing}, + author={Pang, Lei and Lu, Wenbin and Wang, Huixia Judy}, + journal={Computational Statistics \& Data Analysis}, 
+ volume=56, + number=4, + pages={785--796}, + year=2012, + url={https://doi.org/10.1016/j.csda.2010.10.018}, + publisher={Elsevier} +} + +@article{peng2009competing, + title={Competing risks quantile regression}, + author={Peng, Limin and Fine, Jason P}, + journal={Journal of the American Statistical Association}, + volume=104, + number=488, + pages={1440--1453}, + year=2009, + url={https://doi.org/10.1198/jasa.2009.tm08228}, + publisher={Taylor \& Francis} +} + +@article{peng2008survival, + title={Survival analysis with quantile regression models}, + author={Peng, Limin and Huang, Yijian}, + journal={Journal of the American Statistical Association}, + volume={103}, + number={482}, + pages={637--649}, + year={2008}, + url={https://doi.org/10.1198/016214508000000355}, + publisher={Taylor \& Francis} +} + +@article{chiou2015semiparametric, + title={Semiparametric accelerated failure time modeling for clustered failure times from stratified sampling}, + author={Chiou, Sy Han and Kang, Sangwook and Yan, Jun}, + journal={Journal of the American Statistical Association}, + volume=110, + number=510, + pages={621--629}, + year=2015, + url={https://doi.org/10.1080/01621459.2014.917978}, + publisher={Taylor \& Francis} +} + +@article{brown2007induced, + title={Induced smoothing for rank regression with censored survival times}, + author={Brown, BM and Wang, You-Gan}, + journal={Statistics in Medicine}, + volume=26, + number=4, + pages={828--836}, + year=2007, + url={https://doi.org/10.1002/sim.2576}, + publisher={Wiley Online Library} +} + +@article{koenker1978regression, + title={Regression quantiles}, + author={Koenker, Roger and Bassett Jr, Gilbert}, + journal={Econometrica: Journal of the Econometric Society}, + pages={33--50}, + year=1978, + url={https://doi.org/10.2307/1913643}, + publisher={JSTOR} +} + +@book{fleming2011counting, + title={Counting Processes and Survival Analysis}, + author={Fleming, Thomas R and Harrington, David P}, + volume=169, + year=2011, + 
publisher={John Wiley \& Sons} +} + +@article{caplan2019dental, + title={Dental restoration longevity among geriatric and special needs patients}, + author={Caplan, DJ and Li, Y and Wang, W and Kang, S and Marchini, L and Cowen, HJ and Yan, J}, + journal={JDR Clinical \& Translational Research}, + volume={4}, + number={1}, + pages={41--48}, + year={2019}, + url={https://journals.sagepub.com/doi/pdf/10.1177/2380084418799083}, + publisher={SAGE Publications Sage CA: Los Angeles, CA} +} + +@Manual{quantregpackage, + title = {quantreg: Quantile regression}, + author = {Roger Koenker}, + year = {2022}, + note = {R package version 5.87}, + url = {https://CRAN.R-project.org/package=quantreg} +} + +@Manual{R:qris, + title = {qris: Quantile regression model for residual lifetime using an induced smoothing approach}, + author = {Kyu Hyun Kim and Sangwook Kang and Sy Han Chiou}, + year = {2022}, + note = {R package version 1.0.0}, + url = {https://CRAN.R-project.org/package=qris} +} + +@article{li2016quantile, + title={Quantile residual life regression with longitudinal biomarker measurements for dynamic prediction}, + author={Li, Ruosha and Huang, Xuelin and Cortes, Jorge E}, + journal={Journal of the Royal Statistical Society. 
Series C: Applied Statistics}, + volume={65}, + number={5}, + pages={755--773}, + year={2016}, + url={http://www.jstor.org/stable/44681854}, + publisher={Wiley-Blackwell} +} + +@article{chiou2015rank, + title={Rank-based estimating equations with general weight for accelerated failure time models: {A}n induced smoothing approach}, + author={Chiou, S and Kang, Sangwook and Yan, J}, + journal={Statistics in Medicine}, + volume={34}, + number={9}, + pages={1495--1510}, + year={2015}, + url={https://doi.org/10.1002/sim.6415}, + publisher={Wiley Online Library} +} + +@article{zeng2008efficient, + title={Efficient resampling methods for nonsmooth estimating functions}, + author={Zeng, Donglin and Lin, DY}, + journal={Biostatistics}, + volume={9}, + number={2}, + pages={355--363}, + year={2008}, + url={https://doi.org/10.1093/biostatistics/kxm034}, + publisher={Oxford University Press} +} + +@article{cox1972regression, + title={Regression Models and Life-Tables}, + author={Cox, David R}, + journal={Journal of the Royal Statistical Society: Series B (Methodological)}, + volume={34}, + number={2}, + pages={187--202}, + year={1972}, + url={https://doi.org/10.1111/j.2517-6161.1972.tb00899.x}, + publisher={Wiley Online Library} +} + +@article{ying1995survival, + title={Survival Analysis with Median Regression Models}, + author={Ying, Zhiliang and Jung, Sin-Ho and Wei, Lee-Jen}, + journal={Journal of the American Statistical Association}, + volume={90}, + number={429}, + pages={178--184}, + year={1995}, + url={https://doi.org/10.1080/01621459.1995.10476500}, + publisher={Taylor \& Francis} +} + +@article{portnoy2003censored, + title={Censored regression quantiles}, + author={Portnoy, Stephen}, + journal={Journal of the American Statistical Association}, + volume={98}, + number={464}, + pages={1001--1012}, + year={2003}, + url={https://doi.org/10.1198/016214503000000954}, + publisher={Taylor \& Francis} +} + +@article{oakes2003inference, + title={Inference for the proportional 
mean residual life model}, + author={Oakes, David and Dasu, Tamraparni}, + journal={Lecture Notes-Monograph Series}, + pages={105--116}, + year={2003}, + url={http://www.jstor.org/stable/4356266}, + publisher={JSTOR} +} + +@article{chen2005semiparametric, + title={Semiparametric estimation of proportional mean residual life model in presence of censoring}, + author={Chen, YQ and Jewell, NP and Lei, X and Cheng, SC}, + journal={Biometrics}, + volume={61}, + number={1}, + pages={170--178}, + year={2005}, + url={https://doi.org/10.1111/j.0006-341X.2005.030224.x}, + publisher={Wiley Online Library} +} + +@article{maguluri1994estimation, + title={Estimation in the mean residual life regression model}, + author={Maguluri, Gangaji and Zhang, Cun-Hui}, + journal={Journal of the Royal Statistical Society: Series B (Methodological)}, + volume={56}, + number={3}, + pages={477--489}, + year={1994}, + url={https://doi.org/10.1111/j.2517-6161.1994.tb01994.x}, + publisher={Wiley Online Library} +} + +@article{oakes1990note, + title={A note on residual life}, + author={Oakes, David and Dasu, Tamraparni}, + journal={Biometrika}, + volume={77}, + number={2}, + pages={409--410}, + year={1990}, + url={https://doi.org/10.1093/biomet/77.2.409}, + publisher={Oxford University Press} +} + +@article{chen2006linear, + title={Linear life expectancy regression with censored data}, + author={Chen, Ying Qing and Cheng, Seu}, + journal={Biometrika}, + volume={93}, + number={2}, + pages={303--313}, + year={2006}, + url={https://doi.org/10.1093/biomet/93.2.303}, + publisher={Oxford University Press} +} + +@article{chen2007additive, + title={Additive Expectancy Regression}, + author={Chen, Ying Qing}, + journal={Journal of the American Statistical Association}, + volume={102}, + number={477}, + pages={153--166}, + year={2007}, + url={https://doi.org/10.1198/016214506000000870}, + publisher={Taylor \& Francis} +} + +@article{zhang2010goodness, + title={Goodness-of-fit tests for additive mean 
residual life model under right censoring}, + author={Zhang, Zhigang and Zhao, Xingqiu and Sun, Liuquan}, + journal={Lifetime Data Analysis}, + volume={16}, + number={3}, + pages={385--408}, + year={2010}, + url={https://doi.org/10.1007/s10985-010-9152-2}, + publisher={Springer} +} + +@techreport{liu2008regression, + title={Regression analysis of mean residual life function}, + author={Liu, Shufang and Ghosh, Sujit K}, + year={2008}, + institution={North Carolina State University. Dept. of Statistics}, + url = {https://repository.lib.ncsu.edu/bitstream/handle/1840.4/3041/mimeo2613.pdf?sequence=1} +} + +@article{sun2009class, + title={A class of transformed mean residual life models With Censored survival data}, + author={Sun, Liuquan and Zhang, Zhigang}, + journal={Journal of the American Statistical Association}, + volume={104}, + number={486}, + pages={803--815}, + year={2009}, + url={https://doi.org/10.1198/jasa.2009.0130}, + publisher={Taylor \& Francis} +} + +@article{sun2012mean, + title={Mean residual life models with time-dependent coefficients under right censoring}, + author={Sun, Liuquan and Song, Xinyuan and Zhang, Zhigang}, + journal={Biometrika}, + volume={99}, + number={1}, + pages={185--197}, + year={2012}, + url={https://doi.org/10.1093/biomet/asr065}, + publisher={Oxford University Press} +} + +@article{jung1996quasi, + title={Quasi-Likelihood for median regression models}, + author={Jung, Sin-Ho}, + journal={Journal of the American Statistical Association}, + volume={91}, + number={433}, + pages={251--257}, + year={1996}, + url={https://doi.org/10.1080/01621459.1996.10476683}, + publisher={Taylor \& Francis Group} +} + +@article{portnoy1997gaussian, + title={The Gaussian hare and the Laplacian tortoise: computability of squared-error versus absolute-error estimators}, + author={Portnoy, Stephen and Koenker, Roger}, + journal={Statistical Science}, + volume={12}, + number={4}, + pages={279--300}, + year={1997}, + 
url={https://doi.org/10.1214/ss/1030037960}, + publisher={Institute of Mathematical Statistics} +} + +@article{wei2006quantile, + title={Quantile regression methods for reference growth charts}, + author={Wei, Ying and Pere, Anneli and Koenker, Roger and He, Xuming}, + journal={Statistics in Medicine}, + volume={25}, + number={8}, + pages={1369--1382}, + year={2006}, + url={https://doi.org/10.1002/sim.2271}, + publisher={Wiley Online Library} +} + +@article{whang2006smoothed, + title={Smoothed empirical likelihood methods for quantile regression models}, + author={Whang, Yoon-Jae}, + journal={Econometric Theory}, + pages={173--205}, + year={2006}, + doi={10.1017/S0266466606060087}, + publisher={JSTOR} +} + +@article{gelfand2003bayesian, + title={Bayesian semiparametric regression for median residual life}, + author={Gelfand, Alan E and Kottas, Athanasios}, + journal={Scandinavian Journal of Statistics}, + volume={30}, + number={4}, + pages={651--665}, + year={2003}, + url={https://doi.org/10.1111/1467-9469.00356}, + publisher={Wiley Online Library} +} + +@article{wang2009locally, + title={Locally weighted censored quantile regression}, + author={Wang, Huixia Judy and Wang, Lan}, + journal={Journal of the American Statistical Association}, + volume={104}, + number={487}, + pages={1117--1128}, + year={2009}, + url={https://doi.org/10.1198/jasa.2009.tm08230}, + publisher={Taylor \& Francis} +} + +@article{huang2010quantile, + title={Quantile calculus and censored regression}, + author={Huang, Yijian}, + journal={Annals of Statistics}, + volume={38}, + number={3}, + pages={1607}, + year={2010}, + doi={10.1214/09-AOS771}, + publisher={NIH Public Access} +} + +@article{portnoy2010asymptotics, + title={Asymptotics for censored regression quantiles}, + author={Portnoy, Stephen and Lin, Guixian}, + journal={Journal of Nonparametric Statistics}, + volume={22}, + number={1}, + pages={115--130}, + year={2010}, + url={https://doi.org/10.1080/10485250903105009}, + 
publisher={Taylor \& Francis} +} + +@article{johnson2009induced, + title={Induced smoothing for the semiparametric accelerated failure time model: {A}symptotics and extensions to clustered data}, + author={Johnson, Lynn M and Strawderman, Robert L}, + journal={Biometrika}, + volume={96}, + number={3}, + pages={577--590}, + year={2009}, + url={https://doi.org/10.1093/biomet/asp025}, + publisher={Oxford University Press} +} + +@article{fu2010rank, + title={Rank regression for analysis of clustered data: {A} natural induced smoothing approach}, + author={Fu, Liya and Wang, You-Gan and Bai, Zhidong}, + journal={Computational Statistics \& Data Analysis}, + volume={54}, + number={4}, + pages={1036--1050}, + year={2010}, + url={https://doi.org/10.1016/j.csda.2009.10.015}, + publisher={Elsevier} +} + +@article{chiou2014fast, + title={Fast accelerated failure time modeling for case-cohort data}, + author={Chiou, Sy Han and Kang, Sangwook and Yan, Jun}, + journal={Statistics and Computing}, + volume={24}, + number={4}, + pages={559--568}, + year={2014}, + url={https://doi.org/10.1007/s11222-013-9388-2}, + publisher={Springer} +} + +@Manual{aftgeepackage, + title = {aftgee: Accelerated failure time model with generalized estimating equations}, + author = {Sy Han Chiou and Sangwook Kang and Jun Yan}, + year = {2021}, + note = {R package version 1.1.6}, + url = {https://CRAN.R-project.org/package=aftgee} +} + +@article{bang2002median, + title={Median regression with censored cost data}, + author={Bang, Heejung and Tsiatis, Anastasios A}, + journal={Biometrics}, + volume={58}, + number={3}, + pages={643--649}, + year={2002}, + url={https://doi.org/10.1111/j.0006-341X.2002.00643.x}, + publisher={Wiley Online Library} +} + +@Manual{r2021, + title = {R: {A} language and environment for statistical computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2021}, + url = {https://www.R-project.org/} + 
} + +@incollection{sit2017survival, + title={Survival analysis: {A} quantile perspective}, + author={Ying, Zhiliang and Sit, Tony}, + booktitle={Handbook of Quantile Regression}, + pages={89--108}, + year={2017}, + url={}, + publisher={Chapman and Hall/CRC} +} + +@article{brown2005standard, + author={Brown, BM and Wang, You-Gan}, + title={Standard errors and covariance matrices for smoothed rank estimators}, + journal={Biometrika}, + volume={92}, + number={1}, + pages={149--158}, + year={2005}, + url={https://doi.org/10.1093/biomet/92.1.149.}, + publisher={Oxford University Press} +} + +@article{kassebaum2015global, + title={Global burden of untreated caries: {A} systematic review and metaregression}, + author={Kassebaum, NJ and Bernab{\'e}, E and Dahiya, M and Bhandari, B and Murray, CJL and Marcenes, W}, + journal={Journal of Dental Research}, + volume={94}, + number={5}, + pages={650--658}, + year={2015}, + doi={10.1177/0022034515573272}, + publisher={SAGE Publications Sage CA: Los Angeles, CA} +} + +@article{lin2000fitting, + title={On fitting {C}ox's proportional hazards models to survey data}, + author={Lin, DY}, + journal={Biometrika}, + volume={87}, + number={1}, + pages={37--47}, + year={2000}, + url={https://doi.org/10.1093/biomet/87.1.37}, + publisher={Oxford University Press} +} + +@article{Kang:fitt:2016, +title={Fitting semiparametric accelerated failure time models for nested case–control data}, + author={Kang, Sangwook}, + journal={Journal of Statistical Computation and Simulation}, + volume={87}, + number={4}, + pages={652--663}, + year={2017}, + url={https://doi.org/10.1080/00949655.2016.1222611}, + publisher={Taylor \& Francis} +} + +@article{ma2010semiparametric, + title={Semiparametric median residual life model and inference}, + author={Ma, Yanyuan and Yin, Guosheng}, + journal={The Canadian Journal of Statistics}, + volume={38}, + number={4}, + pages={665--679}, + year={2010}, + url={https://doi.org/10.1002/cjs.10076}, + publisher={Wiley 
Online Library} +} + +@article{zhang2015smoothed, + title={Smoothed estimator of quantile residual lifetime for right censored data}, + author={Zhang, Li and Liu, Peng and Zhou, Yong}, + journal={Journal of Systems Science and Complexity}, + volume={28}, + number={6}, + pages={1374--1388}, + year={2015}, + url={https://doi.org/10.1007/s11424-015-3067-7}, + publisher={Springer} +} + +@Manual{Brqpackage, + title = {Brq: Bayesian Analysis of Quantile Regression Models}, + author = {Rahim Alhamzawi}, + year = {2020}, + note = {R package version 3.0}, + url = {https://CRAN.R-project.org/package=Brq} +} + +@Manual{cmprskQRpackage, + title = {cmprskQR: Analysis of competing risks using quantile regressions}, + author = {Stephan Dlugosz and Limin Peng and Ruosha Li and Shuolin Shi}, + year = {2019}, + note = {R package version 0.9.2}, + url = {https://CRAN.R-project.org/package=cmprskQR} +} + +@article{loprinzi1994prospective, + title={Prospective evaluation of prognostic variables from patient-completed questionnaires. 
{N}orth {C}entral {C}ancer {T}reatment {G}roup.}, + author={Loprinzi, Charles Lawrence and Laurie, John A and Wieand, H Sam and Krook, James E and Novotny, Paul J and Kugler, John W and Bartel, Joan and Law, Marlys and Bateman, Marilyn and Klatt, Nancy E}, + journal={Journal of Clinical Oncology}, + volume={12}, + number={3}, + pages={601--607}, + year={1994}, + url={https://doi.org/10.1200/JCO.1994.12.3.601} +} + +@article{kim2023smoothed, + title={Smoothed quantile regression for censored residual life}, + author={Kim, Kyu Hyun and Caplan, Daniel J and Kang, Sangwook}, + journal={Computational Statistics}, + volume={38}, + pages={1001--1022}, + year={2023}, + url = {https://doi.org/10.1007/s00180-022-01262-z} +} + +@article{jin2001simple, + title={A simple resampling method by perturbing the minimand}, + author={Jin, Zhezhen and Ying, Zhiliang and Wei, LJ}, + journal={Biometrika}, + volume={88}, + number={2}, + pages={381--390}, + year={2001}, + url={https://doi.org/10.1093/biomet/88.2.381}, + publisher={Oxford University Press} +} + +@article{jin2003rank, + title={Rank-based inference for the accelerated failure time model}, + author={Jin, Zhezhen and Lin, DY and Wei, LJ and Ying, Zhiliang}, + journal={Biometrika}, + volume={90}, + number={2}, + pages={341--353}, + year={2003}, + url={https://doi.org/10.1093/biomet/90.2.341}, + publisher={Oxford University Press} +} + +@article{zhou2006simple, + title={A simple censored median regression estimator}, + author={Zhou, Lingzhi}, + journal={Statistica Sinica}, + pages={1043--1058}, + year={2006}, + url={https://www.jstor.org/stable/24307586}, + publisher={JSTOR} +} + +@article{powell1986censored, + title={Censored regression quantiles}, + author={Powell, James L}, + journal={Journal of Econometrics}, + volume={32}, + number={1}, + pages={143--155}, + year={1986}, + url={https://doi.org/10.1016/0304-4076(86)90016-3}, + publisher={Elsevier} +} + +@Manual{ctqrpackage, + title = {ctqr: {C}ensored and truncated quantile 
regression}, + author = {Paolo Frumento}, + year = {2021}, + note = {R package version 2.0}, + url = {https://CRAN.R-project.org/package=ctqr} +} + +@article{ackerberg2012practical, + title={A practical asymptotic variance estimation for two-step semiparametric estimators}, + author={Ackerberg, Daniel and Chen, Xiaohong and Hahn, Jinyong}, + journal={Review of Economics and Statistics}, + volume={94}, + number={2}, + pages={481--498}, + year={2012}, + url={https://doi.org/10.1162/REST_a_00251}, + publisher={The MIT Press} +} + +@article{kim2021comparison, + title={Comparison of variance estimation methods in semiparametric accelerated failure time models for multivariate failure time data}, + author={Kim, Kyuhyun and Ko, Jungyeol and Kang, Sangwook}, + journal={Japanese Journal of Statistics and Data Science}, + volume={4}, + number={2}, + pages={1179--1202}, + year={2021}, + url={https://doi.org/10.1007/s42081-021-00126-y}, + publisher={Springer} +} + +@Manual{Rcpppackage, + title = {Rcpp: Seamless R and C++ Integration}, + author = {Dirk Eddelbuettel and Romain Francois and JJ Allaire and Kevin Ushey and Qiang Kou and Nathan Russell and Inaki Ucar and Douglas Bates and John Chambers}, + year = {2022}, + note = {R package version 1.0.9}, + url = {https://CRAN.R-project.org/package=Rcpp} +} + +@Manual{RcppArmadillopackage, + title = {RcppArmadillo: `Rcpp' Integration for the `Armadillo' Templated Linear Algebra Library}, + author = {Dirk Eddelbuettel and Romain Francois and Doug Bates and Binxiang Ni and Conrad Sanderson}, + year = {2022}, + note = {R package version 0.11.1.1.0}, + url = {https://CRAN.R-project.org/package=RcppArmadillo} +} + +@article{siegel2021cancer, + title={Cancer statistics, 2021}, + author={Siegel, Rebecca L and Miller, Kimberly D and Fuchs, Hannah E and Jemal, Ahmedin}, + journal={CA: A Cancer Journal for Clinicians}, + volume={71}, + number={1}, + pages={7--33}, + year={2021}, + url={https://doi.org/10.3322/caac.21654}, + publisher={Wiley 
Online Library} +} + +@article{prentice1986case, + title={A case-cohort design for epidemiologic cohort studies and disease prevention trials}, + author={Prentice, Ross L}, + journal={Biometrika}, + volume={73}, + number={1}, + pages={1--11}, + year={1986}, + url={https://doi.org/10.1093/biomet/73.1.1}, + publisher={Oxford University Press} +} + +@Manual{ggplot2package, + title = {ggplot2: Create elegant data visualisations using the grammar of graphics}, + author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo and Hiroaki Yutani and Dewey Dunnington}, + year = {2022}, + note = {R package version 3.3.6}, + url = {https://CRAN.R-project.org/package=ggplot2} +} + +@article{koenker1994quantile, + title={Quantile smoothing splines}, + author={Koenker, Roger and Ng, Pin and Portnoy, Stephen}, + journal={Biometrika}, + volume={81}, + number={4}, + pages={673--680}, + year={1994}, + url={https://doi.org/10.1093/biomet/81.4.673}, + publisher={Oxford University Press} +} + +@article{koenker2004penalized, + title={Penalized triograms: {T}otal variation regularization for bivariate smoothing}, + author={Koenker, Roger and Mizera, Ivan}, + journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume={66}, + number={1}, + pages={145--163}, + year={2004}, + url={https://doi.org/10.1111/j.1467-9868.2004.00437.x}, + publisher={Wiley Online Library} +} + +@Manual{survivalpackage, + title = {survival: Survival analysis}, + author = {Terry M Therneau}, + year = {2021}, + note = {R package version 3.2-13}, + url = {https://CRAN.R-project.org/package=survival} +} + +@Article{brmspackage, + title = {Advanced {Bayesian} Multilevel Modeling with the {R} + Package {brms}}, + author = {Paul-Christian Bürkner}, + journal = {The R Journal}, + year = {2018}, + volume = {10}, + number = {1}, + pages = {395--411}, + doi = {10.32614/RJ-2018-017}, + encoding = {UTF-8} + } + 
+@article{koenker2008censored, + title={Censored quantile regression redux}, + author={Koenker, Roger}, + journal={Journal of Statistical Software}, + volume={27}, + pages={1--25}, + year={2008} +} diff --git a/_articles/RJ-2024-007/2022-185_R3.tex b/_articles/RJ-2024-007/2022-185_R3.tex new file mode 100644 index 0000000000..afea87329c --- /dev/null +++ b/_articles/RJ-2024-007/2022-185_R3.tex @@ -0,0 +1,1331 @@ +\title{Fitting a Quantile Regression Model for Residual Life with the R Package qris} +\author{Kyu Hyun Kim, Sangwook Kang, and Sy Han Chiou} + +\maketitle + +\abstract{ + In survival analysis, regression modeling has traditionally focused on assessing covariate effects on survival times, + which is defined as the elapsed time between a baseline and event time. + Nevertheless, focusing on residual life can provide a more dynamic assessment of covariate effects, + as it offers more updated information at specific time points between the baseline and event occurrence. + Statistical methods for fitting quantile regression models have recently been proposed, + providing favorable alternatives to modeling the mean of residual lifetimes. + Despite these progresses, the lack of computer software that implements these methods remains an obstacle for researchers analyzing data in practice. + In this paper, we introduce an R package \CRANpkg{qris} \citep{R:qris}, which implements methods for fitting semiparametric quantile regression models on residual life subject to right censoring. + We demonstrate the effectiveness and versatility of this package through comprehensive simulation studies and + a real-world data example, showcasing its valuable contributions to survival analysis research. +} + +\section{Introduction} \label{sec:intro} + +In the analysis of time-to-event data, standard statistical inference procedures often focus on quantities +based on failure time and its relationship with covariates measured at baseline. 
+However, throughout the follow-up process, +inference procedures based on residual life become increasingly intuitive for assessing the survival of subjects +and can offer insights into the effectiveness of treatments in prolonging the remaining lifetime. +As covariates can substantially change over time and +models based solely on baseline covariates have limited potential for long-term prognosis, +there is a growing interest in modeling the remaining lifetime of a surviving subject with updated patient information. +Many efforts have been made to model the mean residual life including proportional mean residual life models +\citep{maguluri1994estimation, oakes1990note, oakes2003inference, chen2005semiparametric}, +additive mean residual life models \citep{chen2006linear, chen2007additive, zhang2010goodness}, +and proportional scaled mean residual life models \citep{liu2008regression}. +Given that failure times are usually right-skewed and heavy-tailed, +the mean of the residual life might not be identifiable if +the follow-up time is not sufficiently long. +For this reason, quantiles, which are robust under skewed distribution, +have traditionally been used more frequently as alternative summary measures. +For example, the approach on the semiparametric quantile regression model for continuous responses \citep{koenker1978regression} has been extended to uncensored failure time data +\citep{jung1996quasi, portnoy1997gaussian, wei2006quantile} +and censored failure times data \citep{ying1995survival, portnoy2003censored,peng2008survival, huang2010quantile}. + + +When the outcome variable is the residual life, +semiparametric quantile models that apply the inverse probability of censoring weighting (IPCW) +principle to address right-censored observations have been explored +\citep{jung2009regression, kim2012censored, li2016quantile}. 
+These approaches are based on non-smooth estimating functions with respect to regression parameters, +and the estimates of the regression parameters are obtained either through zero-crossing of +non-smooth estimating functions using grid search techniques \citep{jung2009regression} or +by optimizing non-smooth objective functions with $L_1$-minimization algorithms \citep{kim2012censored, li2016quantile}. +While these methods are relatively straightforward to implement, +an additional challenge lies in standard error estimation, +which necessitates the computationally intensive use of a multiplier bootstrap method \citep{li2016quantile}. +Alternatively, \citet{jung2009regression} and \citet{kim2012censored} utilized the minimum dispersion statistic and +the empirical likelihood method, respectively, +to bypass the need to directly estimate the variance of the regression parameter estimator for +hypothesis testing and constructing confidence intervals. +The non-smooth nature of the estimating functions in these approaches +precludes the estimation of variance using the robust sandwich-type variance estimator typically employed +in equation-based estimation methods. +To lessen the associated computational burden, an induced smoothing was proposed \citep{brown2005standard}, +which modifies the non-smooth estimating equations into smooth ones. +Leveraging the asymptotic normality of the non-smooth estimator, +the smooth estimating functions are constructed by averaging out the random perturbations +inherent in the non-smooth estimating functions. +The resulting estimating functions become smooth with respect to the regression parameters, +allowing for the straightforward application of standard numerical algorithms, such as the Newton-Raphson method. +Furthermore, these smoothed estimating functions facilitate the straightforward computation of variances using +the robust sandwich-type estimator. 
+The induced smoothing approach has been employed in fitting semiparametric accelerated failure time (AFT) models +via the rank-based approach \citep{johnson2009induced, aftgeepackage, chiou2015semiparametric, Kang:fitt:2016}. +Regarding quantile regression, \citet{choi2018smoothed} considered the induced smoothing approach under +a competing-risks setting. All of these methods are based on modeling event times. +Recently, \citet{kim2023smoothed} proposed an induced smoothing estimator for fitting +a semiparametric quantile regression model for residual life. + + + +The availability of published R packages for fitting quantile regression models is somewhat limited. +The \code{rq()}, \code{nlrq()}, \code{rqss()}, and \code{crq()} functions in the package \CRANpkg{quantreg} +\citep{quantregpackage} are predominantly used and provide various features for fitting linear, +nonlinear, non-parametric, and censored quantile regression models, respectively. +The \code{rq()} function minimizes non-smooth objective functions to obtain point estimates of regression coefficients +and can accommodate right-censored survival times by incorporating weights. +By redefining survival times as the remaining lifetime at time $t_0$, +one can also obtain a non-smoothed estimator for quantile regression models for residual life \citep{kim2012censored}. +On the other hand, the \code{nlrq()} function is designed to fit a nonlinear quantile regression model, while +the \code{rqss()} function fits additive quantile regression models with +nonparametric terms, including univariate components and bivariate components, +using smoothing splines and total variation regularization techniques \citep{koenker1994quantile, koenker2004penalized}. 
+% On the other hand, the \code{nlrq()} function is designed to fit a nonlinear quantile regression model, +% while the \code{rqss()} function fits additive quantile regression models with nonparametric terms, +% including univariate components and bivariate components, using smoothing splines and +% total variation regularization techniques \citep{koenker1994quantile, koenker2004penalized}. +Furthermore, the \code{crq()} function fits a quantile regression model for censored data on the $\tau$-th +conditional quantile function of the response variable. +Overall, the \CRANpkg{quantreg} implements three methods for handling right-censored survival times: \citet{powell1986censored}'s estimator, +\citet{portnoy2003censored}'s estimator and \citet{peng2008survival}'s estimator. +However, none of the implemented methods in the \code{nlrq()}, \code{rqss()}, or \code{crq()} functions +are applicable for handling censored residual life using the induced smoothing methods. +The only function that implements the induced smoothing method is the \code{aftsrr()} function in the package +\CRANpkg{aftgee} \citep{aftgeepackage}, +but it is specifically designed for fitting semiparametric AFT models, which are not directly applicable +to fitting quantile regression models. + + +% In an effort to lessen the computational burden in handling non-smooth estimating equations, +% the \code{aftsrr()} function in package \CRANpkg{aftgee} \citep{aftgeepackage} is the only function that implements the induced smoothing method in the context of fitting semiparametric AFT models. + +Other R packages that can be used to fit quantile regression models for survival data include the package +\CRANpkg{ctqr} \citep{ctqrpackage}, package \CRANpkg{Brq} \citep{Brqpackage}, package \CRANpkg{brms} \citep{brmspackage}, +and package \CRANpkg{cmprskQR} \citep{cmprskQRpackage}. 
+The \code{ctqr()} function in the package \CRANpkg{ctqr} implements the methods proposed in +\citet{ctqrpackage} for right or interval-censored failure times with left-truncation. +The \code{Bqr()} function in the package \CRANpkg{Brq} implements Bayesian methods based on the +asymmetric Laplace distribution. +In the package \CRANpkg{brms}, the \code{brm()} function with the \code{family=asym\_laplace()} +option enables the implementation of full Bayesian inference. +The \code{crrQR()} function in the package \CRANpkg{cmprskQR} allows fitting quantile regression models +with competing risks. +All of these R packages are designed for fitting quantile regression models for failure times defined from a baseline +and are not applicable to the residual life setting. + +% In an effort to lessen the computational burden in handling non-smooth estimating equations, +% the \code{aftsrr()} function in package \CRANpkg{aftgee} \citep{aftgeepackage} is the only function that implements the induced smoothing method in the context of fitting semiparametric AFT models. + +The recently developed R package \CRANpkg{qris} \citep{R:qris} provides an efficient tool for +fitting semiparametric quantile regression models for residual life subject to right censoring. +The \CRANpkg{qris} package offers three methods for estimating the regression parameters: +$L_1$-minimization of non-smooth objective functions, induced smoothing with a non-iterative approach, +and an iterative procedure. +For standard error estimation, the \CRANpkg{qris} package provides two resampling-based approaches: +the partial multiplier bootstrap and the full multiplier bootstrap methods. +The partial multiplier bootstrap method utilizes the robust sandwich-type estimator by +incorporating the sample variance of perturbed estimating functions, +while the full multiplier bootstrap method is obtained by considering the sample variance +from the solutions of perturbed estimating functions. 
+To enhance the interpretability of results, the \CRANpkg{qris} package incorporates +graphical visualizations of covariate effects at different quantiles and base times, +utilizing the plotting environment similar to that in the \CRANpkg{ggplot2} package \citep{ggplot2package}, +%the \code{ggplot} plotting environment \citep{ggplot2package}, +thereby allowing for extensive flexibility and customization. +The ultimate goal of creating the \CRANpkg{qris} package is to facilitate +the easy incorporation of quantile regression for residual life into daily routines. +The package \CRANpkg{qris} is available on the Comprehensive R Archive Network (CRAN) at +\url{https://CRAN.R-project.org/package=qris}. + +The rest of the article is organized as follows: Section~\nameref{sec:nsm} introduces +a semiparametric regression model for quantiles of residual life and the estimation methods +implemented in the package. +Section~\nameref{sec:implementation} provides details about computing algorithms. +Illustrations of the package using a simulated dataset and the real data from the +North Central Cancer Treatment Group +are presented in Section~\nameref{sec:illustration}. +Finally, in Section~\nameref{sec:conclusion}, concluding remarks are provided along with some discussions. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Model %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Semiparametric quantile regression for residual life} +\label{sec:nsm} + +Define $T$ as the potential failure time that is subject to right censoring by $C$ +and $\vect{X}$ as a $p \times 1$ vector of covariates, +where $p$ is the number of covariates, including an intercept. +The observed data consists of +$n$ independent copies of $(Z, \delta, \vect{X})$, where $Z = \min(T, C)$, +$\delta = I(T \leq C)$, % is the failure indicator, +and $I(\cdot)$ is an indicator function. +We also assume $T$ and $C$ are marginally independent. 
+Define the $\tau$-th quantile of the residual life at $t_0 > 0$ as
+$\theta_{\tau}(t_0)$ that satisfies $P(T_i - t_0 \geq \theta_{\tau}(t_0) \ | \ T_i > t_0) = 1 - \tau$.
+We consider the semiparametric quantile regression model for the residual life \citep{kim2012censored, kim2023smoothed}. Given $T_i > t_0$,
+\begin{equation} \label{qr:mod1}
+  \log(T_i - t_0) = \vect{X}_{i}^{\top}\bm{\beta}_0(\tau, t_0) + \epsilon_i, i = 1, \ldots, n, %\label{qr:mod2}
+\end{equation}
+where $\bm{\beta}_0(\tau, t_0)$ is a $p \times 1$ vector of regression coefficients,
+and $\epsilon_i$ is a random error having zero $\tau$-th quantile.
+The quantile regression model for a continuous response \citep{koenker1978regression}
+is a special case of Equation~\eqref{qr:mod1} when $t_0 = 0$.
+For ease of notation, we omit $\tau$ and $t_0$ in $\bm{\beta}_0(\tau, t_0)$ and $\theta_{\tau}(t_0)$
+and write $\bm{\beta}_0$ and $\theta$.
+We present different estimation procedures to estimate $\bm{\beta}_0$ given $\tau$ and $t_0$ in the following.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%% Non-smooth model point estimation %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Estimation using non-smooth functions} \label{sec:nsm:pt}
+
+When there is no censoring, an estimator for $\bm{\beta}_0$ in Equation~\eqref{qr:mod1}
+can be obtained by solving the estimating equation \citep{kim2012censored}, where
+\begin{equation} \label{eq:ns:obj1}
+  \frac{1}{n}\sum_{i=1}^{n}I[T_i \ge t_0] \vect{X}_i \left\{I\left[\log(T_i - t_0) \leq \vect{X}_i^{\top}\bm{\beta} \right] - \tau \right\} = 0.
+\end{equation}
+However, Equation~\eqref{eq:ns:obj1} cannot be directly used when $T_i - t_0$ is subject to right censoring.
+The IPCW technique can be incorporated into Equation~\eqref{eq:ns:obj1}
+to account for the right censoring \citep{li2016quantile}.
+Specifically, in the presence of right censoring, +the estimator for $\bm{\beta}_0$ in Equation~\eqref{qr:mod1} can be obtained as the root of the following weighted estimating equations: +\begin{equation} \label{eq:nsm:ipw} + U_{t_0}(\bm{\beta}, \tau) = \frac{1}{n}\sum_{i=1}^{n}I[Z_i \ge t_0] \vect{X}_i \left\{I \left[\log(Z_i - t_0) \leq \vect{X}_i^{\top} \bm{\beta} \right]\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0)} -\tau \right\}, +\end{equation} +where $\widehat{G}(\cdot)$ +is the Kaplan-Meier estimate of the survival function $G(\cdot)$ of the censoring time $C$ and +$\widehat{G}(t) = \prod_{i: t_i \leq t} (1 - \sum_{j=1}^n (1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n I(Z_j \geq t_i))$. +A computational challenge arises because the exact solution to Equation~\eqref{eq:nsm:ipw} might not exist +due to the non-smoothness in $\beta$ caused by the involvement of indicator functions. +When the exact solutions do not exist, the root of Equation~\eqref{eq:nsm:ipw} can be approximated by +minimizing the $L_1$-objective function $L_{t_0}(\bm{\beta}, \tau)$ \citep{li2016quantile}, +\begin{align*} + \label{l1:nsm} + \nonumber + L_{t_0}(\bm{\beta}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/\widehat{G}(t_0)} \left| \log(Z_i - t_0) - \vect{X}_i^{\top}\beta \right| + \\ + & \left| M - \bm{\beta}^{\top}\sum_{l=1}^n - \vect{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}(Z_l)/\widehat{G}(t_0)}\right| + + \ \left| M - \bm{\beta}^{\top}\sum_{l=1}^n 2\tau \vect{X}_l I[Z_l > t_0]\right|, +\end{align*} +where $M > 0$ bounds +$\left| \bm{\beta}^{\top}\sum_{i=1}^n - \vect{X}_i \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/ \widehat{G}(t_0)}\right|$ +and $\left| \bm{\beta}^{\top}\sum_{i=1}^n 2\tau \vect{X}_i I[Z_i > t_0]\right|$ from above. +Numerically, the limit $M$ is set to be an extremely large number, and the \code{qris()} function uses $M = 10^6$. +Denote the resulting estimator to be $\bns$. 
+It has been shown that $\bns$ is consistent for $\bm{\beta}_0$ and asymptotically normally distributed
+\citep{li2016quantile}.
+
+Despite the well-established asymptotic properties, directly estimating the variance of $\bns$ is impractical
+because it involves the derivative of non-smooth functions.
+A multiplier bootstrap method has typically been employed \citep{li2016quantile} to address this difficulty.
+The multiplier bootstrap method considers the perturbed version of $U_{t_0}(\bm{\beta}, \tau)$, defined as
+\begin{equation*}
+  \label{eq:nsm:rev}
+  U_{t_0}^{\ast}(\bm{\beta}, \tau) = \frac{1}{n}\sum_{i=1}^{n} \eta_i I[Z_i \ge t_0] \vect{X}_i \left\{I \left[\log(Z_i - t_0) \leq \vect{X}_i^{\top} \bm{\beta} \right]\frac{\delta_i}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} -\tau \right\},
+\end{equation*}
+where $\eta_i, i = 1, \ldots, n, $ are independent and identically distributed (iid) random variables
+generated from a positive distribution with unit mean and variance,
+and $\widehat{G}^\ast(\cdot)$ is a perturbed version of $\widehat{G}(\cdot)$,
+constructed as
+$\widehat{G}^\ast(t) =
+\prod_{i: t_i \leq t} (1 - \sum_{j=1}^n \eta_j(1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n \eta_jI(Z_j \geq t_i))$
+for a given realization of $\eta_i$.
+% by substituting $\sum_{j=1}^n (1-\delta_j) I(Z_j \leq t)$ in the numerator and $\sum_{j=1}^n I(Z_j \geq t)$
+% in the denominator with
+% $\sum_{j=1}^n \eta_j (1-\delta_j) I(Z_j \leq t)$ and $\sum_{j=1}^n \eta_j I(Z_j \geq t)$ given $(\eta_1, \ldots, \eta_n)$, respectively.
+On the other hand, a perturbed $L_1$-objective function, denoted as $L_{t_0}^{\ast}(\bm{\beta}, \tau)$, +can be similarly constructed, where +\begin{align*} + L_{t_0}^{\ast}(\bm{\beta}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} \left| \log(Z_i - t_0) - \vect{X}_i^{\top}\bm{\beta} \right| + \nonumber \\ + & \left| M - \bm{\beta}^{\top}\sum_{l=1}^n - \vect{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}^{\ast}(Z_l)/\widehat{G}^{\ast}(t_0)}\right| + + \ \left| M - \beta^{\top}\sum_{l=1}^n 2\tau \vect{X}_l \eta_l I[Z_l > t_0]\right|. +\end{align*} +Solving for $U_{t_0}^{\ast}(\bm{\beta}, \tau) = 0$, or equivalently, +minimizing $L_{t_0}^{\ast}(\bm{\beta}, \tau)$, yields one realization of $\bns$. +The multiplier bootstrap variance is computed as the sample variance of +a large number of realizations of $\bns$. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Induced smoothing %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Estimation using induced smoothed functions} \label{sec:IS:pt} + +The regression coefficient in Equation~\eqref{qr:mod1} can be more efficiently obtained +through the induced smoothed version of Equation~\eqref{eq:nsm:ipw}. +The induced smoothed estimating functions are constructed by taking +the expectation with respect to a mean-zero random noise added to the +regression parameters in Equation~\eqref{eq:nsm:ipw}. 
+Specifically, +\begin{align}\label{eq:is} + \widetilde{U}_{t_0}(\bm{\beta}, \tau, H) & = E_w \{U_{t_0}(\bm{\beta}+\matr{H}^{1/2}\matr{W}, \tau)\}\nonumber\\ + & = \frac{1}{n} \sum_{i=1}^{n} I[Z_i > t_0] \vect{X}_i \left\{ \Phi\left(\frac{\vect{X}_i^\top\bm{\beta}-\log(Z_i-t_0)}{\sqrt{\vect{X}_i^{\top} \matr{H} \vect{X}_{i}}}\right)\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0) } -\tau \right\}, +\end{align} +where $\matr{H} = O(n^{-1})$, +$\matr{W} \sim N(0, \matr{I}_p)$ is a standard normal random vector, +$\matr{I}_p$ is the $p \times p $ identity matrix, +and $\Phi(\cdot)$ is the cumulative distribution function of a standard normal random variable. +A typical choice for $\matr{H}$ is to fix it at $n^{-1}\matr{I}_p$, +while some alternative choices are explored in \citet{chiou2015rank}. +Let $\bis$ be the solution to $\widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H}) = 0$. +Since Equation~\eqref{eq:is} is a smooth function in $\bm{\beta}$, +the estimator can be obtained using standard numerical algorithms such as the Newton-Raphson method. +Moreover, the induced smoothed estimator for $\bm{\beta}_0$ has been shown to be +asymptotically equivalent to its non-smooth counterpart \citep{kim2023smoothed}. + + +Following the idea in Section~\nameref{sec:nsm:pt}, +the multiplier bootstrap procedure can be similarly employed to estimate the variance of $\bis$. +The perturbed version of Equation~\eqref{eq:is} takes the form of +\begin{equation} \label{eq:7} + \widetilde{U}^{\ast}_{t_0}(\bm{\beta}, \tau, \matr{H}) = \frac{1}{n} \sum_{i=1}^{n} \eta_i I[Z_i > t_0] \vect{X}_i \left\{ \Phi\left(\frac{\vect{X}_i^\top\bm{\beta} - \log(Z_i-t_0)}{\sqrt{\vect{X}_i^{\top} \matr{H} \vect{X}_{i}}}\right)\frac{\widehat{G}^{\ast}(t_0) \delta_i}{\widehat{G}^{\ast}(Z_i)} -\tau \right\}. 
+\end{equation}
+The multiplier bootstrap procedure estimates the variance of $\bis$ by calculating the sample variance of
+a large number of realizations of $\bis$ obtained by repeatedly solving Equation~\eqref{eq:7}.
+
+
+It has been shown that the asymptotic variance
+$\Var(\bm{\beta}, \tau)$ can be decomposed into
+$\matr{A}(\bm{\beta})^{\top} \matr{V}(\bm{\beta}) \matr{A}(\bm{\beta})$ \citep{kim2023smoothed},
+where the two components, $\matr{A}(\bm{\beta})$ and $\matr{V}(\bm{\beta})$, can be estimated separately.
+Since Equation~\eqref{eq:is} is a smooth function in $\bm{\beta}$, the slope matrix,
+$\matr{A}(\bm{\beta})$, can be conveniently estimated by differentiating
+$\widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H})$ with respect to $\bm{\beta}$.
+The explicit form of $\matr{A}(\bm{\beta})$ is as follows:
+\begin{align} \label{eq:cov:slp}
+  \matr{A}(\bm{\beta}) & = \frac{\partial \widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H})}{\partial \bm{\beta}} \nonumber \\
+  & = \frac{1}{n}\sum_{i=1}^{n} I[Z_i > t_0] \vect{X}_i \frac{G(t_0) \delta_i}{G(Z_i)} \phi\left(\frac{{\vect{X}_i}^{\top}\bm{\beta} - \log(Z_i-t_0)}{\sqrt{{\vect{X}_i}^{\top}\matr{H} \vect{X}_i}}\right)\left(\frac{-{\vect{X}_i}}{\sqrt{{\vect{X}_i}^{\top} \matr{H} {\vect{X}_i}}}\right),
+\end{align}
+where $\phi (\cdot)$ is the density function of the standard normal random variable.
+
+The slope matrix, $\widehat{\matr{A}}(\bis)$, can be evaluated directly
+by plugging in $\bis$ and $\widehat{G}(\cdot)$.
+On the other hand, the variance of the estimating function,
+$\widehat{\matr{V}}(\bm{\beta})$, can be obtained by a computationally efficient
+resampling method motivated by the multiplier bootstrap procedure in
+Section~\nameref{sec:nsm:pt}.
+Specifically, we propose estimating $\widehat{\matr{V}}(\bis)$ as the
+sample variance of a large set of realizations of the perturbed version of
+$\widetilde{U}_{t_0}(\bis, \tau, \matr{H})$ presented in Equation~\eqref{eq:7}.
+We refer to this procedure as the partial multiplier bootstrapping approach +because it utilizes the perturbed estimating function, +similar to the full multiplier bootstrapping approach, +but the computation of $\widehat{\matr{A}}(\bis)$ and $\widehat{\matr{V}}(\bis)$ +does not involve the repeated solving of the perturbed estimating equations. +Thus, the partial multiplier bootstrapping approach is expected to be computationally +more efficient than the multiplier bootstrap method. +A similar procedure and its performance have been studied in modeling failure +times with semiparametric AFT models \citep{chiou2014fast,aftgeepackage}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Iteration procedure %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Iterative procedure in induced smoothing estimation} \label{sec:iter} + +The induced estimator $\bis$ is obtained with a fixed $\matr{H}$, +as described in Section~\nameref{sec:IS:pt}, and its variance is estimated separately. +This estimation procedure can be viewed as a special case of the following iterative procedure, +which updates $\matr{H}$ and $\bis$ iteratively. +Specifically, the iterative algorithm utilizes the Newton-Raphson method while sequentially updating $\bis$ +and $\widehat{\Var}(\bis)$ until convergence. +Similar iterative algorithms have also been considered previously in the induced smoothing approach +for semiparametric AFT models \citep{johnson2009induced, chiou2014fast, chiou2015semiparametric, choi2018smoothed}. +The iterative procedure is summarized as follows: +\begin{description} +\item[\bf Step 1:] + Set the initial values $\widehat{\bm{\beta}}^{(0)}$, + $\widehat{\matr{\Sigma}}^{(0)} = \matr{I}_{p}$, + and $\matr{H}^{(0)} = n^{-1}\widehat{\matr{\Sigma}}^{(0)}$. 
+\item[\bf Step 2:]
+  Given $\widehat{\bm{\beta}}^{(k)}$ and $\matr{H}^{(k)}$ at the $k$-th step, update $\widehat{\bm{\beta}}^{(k)}$ by
+  \begin{equation*}
+    \widehat{\bm{\beta}}^{(k+1)}=\widehat{\bm{\beta}}^{(k)} - \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k)})^{-1}\widetilde{U}_{t_0}(\widehat{\bm{\beta}}^{(k)}, \tau, \matr{H}^{(k)}).
+  \end{equation*}
+\item[\bf Step 3:]
+  Given $\widehat{\bm{\beta}}^{(k+1)}$ and $\widehat{\matr{\Sigma}}^{(k)}$, update $\widehat{\matr{\Sigma}}^{(k)}$ by
+  \begin{equation*}
+    \widehat{\matr{\Sigma}}^{(k+1)} = \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k+1)})^{-1} \widehat{\matr{V}}(\widehat{\bm{\beta}}^{(k+1)}, \tau) \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k+1)})^{-1}.
+  \end{equation*}
+\item[\bf Step 4:]
+  Set $\matr{H}^{(k+1)} = n^{-1}\widehat{\matr{\Sigma}}^{(k+1)}$. Repeat Steps 2, 3 and 4 until $\widehat{\bm{\beta}}^{(k)}$ and $\widehat{\matr{\Sigma}}^{(k)}$ converge.
+\end{description}
+The initial value, $\widehat{\bm{\beta}}^{(0)}$, could be chosen as $\bns$.
+We define $\bit$ and $\widehat{\matr{\Sigma}}_{\tiny\mbox{IT}}$ as the
+values of $\widehat{\bm{\beta}}^{(k)}$ and $\widehat{\matr{\Sigma}}^{(k)}$ at convergence,
+and $\widehat{\Var}(\bit) = n^{-1}\widehat{\matr{\Sigma}}_{\tiny\mbox{IT}}$.
+In Step 3, $\widehat{\matr{V}}(\widehat{\bm{\beta}}^{(k+1)}, \tau)$
+is obtained using the partial multiplier bootstrap approach.
+However, the full multiplier bootstrap approach can also be employed
+but would require longer computation times.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%% Package implementation %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Package implementation}
+\label{sec:implementation}
+
+The main function in the \CRANpkg{qris} package for
+estimating the regression parameters in the quantile regression model for residual life is the
+\code{qris()} function.
+The \code{qris()} function is written in C++ and incorporated into R +using the \CRANpkg{Rcpp} \citep{Rcpppackage} and \CRANpkg{RcppArmadillo} \citep{RcppArmadillopackage} packages. +The synopsis of \code{qris} is: + +\begin{example} + > args(qris) + function (formula, data, t0 = 0, Q = 0.5, nB = 100, method = c("smooth", + "iterative", "nonsmooth"), se = c("fmb", + "pmb"), init = c("rq", "noeffect"), verbose = FALSE, + control = qris.control()) +\end{example} +% \input{codes/argsqris.tex} + +The required argument is \code{formula}, +which specifies the quantile regression model to be fitted using the variables in \code{data}. +The \code{formula} assumes that the response variable is a \class{Surv} object +created by the \code{Surv()} function in the \CRANpkg{survival} package \citep{survivalpackage}. +This formula structure is commonly adopted for handling survival data in R, as seen in functions +like \code{survreg()} and \code{coxph()} in the \CRANpkg{survival} package. +The argument \code{t0} specifies the base time used in defining residual life. +The default value of \code{t0} is set to zero, in which case residual life reduces to a failure time. +The \code{Q} argument is used to specify the target quantile of residual life to estimate, +with the default value being set to 0.5 (median). +The \code{nB} argument specifies the bootstrapping size used in standard error estimation, +with the default value set to 100. +The \code{method} argument specifies one of the three estimation methods: +\code{"nonsmooth"}, \code{"smooth"}, and \code{"iterative"}, +corresponding to the estimating procedures outlined in Sections~\nameref{sec:nsm:pt}, +\nameref{sec:IS:pt}, and~\nameref{sec:iter}, respectively. +Given the point estimates of the regression parameters, +their standard errors can be estimated using one of two implemented methods: +\code{se = "fmb"} and \code{se = "pmb"}. 
+The \code{se = "fmb"} method employs a full-multiplier bootstrapping approach to
+estimate the variance by the sample variance of a large number of realizations of $\widehat{\bm{\beta}}$.
+The \code{se = "pmb"} method estimates the variance using a robust sandwich variance estimator
+and employs the computationally efficient partial multiplier bootstrapping approach described in
+Section~\nameref{sec:IS:pt}.
+The \code{"fmb"} option is available for all three point estimation methods,
+whereas the \code{"pmb"} option is not available for the \code{"nonsmooth"}
+point estimation method due to the lack of a closed-form sandwich variance estimator.
+The \code{init} argument allows users to specify the initial value for estimating regression parameters
+by either a $p$-dimensional numerical vector or a character string.
+In the latter case, the options \code{init = "rq"} and \code{init = "noeffect"} correspond to
+the point estimate obtained from the \code{rq()} function in the \CRANpkg{quantreg} package
+and a $p$-dimensional vector of zeros, respectively.
+The default value for \code{init} is \code{init = "rq"}.
+Among the three methods implemented for point estimation, \code{method = "smooth"} and
+\code{method = "nonsmooth"} are non-iterative,
+in the sense that point estimation is performed separately from the estimation of standard errors.
+On the other hand, \code{method = "iterative"} calculates point estimates and the corresponding
+standard error estimates simultaneously through iterative updates.
+When \code{method = "iterative"}, users can define specific convergence criteria using \code{qris.control()}.
+The available options in \code{qris.control()} are as follows.
+
+\begin{example}
+  > args(qris.control)
+  function (maxiter = 10, tol = 0.001, trace = FALSE)
+\end{example}
+% \input{codes/argscontrol.tex}
+
+The \code{maxiter} argument specifies the maximum number of iterations.
+The default value for \code{maxiter} is ten, +as the proposed algorithm typically converges within ten steps based on our exploration. +The convergence tolerance is controlled using the \code{tol} argument, +which has a default value of \code{1e-3}. +The \code{trace} argument takes a logical value and +is used to determine whether to print the result for each iteration. +The default setting is \code{trace = FALSE}. +The \class{qris} object is fully compatible with many of R's generic functions, +including \code{coef()}, \code{confint()}, \code{plot()}, \code{predict()}, +\code{print()}, \code{residuals()}, \code{summary()}, and \code{vcov()}. + + +Among the available \code{S3} methods, +a unique feature of the \CRANpkg{qris} package's \code{S3 plot} method, +when applied to a \class{qris} object, is its ability to automatically +update the original object by extending the range of $\tau$ or $t_0$ values. +This extension enables the generation of a covariate effect plot over the +newly specified values of $\tau$ or $t_0$, +providing a comprehensive visualization of the covariate effects across the extended range. +The \code{S3} method for plotting a \class{qris} object is shown below. +\begin{example} + > argsAnywhere(plot.qris) + function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL, byQs = FALSE, + ggextra = NULL, ...) + NULL +\end{example} +The argument \code{x} is a \class{qris} object created using the \code{qris()} function. +The \code{t0s} and \code{Qs} arguments are numeric vectors that enable users to specify +the values of $t_0$ or $\tau$ for plotting the covariate effect. +If \code{t0s} and \code{Qs} are not specified, +the covariate effects are plotted against $\tau = 0.1, 0.2, \ldots, 0.9$ +at the base time ($t_0$) inherited from the \class{qris} object specified in \code{x}. 
+The \code{nB} argument is a numerical variable that controls the sample size for bootstrapping,
+used to compute standard error estimates based on the variance estimation specified
+in the original \class{qris} object.
+When \code{nB} is specified, the function calculates standard errors
+for all combinations of $t_0$ and $\tau$ specified in \code{t0s} and \code{Qs},
+computes 95\% confidence intervals accordingly,
+and includes them in the covariate effect plot.
+The \code{vari} argument is a character string that allows users to specify the
+names of the covariates they want to display in the effect plots.
+When the \code{vari} argument is not specified,
+all covariates will be included in the plots by default.
+The covariate effect plot can be plotted against the specified quantiles by
+setting \code{byQs = TRUE} or against the specified base times by setting \code{byQs = FALSE}.
+Finally, the \code{ggextra} argument allows users to pass additional graphical parameters
+to the \CRANpkg{ggplot2} package, offering further customization options for the plots.
+When the \code{plot()} function is called, it internally invokes the \code{qris.extend()}
+function to compute the covariate effects at additional values.
+The syntax for the \code{qris.extend()} function is provided below:
+\begin{example}
+  > args(qris.extend)
+  function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL)
+  NULL
+\end{example}
+The arguments in \code{qris.extend()} are inherited from the arguments specified in
+the \code{plot()} function.
+To reduce runtime when repeatedly calling the \code{plot()},
+one can calculate the desired covariate effects by applying \code{qris.extend()}
+outside of \code{plot()} first and then supply the results to \code{plot()}.
+This approach allows for pre-computation of the covariate effects, making it more
+efficient when generating multiple plots.
+Overall, the unique plotting feature in \CRANpkg{qris} +provides users with a seamless and effortless approach to conducting a +comprehensive assessment of the covariate effects across different quantiles or base times. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Illustration %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%- +\section{Illustration} \label{sec:illustration} + +\subsection{Simulated data}\label{subsec:simulation} +In this subsection, we present a simple simulation example to validate the implementations in the +proposed \CRANpkg{qris} package. +The simulation involves five covariates, denoted as $X_1, \ldots, X_5$. +Among these covariates, $X_1$ and $X_4$ follow a standard uniform distribution, +$X_2$ follows a binomial distribution with a success probability of 0.5, +$X_3$ follows a standard normal distribution, and $X_5$ follows a standard exponential distribution. +We assume that $X_2, X_3, X_4$, and $X_5$ do not impact the residual life, +meaning their corresponding coefficient values $\beta_2$, $\beta_3$, $\beta_4$, and $\beta_5$ are zero. +The survival time $T$ is generated from a Weibull distribution with the survival function +$S(t) = \exp\{-(\rho t)^\kappa\}$ for $t > 0$, where $\kappa = 2$, and $\rho$ is obtained by solving +\begin{equation} \label{eq:sim:weibull} + \rho^{-1}\{ (\rho t_0)^\kappa - \log (1-\tau) \}^{(1/\kappa)}- t_0 = \exp\{\beta_0 + \beta_1 X_1\}, +\end{equation} +for a specified $t_0$ and $\tau$. +We set the intercept $\beta_0 = \log(5)$ and $\beta_1 = \log(2)$ at $t_0 = 0$. +Given $\rho$, $\tau$, and $X_1$, the true values of $\beta_0$ and $\beta_1$ +can be obtained sequentially from Equation~\ref{eq:sim:weibull} for different $t_0 > 0$. +In our case, the corresponding true values of $\beta_0$ are approximately 1.411 and 1.219 for $t_0=1$ and 2, respectively. +Similarly, the true values of $\beta_1$ are approximately 0.797 and 0.907 for $t_0=1$ and 2, respectively. 
+The closed-form expression for generating $T$ is then $\{ -\log(1 - u) \}^{1/\kappa} / \rho$, +where $u$ is a uniform random variable over $(0, 1)$. +Given these specifications, +we have implemented the \code{data.gen()} function to generate simulation data. +The \code{data.gen()} function takes four arguments: +\code{n}, \code{t0}, \code{cen}, and \code{Q}, representing the sample size, $t_0$, censoring proportion, +and $\tau$, respectively. +We generate censoring times $C$ from an independent uniform distribution over $(0, c)$, +where $c$ is chosen to achieve the desired censoring proportions of 10\% and 30\%. +Using the generated dataset, we fit the model using three different estimation methods: +induced smoothing, non-smooth, and iterative-induced smoothing. +All analyses were conducted on a 4.2 GHz Intel(R) quad Core(TM) i7-7700K central processing unit (CPU) using R 4.3.0 \citep{r2021}. +The following code demonstrates the implementation of \code{data.gen()} to generate a simulation dataset. 
+\begin{example} + > data.gen <- function(n, t0, cen = .3, Q = .5) { + + if (!(t0 %in% 0:2)) + + stop("T0 is limited to three specific values: 0, 1, or 2.") + + if (!(cen %in% c(0, .1, .3))) + + stop("Censoring is limited to three specific values: 0%, 10%, or 30%.") + + if (!(Q %in% c(.25, .5))) + + stop("Q is limited to two specific values: 0.25, or 0.50.") + + censoring <- Inf + + if (t0 == 0) { + + if (cen == .1) censoring <- runif(n, 0, 125.1) + + if (cen == .3) censoring <- runif(n, 0, 25.49) + + beta0 <- log(5); beta1 <- log(2) + + } + + if (t0 == 1) { + + if (cen == .1) censoring <- runif(n, 0, 120.8) + + if (cen == .3) censoring <- runif(n, 0, 23.41) + + beta0 <- 1.410748; beta1 <- 0.7974189 + + } + + if (t0 == 2) { + + if (cen == .1) censoring <- runif(n, 0, 120.6) + + if (cen == .3) censoring <- runif(n, 0, 26.20) + + beta0 <- 1.219403; beta1 <- 0.9070615 + + } + + dat <- data.frame(censoring, + + Time0 = sqrt(-log(1 - runif(n))), + + X1 = runif(n), + + X2 = rbinom(n, 1, .5), + + X3 = rnorm(n), + + X4 = runif(n), + + X5 = rexp(n, 1)) + + rho <- (-log(1 - Q))^0.5 * (((exp(beta0 + beta1 * dat$X1) + t0)^2 - t0^2)^-0.5) + + dat$Time0 <- dat$Time0 / rho + + dat$Time <- pmin(dat$Time0, dat$censoring) + + dat$status <- 1 * (dat$Time0 < dat$censoring) + + subset(dat, select = c(Time, status, X1, X2, X3, X4, X5)) + + } + > set.seed(3) + > head(data.gen(200, 0)) + + Time status X1 X2 X3 X4 X5 + 1 4.283379 0 0.09137221 0 2.1638425 0.33833437 0.8751895 + 2 14.797025 1 0.81196535 1 0.8803785 0.82101134 0.3648634 + 3 5.934559 1 0.60923418 1 0.5051163 0.56536790 0.3997803 + 4 7.223266 1 0.54550179 1 0.1105902 0.32417202 1.2169470 + 5 15.128553 1 0.86115736 0 -0.2928586 0.05825095 0.1835962 + 6 5.135852 1 0.28915525 0 0.7723200 0.94126325 0.3809120 +\end{example} +% \input{codes/datagen} +The \code{data.gen()} function generates a \code{data.frame} containing seven variables. 
+The \code{Time} variable represents the observed survival time, +while the \code{status} variable serves as the event indicator, +taking the value 1 for observed events and 0 for censored observations. +The variables \code{X1}, $\ldots$, \code{X5} are the covariates. +The implementation in the \code{data.gen()} function generates the Weibull survival times +using the inverse probability integral transform technique. +Alternatively, users can use the \code{rweibull()} function with the parameters +\code{shape = 2} and \code{scale = 1 / rho} to generate these Weibull survival times directly. + +We assess the performance of the proposed implementation across various scenarios, +including three sample sizes ($n = 200, 400, 1000$), three levels of $t_0$ ($0, 1, 2$), +two censoring proportions (10\% and 30\%), and two values of $\tau$ (0.25 and 0.50). +For a given dataset, we apply the full-multiplier bootstrapping approach with 200 bootstrap samples +to all three available estimating procedures: +\code{method = "nonsmooth"}, \code{method = "smooth"}, and \code{method = "iterative"}. +To facilitate the evaluation process, we create the \code{do\_fmb()} function +to record the coefficient estimates, standard errors, +and computing times for fitting a single simulated dataset generated from \code{data.gen()}. +The following is the implementation of the \code{do\_fmb()} function and the corresponding code +to run the simulation with 200 replications. +We present the code and result of the simulation experiments conducted at three different sample sizes, +with $t_0$ values set to 0 and 1, +while holding the censoring proportion at 30\% and $\tau$ value at 0.5. +The results for other simulation scenarios are provided in the Supplementary Materials. 
+\begin{example} + > do_fmb <- function(n, t0, cen, Q, nB) { + + dat <- data.gen(n, t0, cen, Q) + + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + + stamp <- NULL + + stamp[1] <- Sys.time() + + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "fmb") + + stamp[2] <- Sys.time() + + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "nonsmooth", se = "fmb") + + stamp[3] <- Sys.time() + + f3 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "fmb") + + stamp[4] <- Sys.time() + + list(smooth = c(f1$coef, f1$std), + + nonsmooth = c(f2$coef, f2$std), + + iter = c(f3$coef, f3$std), + + times = diff(stamp)) + + } + > B <- 200 + > set.seed(2) + > sims0_fmb <- mapply(function(n, t0) + + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) + > sim1_fmb <- mapply(function(n, t0) + + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +\end{example} +% \input{codes/simulation_fmb} + +Figure~\ref{fig:sim1} displays violin plots that provide visualizations of the empirical +distribution of the coefficient estimates. +As expected, all three estimators exhibit small biases, +which are calculated as the difference between the point estimates (PE) and the true regression coefficients. +Furthermore, the empirical distributions of the PEs demonstrate a normal-like shape, +aligning with the asymptotic properties of the proposed method \citep{li2016quantile, kim2023smoothed}. +When the sample size is smaller (e.g., $n = 200$ and 400), +the \code{nonsmooth} approach appears to yield slightly larger empirical standard errors (ESE) +compared to the \code{smooth} or \code{iterative} approaches. +However, when $n = 1000$, the ESEs are similar across all approaches. 
+On the other hand, the comprehensive simulation results presented in Table 1 of the Supplementary Materials +confirm that all coefficient estimates closely approximate the true regression coefficients. +Moreover, the ESEs and the averaged estimated standard errors (ASE) are in close agreement for all scenarios, +indicating the validity of the variance estimation. +Furthermore, the computation times, which are presented separately in the upper panel of Table~\ref{tab:time}, +indicate that when employing the full multiplier bootstrapping approach, +the \code{nonsmooth} approach demonstrates a slight advantage in terms of computational efficiency over the +\code{smooth} approach, while the \code{iterative} approach takes 5.1 to 9.5 times longer than the \code{smooth} approach. +In summary, the timing results show that the proposed method can yield valid inference results within seconds, +even with large datasets of up to 1000 observations or +when using the computationally demanding full multiplier bootstrapping approach for variance estimation. + +% As expected, all three estimators yield similar point estimates (PE), +% empirical standard error (ESE) +% computed as the simple standard deviation of the 200 PEs, +% and averaged estimated standard error (ASE) computed as the average of the 200 +% full multiplier bootstrap standard errors. +% More importantly, all PEs are close to true regression coefficients confirming the unbiasedness of the proposed estimators. +% The ESE and ASE are in close agreement for all scenarios, +% indicating the validity of the variance estimation. +% On the other hand, Table~\ref{tab:time} shows that, when the full multiplier bootstrapping approach is employed (fmb), the non-smooth approach has a slight edge over the smooth approach in terms of computing time while the iterative estimator took 5.1 to 9.5 times longer than the smooth approach. 
+% Overall, the timing results show the proposed method can provide valid inference results +% in seconds even with dataset size as large as 1000 and with the computational demanding +% full multiplier bootstrapping approach for variance estimation. + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{1.0\textwidth} + % \includegraphics[scale = .275]{figure/vplot_t0_c3_Q50} + %\includegraphics[width = .95\textwidth]{figure/vplot_t0_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_t0_c3_Q50} + \caption{$t_0 = 0$} + \label{fig:sim1t0} + %} + \end{subfigure} +% \hill + \\[3ex] + \begin{subfigure}[b]{1.0\textwidth} + %\includegraphics[width = .95\textwidth]{figure/vplot_t1_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_t1_c3_Q50} + \caption{$t_0 = 1$} + \label{fig:sim1t1} + %} + \end{subfigure} + \caption{\label{fig:sim1}Comparison of the \code{smooth}, \code{nonsmooth} and \code{iterative} estimators with \code{se = "fmb"} + under 30\% censoring and $\tau = 0.5$.} +\end{figure*} + +When $t_0 = 0$, the targeted semiparametric quantile regression model for residual life +simplifies to the standard quantile regression model for survival time. +In such cases, existing functions like \code{crq()} from the \CRANpkg{quantreg} package \citep{quantregpackage} +can be employed. +A comparison between the performance of \code{crq()} and our proposed implementation +when $t_0 = 0$ is presented in the Supplementary Materials, +where the standard errors of the \code{crq()} are obtained from the bootstrap method with 200 bootstrap samples. +Overall, the performance of \code{crq()} is comparable to the proposed methods in terms of bias and standard errors. +However, we have occasionally encountered situations where the \code{crq()} function fails to converge, +particularly when the sample size is large, as in the case of $n = 1000$. 
+In the other extended simulation scenarios outlined in the Supplementary Materials, +which encompass various levels of $t_0$, censoring proportions, and $\tau$, +the proposed methods consistently exhibit satisfactory performance across all settings. + +% We further extended the simulation to include different levels of $t_0$, $\tau$, +% and censoring rate, and the results of this extended simulation can also be found in the +% Supplementary Materials. + +The true potential of the proposed smooth approach lies in its capability for +efficient variance estimation through the implementation of the partial multiplier bootstrapping approach. +This approach eliminates the need for repetitive solving of estimating equations, +resulting in improved computational efficiency in variance estimation. +To demonstrate its usefulness, we conducted a simulation using both the smooth approach +and the iterative approach with the partial multiplier bootstrapping approach (\code{se = "pmb"}). +This simulation was conducted under the settings of $\tau = 0.5$, $t_0 = 0$ and $1$, +and a 30\% censoring rate. +The \code{do\_pmb()} function was accordingly modified as follows. 
+ +\begin{example} + > do_pmb <- function(n, t0, cen, Q, nB) { + + dat <- data.gen(n, t0, cen, Q) + + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + + stamp <- NULL + + stamp[1] <- Sys.time() + + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "pmb") + + stamp[2] <- Sys.time() + + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "pmb") + + stamp[3] <- Sys.time() + + list(smooth = c(f1$coef, f1$std), + + iter = c(f2$coef, f2$std), + + times = diff(stamp)) + + } + > B <- 200 + > set.seed(2) + > sims0_pmb <- mapply(function(n, t0) + + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) + > sims1_pmb <- mapply(function(n, t0) + + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +\end{example} +% \input{codes/simulation_pmb} + +The simulation results obtained using the partial multiplier bootstrapping approach are presented in Figure~\ref{fig:sim2} +and Tables 7 -- 12 in the Supplementary Materials, +while the computing times are displayed in the lower panel of Table~\ref{tab:time}. +Overall, the estimation results obtained using \code{se = "pmb"} in Figure~\ref{fig:sim2} +closely resemble those in Figure~\ref{fig:sim1} with \code{se = "fmb"}. +As seen in Tables 7 and 8, the ESEs from the non-iterative and iterative methods are comparable, +while the ASEs slightly overestimate the ESEs when the sample size is small. +The gaps are slightly smaller for the iterative method, +as shown in some cases \citep{johnson2009induced, kim2021comparison}. +The magnitudes of the differences are not large, and they also become smaller when the sample size reaches $n = 1000$. +More importantly, the computing times with \code{se = "pmb"} show significant speed improvements compared +to when \code{se = "fmb"} is used in every case; we observed up to 79\% timing improvements. 
+ +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{1.0\textwidth} + %\subfigure[$t_0 = 0$]{ + % \includegraphics[scale = .275]{figure/vplot_t0_c3_Q50} + %\includegraphics[width = 0.95\textwidth]{figure/vplot_pmb_t0_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_pmb_t0_c3_Q50} + \caption{$t_0 = 0$} + \label{fig:sim2t0} + %} + \end{subfigure} +% \hfill + \\[3ex] + \begin{subfigure}[b]{1\textwidth} +% \subfigure[$t_0 = 1$]{ + %\includegraphics[width = 0.95\textwidth]{figure/vplot_pmb_t1_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_pmb_t1_c3_Q50} + \caption{$t_0 = 1$} + \label{fig:sim2t1} + %} + \end{subfigure} + \caption{\label{fig:sim2} + Comparison of the \code{smooth} and \code{iterative} estimators with \code{se = "pmb"} + under 30\% censoring and $\tau = 0.5$.} +\end{figure*} + + + +\begin{table} + \caption{\label{tab:time} Runtimes (in seconds) when \code{se = fmb} and \code{se = pmb}.} + \centering + \begin{tabular}[t]{llrrrrrrrr} + \toprule + \multicolumn{2}{c}{} & \multicolumn{3}{c}{$t_0 = 0$} & \multicolumn{3}{c}{$t_0 = 1$} \\ + \cmidrule(l{3pt}r{3pt}){1-2}\cmidrule(l{3pt}r{3pt}){3-5} \cmidrule(l{3pt}r{3pt}){6-8} + se & method & 200 & 400 & 1000 & 200 & 400 & 1000\\ + \midrule + % \multirow{3}{*}{fmb} + \code{fmb} & Smooth & 0.103 & 0.174 & 0.471 & 0.106 & 0.178 & 0.480\\ + & Nonsmooth & 0.080 & 0.142 & 0.472 & 0.080 & 0.141 & 0.468\\ + & Iterative & 0.981 & 1.500 & 2.410 & 0.985 & 1.567 & 2.882\\ + [2ex] + \code{pmb} & Smooth & 0.022 & 0.052 & 0.223 & 0.022 & 0.053 & 0.224\\ + & Iterative & 0.296 & 0.580 & 1.407 & 0.296 & 0.581 & 1.435\\ + \bottomrule + \end{tabular} +\end{table} +% \input{codes/simTime} + +After confirming the satisfactory performance of the proposed methodologies, +we now proceed to illustrate the application of the \code{init} argument. +This argument controls the initial values assigned to the root-finding algorithm's estimates and +the plotting capacity of the \CRANpkg{qris} package. 
+For this illustrative example, we consider a simpler simulation scenario that involves a single binary covariate. +This simplified simulation can be generated using the revised version of the \code{data.gen()} function provided below. + +\begin{example} + > ## Global parameters + + rho0 <- .2 * sqrt(log(2)) + + rho1 <- .1 * sqrt(log(2)) + > data.gen <- function(n) { + + dat <- data.frame(censoring = runif(n, 0, 23.41), + + Time0 = sqrt(-log(1 - runif(n))), + + X = rbinom(n, 1, .5)) + + dat$Time0 <- ifelse(dat$X > 0, dat$Time0 / rho1, dat$Time0 / rho0) + + dat$Time <- pmin(dat$Time0, dat$censoring) + + dat$status <- 1 * (dat$Time0 < dat$censoring) + + subset(dat, select = c(Time, status, X)) + + } + > set.seed(10) + > head(dat <- data.gen(200)) + Time status X + 1 6.034713 1 1 + 2 7.181451 0 1 + 3 9.993908 0 1 + 4 16.225520 0 1 + 5 1.993033 0 1 + 6 5.277471 0 0 +\end{example} + +The updated \code{data.gen()} function returns a \code{data.frame} comprising three variables: +\code{Time}, \code{status}, and \code{X}, representing the +observed survival time, event indicator, and binary covariate, respectively. +We will first illustrate the usage of the argument \code{init} by considering three different initial values: +\code{init = "rq"}, \code{init = c(1,1)}, and a random vector \code{init = rnorm(2)}, +all used in conjunction with the smooth estimator \code{method = "smooth"}. +The following codes provide an example with different initial values. 
+\begin{example} + > (random <- rnorm(2)) + [1] 1.5025446 0.5904095 + > f1 <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, init = "rq", nB = 0) + > f2 <- update(f1, init = c(1, 1)) + > f3 <- update(f1, init = random) + > all.equal(f1$coef, f2$coef) + [1] TRUE + > all.equal(f2$coef, f3$coef) + [1] TRUE +\end{example} + +The \class{qris} object, with its \code{call} component, +is compatible with the \code{update()} function, +a built-in function commonly used for updating the +attributes of an existing object without requiring redundant and repetitive code. +In the example above, we used the \code{update()} function to modify the initial value specification in \code{f1}. +We observed that different initial values yield identical point estimates, thereby affirming the robustness of the proposed method against fluctuations in initial values. + +The covariate effects, along with their associated 95\% point-wise confidence intervals across +various quantiles or base times, can be visually assessed by applying the generic function +\code{plot()} to a \class{qris} object. +We demonstrate this feature using the following \code{qris} fit, +where the standard errors are obtained using \code{se = "pmb"}, $t_0 = 1$, +and all other parameters are set to their default values. +We update the \code{qris} fit with extended quantiles over $\{0.4, 0.5, 0.6, 0.7\}$ and +plot the covariate effects against these quantiles using the \code{plot()} function. +\begin{example} + > fit <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, se = "pmb") + > fit2 <- qris.extend(fit, Qs = 4:7 / 10) +\end{example} +The extended \class{qris} fit generated by the \code{qris.extend()} function inherits +all the attributes from the original \class{qris} object and +includes additional \code{ggdat} components. +The following code compares the components of the returned values from the extended \class{qris} fit +and the original \class{qris} fit. 
+\begin{example} + > class(fit2) + [1] "qris" + > names(fit) + [1] "call" "coefficient" "data" "formula" "para" + [6] "stderr" "varNames" "vcov" + > setdiff(names(fit2), names(fit)) + [1] "ggdat" +\end{example} +Specifically, the extended \class{qris} fit inherits +\code{call}, \code{coefficient}, \code{para}, \code{stderr}, \code{varNames}, and \code{vcov} +from the original \class{qris} object. +The \code{call} component is the function call from the original \code{qris()} fit, +while \code{coefficient}, \code{stderr}, and \code{vcov} are used to store the point estimates, +standard error estimates, and covariance matrix, respectively. +The \code{para} component is a list containing the parameters specified during the +fitting of the quantile regression model, and \code{varNames} is a character string +representing the variable names in the function call. +The newly added value is \code{ggdat}. +The \code{ggdat} is a data frame containing covariate information generated under +the different quantiles and base times specified in the \code{qris.extend()}. +Finally, the corresponding covariate effect plot can be generated by plotting the +extended \class{qris} fit as follows. +\begin{example} + > plot(fit2) +\end{example} + +The true values of $\beta$'s at different quantiles and base times, +computed from Equation~\eqref{eq:sim:weibull}, can be implemented in the following commands. 
+\begin{example} + > ## Global parameters + > r <- 2:1 * sqrt(log(2)) / 10 + > k <- 2 + > ## Function to calculate true beta + > trueB <- function(t0, tau) { + + b <- log(1 / r * ((r * t0) ^ k - log(1 - tau))^(1 / k) - t0) + + c(b[1], b[2] - b[1]) + + } + > ## True beta calculation + > true_Q <- c(t(sapply(4:7 / 10, trueB, t0 = 1))) + > true_t0 <- c(t(sapply(1:3, trueB, tau = .5))) +\end{example} +% \input{codes/trueB} + +The following code extends the \class{ggplot} objects generated by \code{plot.qris()} +by adding additional layers of true value curves and incorporating various \code{ggplot} options. +The resulting figures, Figure~\ref{fig:simulation_quantile} and Figure~\ref{fig:simulation_t0}, +present the output based on whether the covariate effects are plotted against quantiles or base times, respectively. +This observed trend aligns with the specifications described in Equation~\eqref{eq:sim:weibull}, +where increasing $\tau$ corresponds to an increasing $\beta_0$ while keeping $\rho$ and $X$ fixed. +On the other hand, the covariate effect does not change with quantiles but slightly increases with base times, +echoing the model specification where $\beta_0$ is inversely related to $t_0$ and $\beta_1$ +%is directly proportional to $t_0$. +increases as $t_0$ increases. 
+ +\begin{example} + > library(ggplot2) + > plot(fit2) + theme(legend.position = "bottom") + + + geom_line(aes(x = Qs, y = true_Q, col = variable, linetype = "True value")) + + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) + > b <- plot(fit2, t0s = 1:3, byQs = F) + > b + theme(legend.position = "bottom") + + + geom_line(aes(x = t0s, y = true_t0, col = variable, + + linetype = "True value")) + + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) +\end{example} +% \input{codes/plot1} + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Plot for $Q\in\{0.4, \ldots, 0.7\}$ at $t_0 = 1$.]{ + \includegraphics[width = 1.0\textwidth]{simulation_smooth_quantile.png} + \caption{Plot for $Q\in\{0.4, \ldots, 0.7\}$ at $t_0 = 1$} + \label{fig:simulation_quantile} +% } + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Plot for $t_0\in\{1, \ldots, 3\}$ at $Q = 0.5$.]{ + \includegraphics[width = 1.0\textwidth]{simulation_smooth_t0.png} + \caption{Plot for $t_0\in\{1, \ldots, 3\}$ at $Q = 0.5$} + \label{fig:simulation_t0} +% } + \end{subfigure} + \caption{(a) Estimated effects of covariate with the associated $95\%$ pointwise confidence intervals for quantiles ranging from 0.4 to 0.7 at $t_0=1$. Red and blue solid lines are the point estimates of regression parameters for intercept and covariate X, respectively. Similarly, red and blue dotted lines are the upper and lower bounds of $95\%$ pointwise confidence intervals for intercept and covariate X, respectively. + (b) Estimated effects of covariate with the associated $95\%$ pointwise confidence intervals for base times ranging from 1 to 3 at $\tau=0.5$. Red and blue solid lines are the point estimates of regression parameters for intercept and covariate X, respectively. 
Similarly, red and blue dotted lines are the upper and lower bounds of $95\%$ pointwise confidence intervals for intercept and covariate X, respectively.} + \label{fig:simulation} +\end{figure*} + + +\subsection{North Central Cancer Treatment Group Lung Cancer Data} \label{subsec:lung} + +The North Central Cancer Treatment Group Lung Cancer Data records the survival of patients with advanced lung cancer, +along with assessments of the patients' performance status measured by both physicians and the patients themselves +\citep{loprinzi1994prospective}. +The original objective of the study was to ascertain whether descriptive information from a +patient-completed questionnaire could offer prognostic insights. +However, for this illustration, we focus on how gender and weight loss affect the quantiles of residual life +for patients diagnosed with advanced lung cancer at different time points. +The lung cancer data are publicly available from the \CRANpkg{survival} package \citep{survivalpackage} as \code{lung}. +The following code displays the structure of the \code{lung} dataset with variables of interest. + +\begin{example} + > data(cancer, package = "survival") + > str(subset(lung, select = c(time, status, sex, wt.loss))) + 'data.frame': 228 obs. of 4 variables: + $ time : num 306 455 1010 210 883 ... + $ status : num 2 2 1 2 2 1 2 2 2 2 ... + $ sex : num 1 1 1 1 1 1 2 2 1 1 ... + $ wt.loss: num NA 15 15 11 0 0 10 1 16 34 ... +\end{example} +% \input{codes/cancer0} + +The \code{lung} data contains 228 patients whose observed survival times in days and +censoring status (1 = censored, 2 = dead) are recorded in the \code{time} and the \code{status} columns, +respectively. 
+Although the censoring status in this dataset is not recorded in the typical 0-1 fashion, +the \code{Surv()} function is still applicable to create the corresponding ``\code{Surv}" object. +The \code{lung} data yields a censoring rate of $27.6\%$ with a median survival time of 310 days. +The covariates of interest are gender (\code{sex = 1} if male, \code{sex = 2} if female) and +weight loss (\code{wt.loss}). +In the following, we use the proposed semiparametric quantile regression models to assess +the gender and standardized weight loss effects on different quantiles of residual life at different base times. + + +We first model the median residual life (\code{Q = 0.5}) when the base time is one month (\code{t0 = 30}). +Since the estimated median survival times for combined lung cancers are typically less than one year, +with a range of 8 to 13 months \citep{siegel2021cancer}, +setting the base time at one month provides insight into how gender and weight loss impact the residual time +in early follow-up. +In the following, we obtain the regression coefficient estimates using the induced smoothing functions and +the corresponding variance estimate with the partial multiplier bootstrap approach. + +\begin{example} + > lung$male <- factor(lung$sex, 1:2, c("Male", "Female")) + > lung$std.wt.loss <- scale(lung$wt.loss) + > fit1 <- qris(Surv(time, status) ~ male + std.wt.loss, + + data = lung, t0 = 30, Q = .5, nB = 100, + + method = "smooth", se = "pmb") + > summary(fit1) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5611 0.0950 58.550 <2e-16 *** + maleFemale 0.4804 0.1805 2.661 0.0078 ** + std.wt.loss -0.0731 0.0837 -0.874 0.3824 + --- + Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer1} + +Subjects with missing values (in any of the variables relevant for the modeling task) +are automatically removed when \code{qris()} is called. +The estimated intercept implies that the median residual life for patients who have survived up to 30 days +is $\exp(5.5611) = 260.1$ days for a male with an average weight loss. +More interestingly, the summary shows that the gender effect is statistically significant at the 0.05 significance level, +indicating that a female patient is expected to have a median residual life at 30 days that is $\exp(0.4804) = 1.617$ +times that of a male patient with the same weight loss. +The effect of the weight loss is not statistically significant at the 0.05 level. +In addition to \code{summary()}, important statistics such as the coefficient and variance estimates can be extracted by +\code{S3} methods \code{coef()} and \code{vcov()}, respectively. + +\begin{example} + > coef(fit1) + (Intercept) maleFemale std.wt.loss + 5.56111984 0.48044228 -0.07307635 + > vcov(fit1) + (Intercept) maleFemale std.wt.loss + (Intercept) 0.009021459 -0.010944549 -0.003074041 + maleFemale -0.010944549 0.032594288 0.002847148 + std.wt.loss -0.003074041 0.002847148 0.006998314 +\end{example} +% \input{codes/s3fit1} +Moreover, the corresponding 95\% Wald-type confidence interval can be printed by applying +the \code{confint()} function to the \class{qris} object. +\begin{example} + > confint(fit1) + 2.5 % 97.5 % + (Intercept) 5.3749598 5.74727989 + maleFemale 0.1265926 0.83429199 + std.wt.loss -0.2370390 0.09088626 +\end{example} +% \input{codes/cifit1} + +The \code{update()} function can be conveniently applied to update existing \class{qris} objects. +The following examples update the \code{method} and \code{se} arguments from \code{fit1}. 
+The updated results yield similar coefficient estimates, but the non-smooth procedure (\code{method = "nonsmooth"}) +yields slightly greater standard error estimates. +\begin{example} + > summary(fit2 <- update(fit1, method = "nonsmooth", se = "fmb")) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "nonsmooth", + se = "fmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5585 0.1132 49.106 <2e-16 *** + maleFemale 0.4695 0.2015 2.331 0.0198 * + std.wt.loss -0.0668 0.1029 -0.650 0.5159 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer2} + +\begin{example} + > summary(update(fit1, method = "iterative")) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "iterative", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5605 0.1016 54.712 <2e-16 *** + maleFemale 0.4807 0.1626 2.957 0.0031 ** + std.wt.loss -0.0720 0.0903 -0.797 0.4252 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer3} + +At a lower (\code{Q = 0.25}) and a higher (\code{Q = 0.75}) quantile, +the gender effect remains significant at the 0.05 significance level indicating +female patients are associated with longer lower-quantile and higher-quantile residual life +than male patients with the same weight loss. +Among these models, we observed that female patients tend to have higher coefficient estimates +when fitting higher-quantile residual life. +While the sign of the estimated regression coefficient for weight loss changes to a negative value +when considering the lower quantile, the effects remain statistically insignificant for +both the lower and higher quantiles. 
+ + +\begin{example} + > summary(update(fit1, Q = 0.25)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.25, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 4.9111 0.1034 47.480 <2e-16 *** + maleFemale 0.4651 0.2041 2.279 0.0227 * + std.wt.loss 0.0543 0.0584 0.930 0.3525 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer4} +\begin{example} + > summary(update(fit1, Q = 0.75)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.75, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 6.0748 0.1063 57.126 <2e-16 *** + maleFemale 0.5237 0.1487 3.522 0.0004 *** + std.wt.loss -0.0171 0.1166 -0.147 0.8835 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer5} + +We also consider the base time at six months \code{t0 = 180}, +which enables us to assess gender and weight loss effects in median residual time at a moderate length of follow-up. +The estimated effect for the gender and weight loss increases as $t_0$ increases from $30$ days to $180$ days and +becomes significant at the 0.05 significance level. +Additionally, the effect of the weight loss seems to be associated with a shorter survival time after +$180$ days, with a $p$-value of $0.0008$. + +\begin{example} + > summary(update(fit1, t0 = 180)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 180, Q = 0.5, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.2243 0.0912 57.255 <2e-16 *** + maleFemale 0.5821 0.1867 3.117 0.0018 ** + std.wt.loss -0.2515 0.0754 -3.337 0.0008 *** + --- + Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer6} + +The \class{qris} object is designed to be compatible with \code{S3} methods: \code{predict()} and +\code{residuals()} functions. +The following presents the fitted survival times for two hypothetical male and female patients with no weight loss, +as well as the first five residual values for the dataset. + \begin{example} + > lung.new <- data.frame(male = c("Male", "Female"), std.wt.loss = 0) + > predict(fit2, newdata = lung.new) + 1 2 + 444.9026 289.4422 + > head(residuals(fit2), 5) + 1 2 3 4 5 + -20.86127 -575.86127 232.44474 -416.82295 -555.82295 + \end{example} + +To better understand the covariate effects on different quantiles of residual time and across different base times, +we plot the estimated regression coefficients of the intercept, sex, and weight loss in \code{fit1} and \code{fit2}. +Figures~\ref{fig:realdata_smooth} and~\ref{fig:realdata_nonsmooth} display the estimated regression coefficients when +\code{method = "smooth"} and \code{method = "nonsmooth"}, respectively, at +different quantiles ranging from 0.2 to 0.5 at $t_0 = 30$ days. +The \code{plot.qris()} function is currently not available for the iterative estimator. This is mainly due to an extended computation time involved, as indicated by our simulation results, and the nature of plotting that necessitates computations across various quantiles or base times. +As expected, the two plots show very similar patterns. +We plot the estimated regression coefficients of the intercept, sex, and weight loss for different quantiles in the range +of 0.2 to 0.5 at $t_0= 50$, 60, 70, and 80 days (Figure~\ref{fig:realdata_multi_quantile}), +as well as for different base times in the range of 50 to 80 days at $\tau=0.2$, 0.3, 0.4, and 0.5 (Figure~\ref{fig:realdata_multi_basetime}). +The estimation method used is non-iterative induced smoothed estimation (\code{method = "smooth"}). 
+In Figure~\ref{fig:realdata_multi_quantile}, +the estimated intercept increases as the quantile increases (for a given base time). +The estimated slopes for sex remain largely the same, +but those for weight loss tend to decrease slightly across different quantiles (for a given base time). +These patterns remain consistent for different base times. +In Figure~\ref{fig:realdata_multi_basetime}, the estimated intercepts increase as the quantiles increase, +but with a given quantile, they remain flat across the different base times considered. +The estimated regression coefficients for the two covariates do not appear to change significantly +for different base times. + +\begin{example} + > hide <- theme(legend.position = "none") + > plot(fit1, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) + > plot(fit2, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) + > plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = TRUE, ggextra = hide) + > plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = FALSE, ggextra = hide) +\end{example} +% \input{codes/plotcancer1} + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[\code{method = ''smooth''} and \code{se = ''pmb''}]{ + \includegraphics[width = 1.0\textwidth]{realdata_smooth_quantile.png} + \caption{\code{method = ''smooth''} and \code{se = ''pmb''}} + \label{fig:realdata_smooth} +% } + \end{subfigure} +% \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[\code{method = ''nonsmooth''} and \code{se = ''fmb''}]{ + \includegraphics[width = 1.0\textwidth]{realdata_nonsmooth_quantile.png} + \caption{\code{method = ''nonsmooth''} and \code{se = ''fmb''}} + \label{fig:realdata_nonsmooth} +% } + \end{subfigure} + \\[2ex] + \begin{subfigure}[b]{0.47\linewidth} +%\subfigure[Multiple covariate effect plot against quantiles]{ + \includegraphics[width = 1.0\textwidth]{realdata_multi_quantile.png} + \caption{Multiple covariate effect plot against quantiles} + \label{fig:realdata_multi_quantile} +% } + 
\end{subfigure} +% \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Multiple covariate effect plot against base time]{ + \includegraphics[width = 1.0\textwidth]{realdata_multi_basetime.png} + \caption{Multiple covariate effect plot against base time} + \label{fig:realdata_multi_basetime} +% } + \end{subfigure} + \caption{Green, red and blue lines are the point estimates of regression parameters for + intercept, covariate sex and covariate weight loss, respectively. Solid line and dotted line are the point estimates and the upper and lower bounds of $95\%$ pointwise confidence intervals for each regression coefficient. + (a) \code{method = "smooth"} and \code{se = "pmb"} ($\tau = 0.2, 0.3, 0.4, 0.5, t_0=30$) + (b) \code{method = "nonsmooth"} and \code{se = "fmb"} ($\tau = 0.2, 0.3, 0.4, 0.5, t_0=30$) + (c) \code{method = "smooth"} and \code{se = "pmb"} against quantiles ($\tau = 0.2, 0.3, 0.4, 0.5, t_0 = 50, 60, 70, 80$) + (d) \code{method = "smooth"} and \code{se = "pmb"} against base times ($\tau = 0.2, 0.3, 0.4, 0.5, t_0 = 50, 60, 70, 80$)} + \label{fig:realdata} +\end{figure*} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Conclusion %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Conclusion} \label{sec:conclusion} + +The purpose of the \CRANpkg{qris} package is to provide a comprehensive tool for fitting quantile regression models on residual life for right-censored survival data, with the aim of promoting widespread dissemination and utilization. +This package implements one estimation method based on non-smooth estimating functions and two estimation methods +based on their induced smoothed versions. +The non-smooth estimator is calculated through $L_{1}$-type minimization while incorporating the IPCW technique, +and its variance is calculated using full multiplier bootstrapping. 
+The first type of the induced smoothed estimator, a non-iterative version, directly solves estimating functions, +and its variance can be calculated using either the full multiplier bootstrapping or +the robust sandwich form with partial multiplier bootstrapping. +As evidenced by the simulation results, this enables one to substantially reduce computing times without sacrificing +estimation accuracy and stability compared to the original non-smooth function-based method. +The iterative smoothed estimator has an advantage in obtaining more precise estimates than its non-iterative version, +although it requires longer computing times. For all these methods, estimates of the regression coefficients and their +variances can be calculated at user-defined quantiles and base times, as long as they are identifiable. +Additionally, the package provides features for plotting estimates with associated 95\% confidence intervals against +quantiles and base times using the generic \code{plot} function. +These plots visualize patterns of estimates at different quantiles and base times, +helping users to easily grasp the overall picture. +The package \CRANpkg{qris} and its included functions are verified through illustrations using simulated data +with interpretation of the results demonstrated through a real data application. + + +Some possible directions for extending our package are as follows. +Efforts can be made to reduce the computational burden associated with variance estimation, +which currently accounts for a significant portion of the computing time. +In particular, the iterative-induced smoothed method employs the partial multiplier bootstrap method +to calculate variance estimates in each iteration. +Since this method requires multiple iterations, it is crucial to explore more computationally efficient variance +estimation procedures for each iteration to reduce the currently relatively longer computation time. 
+One approach is to utilize a closed-form estimation of the mid-part of the sandwich-type variance, +as discussed in \citet{chiou2014fast, choi2018smoothed}. +Implementing this direct variance estimation in each iteration is expected to further enhance computation efficiency. +Another direction is to generalize the approaches to allow for the inclusion of sampling weights, +which is useful for bias correction when failure time data are generated from non-random sampling designs, +such as case-cohort designs \citep{prentice1986case, chiou2015semiparametric}. +% To obtain valid parameter estimates under such study designs, the incorporation of sampling weights is a standard approach. +The current estimating functions implemented in the \CRANpkg{qris} package assume that the data are randomly sampled, +with sampling weights set to 1. +% Incorporation of sampling weights that can accommodate unequal probabilities of being sampled in the \code{qris} function is a natural direction of future extension. +To the best of our knowledge, there is a lack of model-checking procedures and model-comparison methods +specifically designed for the non-smooth estimator, +and a logical next step would be to develop these procedures for subsequent integration into the package. + + +\bibliography{2022-185_R3} + +\address{Kyu Hyun Kim\\ + Department of Statistics and Data Science \emph{and} Department of Applied Statistics\\ + Yonsei University\\ + 50 Yonsei-ro, Seodaemun-gu, Seoul\\ + Republic of Korea\\ + \email{kyuhyunkim07@yonsei.ac.kr}} + +\address{Sangwook Kang\\ + Department of Statistics and Data Science \emph{and} Department of Applied Statistics\\ + Yonsei University\\ + 50 Yonsei-ro, Seodaemun-gu, Seoul\\ + Republic of Korea\\ + \email{kanggi1@yonsei.ac.kr}} + +\address{Sy Han Chiou\\ + Department of Statistics and Data Science\\ + Southern Methodist University\\ + P.O. 
Box 750332, Dallas, TX\\ USA\\ + \email{schiou@smu.edu}\\ + \url{https://www.sychiou.com/}} + +\end{article} +\end{document} diff --git a/_articles/RJ-2024-007/RJ-2024-007.Rmd b/_articles/RJ-2024-007/RJ-2024-007.Rmd new file mode 100644 index 0000000000..74e0c87ff1 --- /dev/null +++ b/_articles/RJ-2024-007/RJ-2024-007.Rmd @@ -0,0 +1,1307 @@ +--- +title: Fitting a Quantile Regression Model for Residual Life with the R Package qris +abstract: | + In survival analysis, regression modeling has traditionally focused on + assessing covariate effects on survival times, which is defined as the + elapsed time between a baseline and event time. Nevertheless, focusing + on residual life can provide a more dynamic assessment of covariate + effects, as it offers more updated information at specific time points + between the baseline and event occurrence. Statistical methods for + fitting quantile regression models have recently been proposed, + providing favorable alternatives to modeling the mean of residual + lifetimes. Despite these progresses, the lack of computer software + that implements these methods remains an obstacle for researchers + analyzing data in practice. In this paper, we introduce an R package + [**qris**](https://CRAN.R-project.org/package=qris) [@R:qris], which + implements methods for fitting semiparametric quantile regression + models on residual life subject to right censoring. We demonstrate the + effectiveness and versatility of this package through comprehensive + simulation studies and a real-world data example, showcasing its + valuable contributions to survival analysis research. 
+author: +- name: Kyu Hyun Kim + affiliation: |- + Department of Statistics and Data Science *and* Department of Applied + Statistics + address: + - Yonsei University + - 50 Yonsei-ro, Seodaemun-gu, Seoul + - Republic of Korea + - | + [kyuhyunkim07@yonsei.ac.kr](kyuhyunkim07@yonsei.ac.kr){.uri} +- name: Sangwook Kang + affiliation: |- + Department of Statistics and Data Science *and* Department of Applied + Statistics + address: + - Yonsei University + - 50 Yonsei-ro, Seodaemun-gu, Seoul + - Republic of Korea + - | + [kanggi1@yonsei.ac.kr](kanggi1@yonsei.ac.kr){.uri} +- name: Sy Han Chiou + affiliation: Department of Statistics and Data Science + address: + - Southern Methodist University + - P.O. Box 750332, Dallas, TX + - USA + - '[schiou@smu.edu](schiou@smu.edu){.uri}' + - | + +date: '2025-01-10' +date_received: '2022-10-21' +journal: + firstpage: 114 + lastpage: 134 +volume: 16 +issue: 1 +slug: RJ-2024-007 +citation_url: https://rjournal.github.io/ +packages: + cran: + - qris + - quantreg + - aftgee + - ctqr + - Brq + - brms + - cmprskQR + - ggplot2 + - Rcpp + - RcppArmadillo + - survival + bioc: [] +preview: preview.png +bibliography: 2022-185_R3.bib +CTV: ~ +legacy_pdf: yes +legacy_converted: yes +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js + md_extension: -tex_math_single_backslash +draft: no + +--- + + +:::::::::::::::::::::::::::::::: article +## Introduction {#sec:intro} + +In the analysis of time-to-event data, standard statistical inference +procedures often focus on quantities based on failure time and its +relationship with covariates measured at baseline. However, throughout +the follow-up process, inference procedures based on residual life +become increasingly intuitive for assessing the survival of subjects and +can offer insights into the effectiveness of treatments in prolonging +the remaining lifetime. 
As covariates can substantially change over time +and models based solely on baseline covariates have limited potential +for long-term prognosis, there is a growing interest in modeling the +remaining lifetime of a surviving subject with updated patient +information. Many efforts have been made to model the mean residual life +including proportional mean residual life models +[@maguluri1994estimation; @oakes1990note; @oakes2003inference; @chen2005semiparametric], +additive mean residual life models +[@chen2006linear; @chen2007additive; @zhang2010goodness], and +proportional scaled mean residual life models [@liu2008regression]. +Given that failure times are usually right-skewed and heavy-tailed, the +mean of the residual life might not be identifiable if the follow-up +time is not sufficiently long. For this reason, quantiles, which are +robust under skewed distribution, have traditionally been used more +frequently as alternative summary measures. For example, the approach on +the semiparametric quantile regression model for continuous responses +[@koenker1978regression] has been extended to uncensored failure time +data [@jung1996quasi; @portnoy1997gaussian; @wei2006quantile] and +censored failure times data +[@ying1995survival; @portnoy2003censored; @peng2008survival; @huang2010quantile]. + +When the outcome variable is the residual life, semiparametric quantile +models that apply the inverse probability of censoring weighting (IPCW) +principle to address right-censored observations have been explored +[@jung2009regression; @kim2012censored; @li2016quantile]. These +approaches are based on non-smooth estimating functions with respect to +regression parameters, and the estimates of the regression parameters +are obtained either through zero-crossing of non-smooth estimating +functions using grid search techniques [@jung2009regression] or by +optimizing non-smooth objective functions with $L_1$-minimization +algorithms [@kim2012censored; @li2016quantile]. 
While these methods are +relatively straightforward to implement, an additional challenge lies in +standard error estimation, which necessitates the computationally +intensive use of a multiplier bootstrap method [@li2016quantile]. +Alternatively, @jung2009regression and @kim2012censored utilized the +minimum dispersion statistic and the empirical likelihood method, +respectively, to bypass the need to directly estimate the variance of +the regression parameter estimator for hypothesis testing and +constructing confidence intervals. The non-smooth nature of the +estimating functions in these approaches precludes the estimation of +variance using the robust sandwich-type variance estimator typically +employed in equation-based estimation methods. To lessen the associated +computational burden, an induced smoothing was proposed +[@brown2005standard], which modifies the non-smooth estimating equations +into smooth ones. Leveraging the asymptotic normality of the non-smooth +estimator, the smooth estimating functions are constructed by averaging +out the random perturbations inherent in the non-smooth estimating +functions. The resulting estimating functions become smooth with respect +to the regression parameters, allowing for the straightforward +application of standard numerical algorithms, such as the Newton-Raphson +method. Furthermore, these smoothed estimating functions facilitate the +straightforward computation of variances using the robust sandwich-type +estimator. The induced smoothing approach has been employed in fitting +semiparametric accelerated failure time (AFT) models via the rank-based +approach +[@johnson2009induced; @aftgeepackage; @chiou2015semiparametric; @Kang:fitt:2016]. +Regarding quantile regression, @choi2018smoothed considered the induced +smoothing approach under a competing-risks setting. All of these methods +are based on modeling event times. 
Recently, @kim2023smoothed proposed +an induced smoothing estimator for fitting a semiparametric quantile +regression model for residual life. + +The availability of published R packages for fitting quantile regression +models is somewhat limited. The `rq()`, `nlrq()`, `rqss()`, and `crq()` +functions in the package +[**quantreg**](https://CRAN.R-project.org/package=quantreg) +[@quantregpackage] are predominantly used and provide various features +for fitting linear, nonlinear, non-parametric, and censored quantile +regression models, respectively. The `rq()` function minimizes +non-smooth objective functions to obtain point estimates of regression +coefficients and can accommodate right-censored survival times by +incorporating weights. By redefining survival times as the remaining +lifetime at time $t_0$, one can also obtain a non-smoothed estimator for +quantile regression models for residual life [@kim2012censored]. On the +other hand, the `nlrq()` function is designed to fit a nonlinear +quantile regression model, while the `rqss()` function fits additive +quantile regression models with nonparametric terms, including +univariate components and bivariate components, using smoothing splines +and total variation regularization techniques +[@koenker1994quantile; @koenker2004penalized]. Furthermore, the `crq()` +function fits a quantile regression model for censored data on the +$\tau$-th conditional quantile function of the response variable. +Overall, the [**quantreg**](https://CRAN.R-project.org/package=quantreg) +implements three methods for handling right-censored survival times: +@powell1986censored's estimator, @portnoy2003censored's estimator and +@peng2008survival's estimator. However, none of the implemented methods +in the `nlrq()`, `rqss()`, or `crq()` functions are applicable for +handling censored residual life using the induced smoothing methods. 
The +only function that implements the induced smoothing method is the +`aftsrr()` function in the package +[**aftgee**](https://CRAN.R-project.org/package=aftgee) +[@aftgeepackage], but it is specifically designed for fitting +semiparametric AFT models, which are not directly applicable to fitting +quantile regression models. + +Other R packages that can be used to fit quantile regression models for +survival data include the package +[**ctqr**](https://CRAN.R-project.org/package=ctqr) [@ctqrpackage], +package [**Brq**](https://CRAN.R-project.org/package=Brq) [@Brqpackage], +package [**brms**](https://CRAN.R-project.org/package=brms) +[@brmspackage], and package +[**cmprskQR**](https://CRAN.R-project.org/package=cmprskQR) +[@cmprskQRpackage]. The `ctqr()` function in the package +[**ctqr**](https://CRAN.R-project.org/package=ctqr) implements the +methods proposed in @ctqrpackage for right or interval-censored failure +times with left-truncation. The `Bqr()` function in the package +[**Brq**](https://CRAN.R-project.org/package=Brq) implements Bayesian +methods based on the asymmetric Laplace distribution. In the package +[**brms**](https://CRAN.R-project.org/package=brms), the `brm()` +function with the `family=asym_laplace()` option enables the +implementation of full Bayesian inference. The `crrQR()` function in the +package [**cmprskQR**](https://CRAN.R-project.org/package=cmprskQR) +allows fitting quantile regression models with competing risks. All of +these R packages are designed for fitting quantile regression models for +failure times defined from a baseline and are not applicable to the +residual life setting. + +The recently developed R package +[**qris**](https://CRAN.R-project.org/package=qris) [@R:qris] provides +an efficient tool for fitting semiparametric quantile regression models +for residual life subject to right censoring. 
The +[**qris**](https://CRAN.R-project.org/package=qris) package offers three +methods for estimating the regression parameters: $L_1$-minimization of +non-smooth objective functions, induced smoothing with a non-iterative +approach, and an iterative procedure. For standard error estimation, the +[**qris**](https://CRAN.R-project.org/package=qris) package provides two +resampling-based approaches: the partial multiplier bootstrap and the +full multiplier bootstrap methods. The partial multiplier bootstrap +method utilizes the robust sandwich-type estimator by incorporating the +sample variance of perturbed estimating functions, while the full +multiplier bootstrap method is obtained by considering the sample +variance from the solutions of perturbed estimating functions. To +enhance the interpretability of results, the +[**qris**](https://CRAN.R-project.org/package=qris) package incorporates +graphical visualizations of covariate effects at different quantiles and +base times, utilizing the plotting environment similar to that in the +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) package +[@ggplot2package], thereby allowing for extensive flexibility and +customization. The ultimate goal of creating the +[**qris**](https://CRAN.R-project.org/package=qris) package is to +facilitate the easy incorporation of quantile regression for residual +life into daily routines. The package +[**qris**](https://CRAN.R-project.org/package=qris) is available on the +Comprehensive R Archive Network (CRAN) at +<https://CRAN.R-project.org/package=qris>. + +The rest of the article is organized as follows: Section [2](#sec:nsm) +introduces a semiparametric regression model for quantiles of residual +life and the estimation methods implemented in the package. +Section [3](#sec:implementation) provides details about computing +algorithms. Illustrations of the package using a simulated dataset and +the real data from the North Central Cancer Treatment Group are +presented in Section [4](#sec:illustration). 
Finally, in +Section [5](#sec:conclusion), concluding remarks are provided along with +some discussions. + +## Semiparametric quantile regression for residual life {#sec:nsm} + +Define $T$ as the potential failure time that is subject to right +censoring by $C$ and $\mathbf{X}$ as a $p \times 1$ vector of +covariates, where $p$ is the number of covariates, including an +intercept. The observed data consists of $n$ independent copies of +$(Z, \delta, \mathbf{X})$, where $Z = \min(T, C)$, +$\delta = I(T \leq C)$, and $I(\cdot)$ is an indicator function. We also +assume $T$ and $C$ are marginally independent. Define the $\tau$-th +quantile of the residual life at $t_0 > 0$ as $\theta_{\tau}(t_0)$ that +satisfies +$P(T_i - t_0 \geq \theta_{\tau}(t_0) \ | \ T_i > t_0) = 1 - \tau$. We +consider the semiparametric quantile regression model for the residual +life [@kim2012censored; @kim2023smoothed]. Given $T_i > t_0$, +$$\label{qr:mod1} + \log(T_i - t_0) = \mathbf{X}_{i}^{\top}\boldsymbol{\mathbf{\beta}}_0(\tau, t_0) + \epsilon_i, i = 1, \ldots, n, %\label{qr:mod2} (\#eq:qrmod1)$$ +where $\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)$ is a $p \times 1$ +vector of regression coefficients, and $\epsilon_i$ is a random error +having zero $\tau$-th quantile. The quantile regression model for a +continuous response [@koenker1978regression] is a special case of +Equation \@ref(eq:qrmod1) when $t_0 = 0$. For ease of notation, we omit +$\tau$ and $t_0$ in $\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)$ and +$\theta_{\tau}(t_0)$ and write $\boldsymbol{\mathbf{\beta}}_0$ and +$\theta$. We present different estimation procedures to estimate +$\boldsymbol{\mathbf{\beta}}_0$ given $\tau$ and $t_0$ in the following. 
+ +### Estimation using non-smooth functions {#sec:nsm:pt} + +When there is no censoring, an estimator for $\boldsymbol{\mathbf{\beta}}_0$ in +Equation \@ref(eq:qrmod1) can be obtained by solving the estimating +equation [@kim2012censored], where +$$\label{eq:ns:obj1} + \frac{1}{n}\sum_{i=1}^{n}I[T_i \ge t_0] \mathbf{X}_i \left\{I\left[\log(T_i - t_0) \leq \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right] - \tau \right\} = 0. (\#eq:nsobj1)$$ +However, Equation \@ref(eq:nsobj1) cannot be directly used when +$T_i - t_0$ is subject to right censoring. The IPCW technique can be +incorporated into Equation \@ref(eq:nsobj1) to account for the right +censoring [@li2016quantile]. Specifically, in the presence of right +censoring, the estimator for $\boldsymbol{\mathbf{\beta}}_0$ in +Equation \@ref(eq:qrmod1) can be obtained as the root of the following +weighted estimating equations: +$$\label{eq:nsm:ipw} + U_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = \frac{1}{n}\sum_{i=1}^{n}I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0)} -\tau \right\}, (\#eq:nsmipw)$$ +where $\widehat{G}(\cdot)$ is the Kaplan-Meier estimate of the survival +function $G(\cdot)$ of the censoring time $C$ and +$\widehat{G}(t) = \prod_{i: t_i \leq t} (1 - \sum_{j=1}^n (1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n I(Z_j \geq t_i))$. +A computational challenge arises because the exact solution to +Equation \@ref(eq:nsmipw) might not exist due to the non-smoothness in +$\boldsymbol{\mathbf{\beta}}$ caused by the involvement of indicator functions. 
When the exact +solutions do not exist, the root of Equation \@ref(eq:nsmipw) can be +approximated by minimizing the $L_1$-objective function +$L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau)$ [@li2016quantile], +$$\begin{aligned} + \label{l1:nsm} + \nonumber + L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/\widehat{G}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\beta \right| + \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}(Z_l)/\widehat{G}(t_0)}\right| + + \ \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l I[Z_l > t_0]\right|, +\end{aligned} (\#eq:l1nsm)$$ +where $M > 0$ bounds +$\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n - \mathbf{X}_i \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/ \widehat{G}(t_0)}\right|$ +and +$\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n 2\tau \mathbf{X}_i I[Z_i > t_0]\right|$ +from above. Numerically, the limit $M$ is set to be an extremely large +number, and the `qris()` function uses $M = 10^6$. Denote the resulting +estimator to be $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. It +has been shown that $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$ +is consistent for $\boldsymbol{\mathbf{\beta}}_0$ and asymptotically +normally distributed [@li2016quantile]. + +Despite the well-established asymptotic properties, directly estimating +the variance of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$ is +impractical because it involves the derivative of non-smooth functions. +A multiplier bootstrap method has typically been employed +[@li2016quantile] to address this difficulty. 
The multiplier bootstrap +method considers the perturbed version of $U_{t_0}(\beta, \tau)$, +defined as +$$\label{eq:nsm:rev} + U_{t_0}^{\ast}(\beta, \tau) = \frac{1}{n}\sum_{i=1}^{n} \eta_i I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} -\tau \right\}, (\#eq:nsmrev)$$ +where $\eta_i, i = 1, \ldots, n,$ are independently and identically +(iid) generated from a positive random variable with unity mean and +variance, and $\widehat{G}^\ast(\cdot)$ is a perturbed version of +$\widehat{G}(\cdot)$, constructed as $\widehat{G}^\ast(t) = +\prod_{i: t_i \leq t} (1 - \sum_{j=1}^n \eta_j(1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n \eta_jI(Z_j \geq t_i))$ +for a given realization of $\eta_i$. On the other hand, a perturbed +$L_1$-objective function, denoted as +$L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)$, can be similarly +constructed, where +$$\begin{aligned} + L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right| + \nonumber \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}^{\ast}(Z_l)/\widehat{G}^{\ast}(t_0)}\right| + + \ \left| M - \beta^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l \eta_l I[Z_l > t_0]\right|. +\end{aligned}$$ +Solving for $U_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = 0$, or +equivalently, minimizing +$L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)$, yields one +realization of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. The +multiplier bootstrap variance is computed as the sample variance of a +large number of realizations of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. 
+ +### Estimation using induced smoothed functions {#sec:IS:pt} + +The regression coefficient in Equation \@ref(eq:qrmod1) can be more +efficiently obtained through the induced smoothed version of +Equation \@ref(eq:nsmipw). The induced smoothed estimating functions are +constructed by taking the expectation with respect to a mean-zero random +noise added to the regression parameters in Equation \@ref(eq:nsmipw). +Specifically, +$$\begin{aligned} +\label{eq:is} + \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, H) & = E_w \{U_{t_0}(\boldsymbol{\mathbf{\beta}}+\mathbf{H}^{1/2}\mathbf{W}, \tau)\}\nonumber\\ + & = \frac{1}{n} \sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}}-\log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0) } -\tau \right\}, +\end{aligned} (\#eq:is)$$ +where $\mathbf{H} = O(n^{-1})$, $\mathbf{W} \sim N(0, \mathbf{I}_p)$ is +a standard normal random vector, $\mathbf{I}_p$ is the $p \times p$ +identity matrix, and $\Phi(\cdot)$ is the cumulative distribution +function of a standard normal random variable. A typical choice for +$\mathbf{H}$ is to fix it at $n^{-1}\mathbf{I}_p$, while some +alternative choices are explored in @chiou2015rank. Let +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ be the solution to +$\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = 0$. +Since Equation \@ref(eq:is) is a smooth function in +$\boldsymbol{\mathbf{\beta}}$, the estimator can be obtained using +standard numerical algorithms such as the Newton-Raphson method. +Moreover, the induced smoothed estimator for +$\boldsymbol{\mathbf{\beta}}_0$ has been shown to be asymptotically +equivalent to its non-smooth counterpart [@kim2023smoothed]. 
+ +Following the idea in Section [2.1](#sec:nsm:pt), the multiplier +bootstrap procedure can be similarly employed to estimate the variance +of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$. The perturbed +version of Equation \@ref(eq:is) takes the form of +$$\label{eq:7} + \widetilde{U}^{\ast}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = \frac{1}{n} \sum_{i=1}^{n} \eta_i I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\widehat{G}^{\ast}(t_0) \delta_i}{\widehat{G}^{\ast}(Z_i)} -\tau \right\}. (\#eq:7)$$ +The multiplier bootstrap procedure estimates the variance of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ by calculating the +sample variance of a large number of realizations of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ obtained by +repeatedly solving Equation \@ref(eq:7). + +It has been shown that the asymptotic variance +$\mathop{\rm Var}\nolimits(\boldsymbol{\mathbf{\beta}}, \tau)$ can be +decomposed into +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})^{\top} \mathbf{V}(\boldsymbol{\mathbf{\beta}}) \mathbf{A}(\boldsymbol{\mathbf{\beta}})$ +[@kim2023smoothed], where the two components, +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$ and +$\mathbf{V}(\boldsymbol{\mathbf{\beta}})$, can be estimated separately. +Since Equation \@ref(eq:is) is a smooth function in +$\boldsymbol{\mathbf{\beta}}$, the slope matrix, +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$, can be conveniently estimated +by differentiating +$\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})$ +with respect to $\boldsymbol{\mathbf{\beta}}$. 
The explicit form of +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$ is as follows: +$$\begin{aligned} + \label{eq:cov:slp} + \mathbf{A}(\boldsymbol{\mathbf{\beta}}) & = \frac{\partial \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})}{\partial \boldsymbol{\mathbf{\beta}}} \nonumber \\ + & = \frac{1}{n}\sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \frac{G(t_0) \delta_i}{G(Z_i)} \phi\left(\frac{{\mathbf{X}_i}^{\top}\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{{\mathbf{X}_i}^{\top}\mathbf{H} \mathbf{X}_i}}\right)\left(\frac{-{\mathbf{X}_i}}{\sqrt{{\mathbf{X}_i}^{\top} \mathbf{H} {\mathbf{X}_i}}}\right), +\end{aligned} (\#eq:covslp)$$ +where $\phi (\cdot)$ is the density function of the standard normal +random variable. + +The slope matrix, +$\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$, +can be evaluated directly by plugging in +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ and +$\widehat{G}(\cdot)$. On the other hand, the variance of the estimating +function, $\widehat{\mathbf{V}}(\boldsymbol{\mathbf{\beta}})$, can be +obtained by a computationally efficient resampling method motivated by +the multiplier bootstrap procedure in Section [2.1](#sec:nsm:pt). +Specifically, we propose estimating +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +as the simple variance of a large set of realizations of the perturbed +version of +$\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}, \tau, \mathbf{H})$ +presented in Equation \@ref(eq:7). We refer to this procedure as the +partial multiplier bootstrapping approach because it utilizes the +perturbed estimating function, similar to the full multiplier +bootstrapping approach, but the computation of +$\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +and +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +does not involve the repeated solving of the perturbed estimating +equations. 
Thus, the partial multiplier bootstrapping approach is +expected to be computationally more efficient than the multiplier +bootstrap method. A similar procedure and its performance have been +studied in modeling failure times with semiparametric AFT models +[@chiou2014fast; @aftgeepackage]. + +### Iterative procedure in induced smoothing estimation {#sec:iter} + +The induced estimator $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ +is obtained with a fixed $\mathbf{H}$, as described in +Section [2.2](#sec:IS:pt), and its variance is estimated separately. +This estimation procedure can be viewed as a special case of the +following iterative procedure, which updates $\mathbf{H}$ and +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ iteratively. +Specifically, the iterative algorithm utilizes the Newton-Raphson method +while sequentially updating +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ and +$\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +until convergence. Similar iterative algorithms have also been +considered previously in the induced smoothing approach for +semiparametric AFT models +[@johnson2009induced; @chiou2014fast; @chiou2015semiparametric; @choi2018smoothed]. +The iterative procedure is summarized as follows: + +**Step 1:** + +: Set the initial values + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}$, + $\widehat{\mathbf{\Sigma}}^{(0)} = \mathbf{I}_{p}$, and + $\mathbf{H}^{(0)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(0)}$. 
+ +**Step 2:** + +: Given $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and + $\mathbf{H}^{(k)}$ at the $k$-th step, update + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ by + $$\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}=\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)} - \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)})^{-1}{\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}, \tau, \mathbf{H}^{(k)}}).$$ + +**Step 3:** + +: Given $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}$ and + $\widehat{\mathbf{\Sigma}}^{(k)}$, update + $\widehat{\mathbf{\Sigma}}^{(k)}$ by + $$\widehat{\mathbf{\Sigma}}^{(k+1)} = \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1} \widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau) \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1}.$$ + +**Step 4:** + +: Set $\mathbf{H}^{(k+1)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(k+1)}$. + Repeat Steps 2, 3 and 4 until + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and + $\widehat{\mathbf{\Sigma}}^{(k)}$ converge. + +The initial value, $\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}$, could +be chosen as $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. We +define $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}$ and +$\widehat{\boldsymbol{\mathbf{\Sigma}}}_{\tiny IT}$ as the values of +$\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and +$\widehat{\mathbf{\Sigma}}^{(k)}$ at convergence, and +$\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}) = n^{-1}\widehat{\mathbf{\Sigma}}_{\tiny IT}$. +In Step 3, +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau)$ +is obtained using the partial multiplier bootstrap approach. However, +the full multiplier bootstrap approach can also be employed but would +require longer computation times. 
+ +## Package implementation {#sec:implementation} + +The main function in the +[**qris**](https://CRAN.R-project.org/package=qris) package for +estimating the regression parameters in the quantile regression model +for residual life is the `qris()` function. The `qris()` function is +written in C++ and incorporated into R using the +[**Rcpp**](https://CRAN.R-project.org/package=Rcpp) [@Rcpppackage] and +[**RcppArmadillo**](https://CRAN.R-project.org/package=RcppArmadillo) +[@RcppArmadillopackage] packages. The synopsis of `qris` is: + +```{r} +#| include: false +library(qris) +``` + +```{r} +args(qris) +``` + +The required argument is `formula`, which specifies the quantile +regression model to be fitted using the variables in `data`. The +`formula` assumes that the response variable is a '`Surv`' object +created by the `Surv()` function in the +[**survival**](https://CRAN.R-project.org/package=survival) package +[@survivalpackage]. This formula structure is commonly adopted for +handling survival data in R, as seen in functions like `survreg()` and +`coxph()` in the +[**survival**](https://CRAN.R-project.org/package=survival) package. The +argument `t0` specifies the base time used in defining residual life. +The default value of `t0` is set to zero, in which case residual life +reduces to a failure time. The `Q` argument is used to specify the +target quantile of residual life to estimate, with the default value +being set to 0.5 (median). The `nB` argument specifies the bootstrapping +size used in standard error estimation, with the default value set to +100. The `method` argument specifies one of the three estimation +methods: `"nonsmooth"`, `"smooth"`, and `"iterative"`, corresponding to +the estimating procedures outlined in Sections [2.1](#sec:nsm:pt), +[2.2](#sec:IS:pt), and [2.3](#sec:iter), respectively. 
Given the point +estimates of the regression parameters, their standard errors can be +estimated using one of two implemented methods: `se = "fmb"` and +`se = "pmb"`. The `se = "fmb"` method employs a full-multiplier +bootstrapping approach to estimate the variance by the sample variance +of large realizations of $\widehat\beta$. The `se = "pmb"` method +estimates the variance using a robust sandwich variance estimator and +employs the computationally efficient partial multiplier bootstrapping +approach described in Section [2.2](#sec:IS:pt). The `"fmb"` option is +available for all three point estimation methods, whereas the `"pmb"` +option is not available for the `"nonsmooth"` point estimation method +due to the lack of a closed-form sandwich variance estimator. The `init` +argument allows users to specify the initial value for estimating +regression parameters by either a $p$-dimensional numerical vector or a +character string. In the latter case, the options `init = "rq"` and +`init = "noeffect"` correspond to the point estimate obtained from the +`rq()` function in the +[**quantreg**](https://CRAN.R-project.org/package=quantreg) package and +a $p$-dimensional vector of zeros, respectively. The default value for +`init` is `init = "rq"`. Among the three methods implemented for point +estimation, `method = "smooth"` and `method = "nonsmooth"` are +non-iterative, in the sense that point estimation is performed +separately from the estimation of standard errors. On the other hand, +`method = "iterative"` calculates point estimates and the corresponding +standard error estimates simultaneously through iterative updates. When +`method = "iterative"`, users can define specific convergence criteria +using `qris.control()`. The available options in `qris.control()` are as +follows. + +```{r} +args(qris.control) +``` + +The `maxiter` argument specifies the maximum number of iterations. 
The +default value for `maxiter` is ten, as the proposed algorithm typically +converges within ten steps based on our exploration. The convergence +tolerance is controlled using the `tol` argument, which has a default +value of `1e-3`. The `trace` argument takes a logical value and is used +to determine whether to print the result for each iteration. The default +setting is `trace = FALSE`. The '`qris`' object is fully compatible with +many of R's generic functions, including `coef()`, `confint()`, +`plot()`, `predict()`, `print()`, `residuals()`, `summary()`, and +`vcov()`. + +Among the available `S3` methods, a unique feature of the +[**qris**](https://CRAN.R-project.org/package=qris) package's `S3 plot` +method, when applied to a '`qris`' object, is its ability to +automatically update the original object by extending the range of +$\tau$ or $t_0$ values. This extension enables the generation of a +covariate effect plot over the newly specified values of $\tau$ or +$t_0$, providing a comprehensive visualization of the covariate effects +across the extended range. The `S3` method for plotting a '`qris`' +object is shown below. + +```{r} +argsAnywhere(plot.qris) +``` + +The argument `x` is a '`qris`' object created using the `qris()` +function. The `t0s` and `Qs` arguments are numeric vectors that enable +users to specify the values of $t_0$ or $\tau$ for plotting the +covariate effect. If `t0s` and `Qs` are not specified, the covariate +effects are plotted against $\tau = 0.1, 0.2, \ldots, 0.9$ at the base +time ($t_0$) inherited from the '`qris`' object specified in `x`. The +`nB` argument is a numerical variable that controls the sample size for +bootstrapping, used to compute standard error estimations based on the +variance estimation specified in the original '`qris`' object. 
When `nB` +is specified, the function calculates standard errors for all +combinations of $t_0$ and $\tau$ specified in `t0s` and `Qs`, computes +95% confidence intervals accordingly, and includes them in the covariate +effect plot. The `vari` argument is a character string that allows users +to specify the names of the covariates they want to display in the +effect plots. When the `vari` argument is not specified, all covariates +will be included in the plots by default. The coefficient event plot can +be plotted against the specified quantiles by setting `byQs = TRUE` or +against the specified base times by setting `byQs = FALSE`. Finally, the +`ggextra` argument allows users to pass additional graphical parameters +to the [**ggplot2**](https://CRAN.R-project.org/package=ggplot2) +package, offering further customization options for the plots. When the +`plot()` function is called, it internally invokes the `qris.extend()` +function to compute the covariate effects at additional values. The +syntax for the `qris.extend()` function is provided below: + +```{r} +args(qris.extend) +``` + +The arguments in `qris.extend()` are inherited from the arguments +specified in the `plot()` function. To reduce runtime when repeatedly +calling the `plot()`, one can calculate the desired covariate effects by +applying `qris.extend()` outside of `plot()` first and then supply the +results to `plot()`. This approach allows for pre-computation of the +covariate effects, making it more efficient when generating multiple +plots. Overall, the unique plotting feature in +[**qris**](https://CRAN.R-project.org/package=qris) provides users with +a seamless and effortless approach to conducting a comprehensive +assessment of the covariate effects across different quantiles or base +times. 
+ +## Illustration {#sec:illustration} + +### Simulated data {#subsec:simulation} + +In this subsection, we present a simple simulation example to validate +the implementations in the proposed +[**qris**](https://CRAN.R-project.org/package=qris) package. The +simulation involves five covariates, denoted as $X_1, \ldots, X_5$. +Among these covariates, $X_1$ and $X_4$ follow a standard uniform +distribution, $X_2$ follows a binomial distribution with a success +probability of 0.5, $X_3$ follows a standard normal distribution, and +$X_5$ follows a standard exponential distribution. We assume that +$X_2, X_3, X_4$, and $X_5$ do not impact the residual life, meaning +their corresponding coefficient values $\beta_2$, $\beta_3$, $\beta_4$, +and $\beta_5$ are zero. The survival time $T$ is generated from a +Weibull distribution with the survival function +$S(t) = \exp\{-(\rho t)^\kappa\}$ for $t > 0$, where $\kappa = 2$, and +$\rho$ is obtained by solving +$$\label{eq:sim:weibull} + \rho^{-1}\{ (\rho t_0)^\kappa - \log (1-\tau) \}^{(1/\kappa)}- t_0 = \exp\{\beta_0 + \beta_1 X_1\}, (\#eq:simweibull)$$ +for a specified $t_0$ and $\tau$. We set the intercept +$\beta_0 = \log(5)$ and $\beta_1 = \log(2)$ at $t_0 = 0$. Given $\rho$, +$\tau$, and $X_1$, the true values of $\beta_0$ and $\beta_1$ can be +obtained sequentially from Equation \@ref(eq:simweibull) for different +$t_0 > 0$. In our case, the corresponding true values of $\beta_0$ are +approximately 1.411 and 1.219 for $t_0=1$ and 2, respectively. +Similarly, the true values of $\beta_1$ are approximately 0.797 and +0.907 for $t_0=1$ and 2, respectively. The closed-form expression for +generating $T$ is then $\{ -\log(1 - u) \}^{1/\kappa} / \rho$, where $u$ +is a uniform random variable over $(0, 1)$. Given these specifications, +we have implemented the `data.gen()` function to generate simulation +data. 
The `data.gen()` function takes four arguments: `n`, `t0`, `cen`, +and `Q`, representing the sample size, $t_0$, censoring proportion, and +$\tau$, respectively. We generate censoring times $C$ from an +independent uniform distribution over $(0, c)$, where $c$ is chosen to +achieve the desired censoring proportions of 10% and 30%. Using the +generated dataset, we fit the model using three different estimation +methods: induced smoothing, non-smooth, and iterative-induced smoothing. +All analyses were conducted on a 4.2 GHz Intel(R) quad Core(TM) i7-7700K +central processing unit (CPU) using R 4.3.0 [@r2021]. The following code +demonstrates the implementation of `data.gen()` to generate a simulation +dataset. + +The `data.gen()` function generates a `data.frame` containing seven +variables. The `Time` variable represents the observed survival time, +while the `status` variable serves as the event indicator, taking the +value 1 for observed events and 0 for censored observations. The +variables `X1`, $\ldots$, `X5` are the covariates. The implementation in +the `data.gen()` function generates the Weibull survival times using the +inverse probability integral transform technique. Alternatively, users +can use the `rweibull()` function with the parameters `shape = 2` and +`scale = 1 / rho` to generate these Weibull survival times directly. + +We assess the performance of the proposed implementation across various +scenarios, including three sample sizes ($n = 200, 400, 1000$), three +levels of $t_0$ ($0, 1, 2$), two censoring proportions (10% and 30%), +and two values of $\tau$ (0.25 and 0.50). For a given dataset, we apply +the full-multiplier bootstrapping approach with 200 bootstrap samples to +all three available estimating procedures: `method = "nonsmooth"`, +`method = "smooth"`, and `method = "iterative"`. 
To facilitate the +evaluation process, we create the `do_fmb()` function to record the +coefficient estimates, standard errors, and computing times for fitting +a single simulated dataset generated from `data.gen()`. The following is +the implementation of the `do_fmb()` function and the corresponding code +to run the simulation with 200 replications. We present the code and +result of the simulation experiments conducted at three different sample +sizes, with $t_0$ values set to 0 and 1, while holding the censoring +proportion at 30% and $\tau$ value at 0.5. The results for other +simulation scenarios are provided in the Supplementary Materials. + +```{r} +#| eval: false +#| echo: true +do_fmb <- function(n, t0, cen, Q, nB) { + dat <- data.gen(n, t0, cen, Q) + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + stamp <- NULL + stamp[1] <- Sys.time() + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "fmb") + stamp[2] <- Sys.time() + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "nonsmooth", se = "fmb") + stamp[3] <- Sys.time() + f3 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "fmb") + stamp[4] <- Sys.time() + list(smooth = c(f1$coef, f1$std), + nonsmooth = c(f2$coef, f2$std), + iter = c(f3$coef, f3$std), + times = diff(stamp)) +} + +B <- 200 +set.seed(2) +sims0_fmb <- mapply(function(n, t0) + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) +sim1_fmb <- mapply(function(n, t0) + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +``` + +Figure \@ref(fig:figsim1t0) displays violin plots that provide visualizations +of the empirical distribution of the coefficient estimates. As expected, +all three estimators exhibit small biases, which are calculated as the +difference between the point estimates (PE) and the true regression +coefficients. 
Furthermore, the empirical distributions of the PEs +demonstrate a normal-like shape, aligning with the asymptotic properties +of the proposed method [@li2016quantile; @kim2023smoothed]. When the +sample size is smaller (e.g., $n = 200$ and 400), the `nonsmooth` +approach appears to yield slightly larger empirical standard errors +(ESE) compared to the `smooth` or `iterative` approaches. However, when +$n = 1000$, the ESEs are similar across all approaches. On the other +hand, the comprehensive simulation results presented in Table 1 of the +Supplementary Materials confirm that all coefficient estimates closely +approximate the true regression coefficients. On the other hand, the +ESEs and the averaged estimated standard errors (ASE) are in close +agreement for all scenarios, indicating the validity of the variance +estimation. Furthermore, the computation times, which are presented +separately in the upper panel of Table [1](#tab:time), indicate that +when employing the full multiplier bootstrapping approach, the +`nonsmooth` approach demonstrates a slight advantage in terms of +computational efficiency over the `smooth` approach, while the +`iterative` approach takes 5.1 to 9.5 times longer than the `smooth` +approach. In summary, the timing results show that the proposed method +can yield valid inference results within seconds, even with large +datasets of up to 1000 observations or when using the computationally +demanding full multiplier bootstrapping approach for variance +estimation. 
+ +::: figure* +```{r figsim1t0, echo=FALSE, fig.cap="$t_0 = 0$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_t0_c3_Q50.png")) +``` + +\ + +```{r figsim1t1, echo=FALSE , fig.cap="$t_0 = 1$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_t1_c3_Q50.png")) +``` +::: + +When $t_0 = 0$, the targeted semiparametric quantile regression model +for residual life simplifies to the standard quantile regression model +for survival time. In such cases, existing functions like `crq()` from +the [**quantreg**](https://CRAN.R-project.org/package=quantreg) package +[@quantregpackage] can be employed. A comparison between the performance +of `crq()` and our proposed implementation when $t_0 = 0$ is presented +in the Supplementary Materials, where the standard errors of the `crq()` +are obtained from the bootstrap method with 200 bootstrap samples. +Overall, the performance of `crq()` is comparable to the proposed +methods in terms of bias and standard errors. However, we have +occasionally encountered situations where the `crq()` function fails to +converge, particularly when the sample size is large, as in the case of +$n = 1000$. In the other extended simulation scenarios outlined in the +Supplementary Materials, which encompass various levels of $t_0$, +censoring proportions, and $\tau$, the proposed methods consistently +exhibit satisfactory performance across all settings. + +The true potential of the proposed smooth approach lies in its +capability for efficient variance estimation through the implementation +of the partial multiplier bootstrapping approach. This approach +eliminates the need for repetitive solving of estimating equations, +resulting in improved computational efficiency in variance estimation. 
+To demonstrate its usefulness, we conducted a simulation using both the +smooth approach and the iterative approach with the partial multiplier +bootstrapping approach (`se = "pmb"`). This simulation was conducted +under the settings of $\tau = 0.5$, $t_0 = 0$ and $1$, and a 30% +censoring rate. The `do_pmb()` function was accordingly modified as +follows. + +```{r} +#| eval: false +#| echo: true +do_pmb <- function(n, t0, cen, Q, nB) { + dat <- data.gen(n, t0, cen, Q) + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + stamp <- NULL + stamp[1] <- Sys.time() + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "pmb") + stamp[2] <- Sys.time() + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "pmb") + stamp[3] <- Sys.time() + list(smooth = c(f1$coef, f1$std), + iter = c(f2$coef, f2$std), + times = diff(stamp)) +} + +set.seed(2) +sims0_pmb <- mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) +sims1_pmb <- mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +``` + + +The simulation results obtained using the partial multiplier +bootstrapping approach are presented in Figure \@ref(fig:figsim2t0) and +Tables 7 -- 12 in the Supplementary Materials, while the computing times +are displayed in the lower panel of Table [1](#tab:time). Overall, the +estimation results obtained using `se = "pmb"` in Figure \@ref(fig:figsim2t0) +closely resemble those in Figure \@ref(fig:figsim1t0) with `se = "fmb"`. As +seen in Tables 7 and 8, the ESEs from the non-iterative and iterative +methods are comparable, while the ASEs slightly overestimate the ESEs +when the sample size is small. The gaps are slightly smaller for the +iterative method, as shown in some cases +[@johnson2009induced; @kim2021comparison]. 
The magnitudes of the +differences are not large, and they also become smaller when the sample +size reaches $n = 1000$. More importantly, the computing times with +`se = "pmb"` show significant speed improvements compared to when +`se = "fmb"` is used in every case; we observed up to 79% timing +improvements. + +::: figure* +```{r figsim2t0, echo=FALSE , fig.cap="$t_0 = 0$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_pmb_t0_c3_Q50.png")) +``` + +\ + +```{r figsim2t1, echo=FALSE , fig.cap="$t_0 = 1$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_pmb_t1_c3_Q50.png")) +``` +::: + +::: {#tab:time} + -------------------------------------------------------------------------------------------------------------------- + $t_0 = 0$ $t_0 = 1$ + ------------------------------------------ ----------- ----------- ------- ------- ----------- ------- ------- -- -- + se method 200 400 1000 200 400 1000 + + `fmb` Smooth 0.103 0.174 0.471 0.106 0.178 0.480 + + Nonsmooth 0.080 0.142 0.472 0.080 0.141 0.468 + + Iterative 0.981 1.500 2.410 0.985 1.567 2.882 + + `pmb` Smooth 0.022 0.052 0.223 0.022 0.053 0.224 + + Iterative 0.296 0.580 1.407 0.296 0.581 1.435 + -------------------------------------------------------------------------------------------------------------------- + + : Table 1: Runtimes (in seconds) when `se = fmb` and `se = pmb`. +::: + +After confirming the satisfactory performance of the proposed +methodologies, we now proceed to illustrate the application of the +`init` argument. This argument controls the initial values assigned to +the root-finding algorithm's estimates and the plotting capacity of the +[**qris**](https://CRAN.R-project.org/package=qris) package. For this +illustrative example, we consider a simpler simulation scenario that +involves a single binary covariate. 
This simplified simulation can be +generated using the revised version of the `data.gen()` function +provided below. + +```{r} +#| echo: true +## Global parameters +rho0 <- .2 * sqrt(log(2)) +rho1 <- .1 * sqrt(log(2)) +data.gen <- function(n) { + dat <- data.frame(censoring = runif(n, 0, 23.41), + Time0 = sqrt(-log(1 - runif(n))), + X = rbinom(n, 1, .5)) + dat$Time0 <- ifelse(dat$X > 0, dat$Time0 / rho1, dat$Time0 / rho0) + dat$Time <- pmin(dat$Time0, dat$censoring) + dat$status <- 1 * (dat$Time0 < dat$censoring) + subset(dat, select = c(Time, status, X)) +} +set.seed(10) +head(dat <- data.gen(200)) +``` + +The updated `data.gen()` function returns a `data.frame` comprising +three variables: `Time`, `status`, and `X`, representing the observed +survival time, event indicator, and binary covariate, respectively. We +will first illustrate the usage of the argument `init` by considering +three different initial values: `init = "rq"`, `init = c(1,1)`, and a +random vector `init = rnorm(2)`, all used in conjunction with the smooth +estimator `method = "smooth"`. The following codes provide an example +with different initial values. + +```{r} +#| echo: true +(random <- rnorm(2)) +f1 <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, init = "rq", nB = 0) +f2 <- update(f1, init = c(1, 1)) +f3 <- update(f1, init = random) +all.equal(f1$coef, f2$coef) +all.equal(f2$coef, f3$coef) +``` + +The '`qris`' object, with its `call` component, is compatible with the +`update()` function, a built-in function commonly used for updating the +attributes of an existing object without requiring redundant and +repetitive code. In the example above, we used the `update()` function +to modify the initial value specification in `f1`. We observed that +different initial values yield identical point estimates, thereby +affirming the robustness of the proposed method against fluctuations in +initial values. 
+ +The covariate effects, along with their associated 95% point-wise +confidence intervals across various quantiles or base times, can be +visually assessed by applying the generic function `plot()` to a +'`qris`' object. We demonstrate this feature using the following `qris` +fit, where the standard errors are obtained using `se = "pmb"`, +$t_0 = 1$, and all other parameters are set to their default values. We +update the `qris` fit with extended quantiles over +${0.4, 0.5, 0.6, 0.7}$ and plot the covariate effects against these +quantiles using the `plot()` function. + +```{r} +#| echo: true +fit <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, se = "pmb") +fit2 <- qris.extend(fit, Qs = 4:7 / 10) +``` + +The extended '`qris`' fit generated by the `qris.extend()` function +inherits all the attributes from the original '`qris`' object and +includes additional `ggdat` components. The following code compares the +components of the returned values from the extended '`qris`' fit and the +original '`qris`' fit. + +```{r} +#| echo: true +class(fit2) +names(fit) +setdiff(names(fit2), names(fit)) +``` + +Specifically, the extended '`qris`' fit inherits `call`, `coefficient`, +`para`, `stderr`, `varNames`, and `vcov` from the original '`qris`' +object. The `call` component is the function call from the original +`qris()` fit, while `coefficient`, `stderr`, and `vcov` are used to +store the point estimates, standard error estimates, and covariance +matrix, respectively. The `para` component is a list containing the +parameters specified during the fitting of the quantile regression +model, and `varNames` is a character string representing the variable +names in the function call. The newly added values are `ggdat` and `gg`. +The `ggdat` is a data frame containing covariate information generated +under the different quantiles and base times specified in the +`qris.extend()`. 
Finally, the corresponding covariate effect plot can be +generated by plotting the extended '`qris`' fit as follows. + +```{r} +#| echo: false +#| eval: true +plot(fit2) +``` + +The true values of $\beta$'s at different quantiles and base times, +computed from Equation \@ref(eq:simweibull), can be implemented in the +following commands. + +```{r} +## Global parameters +r <- 2:1 * sqrt(log(2)) / 10 +k <- 2 +trueB <- function(t0, tau) { + b <- log(1 / r * ((r * t0) ^ k - log(1 - tau))^(1 / k) - t0) + c(b[1], b[2] - b[1]) +} +true_Q <- c(t(sapply(4:7 / 10, trueB, t0 = 1))) +true_t0 <- c(t(sapply(1:3, trueB, tau = .5))) +``` + +The following code extends the '`ggplot`' objects generated by +`plot.qris()` by adding additional layers of true value curves and +incorporating various `ggplot` options. The resulting figures, +Figure \@ref(fig:figsimulation-quantile) and +Figure \@ref(fig:figsimulation-t0), present the output based on whether +the covariate effects are plotted against quantiles or base times, +respectively. This observed trend aligns with the specifications +described in Equation \@ref(eq:simweibull), where increasing $\tau$ +corresponds to an increasing $\beta_0$ while keeping $\rho$ and $X$ +fixed. On the other hand, the covariate effect does not change with +quantiles but slightly increases with base times, echoing the model +specification where $\beta_0$ is inversely related to $t_0$ and +$\beta_1$ increases as $t_0$ increases. 
+ +```{r} +#| echo: true +library(ggplot2) +plot(fit2) + theme(legend.position = "bottom") + + geom_line(aes(x = Qs, y = true_Q, col = variable, linetype = "True value")) + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) +b <- plot(fit2, t0s = 1:3, byQs = F) +b + theme(legend.position = "bottom") + + geom_line(aes(x = t0s, y = true_t0, col = variable, + linetype = "True value")) + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) +``` + +::: figure* +```{r figsimulation-quantile, echo=FALSE , fig.cap="Plot for $Q\\in\\{0.4, \\ldots, 0.7\\}$ at $t_0 = 1$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("simulation_smooth_quantile.png")) +``` + +```{r figsimulation-t0, echo=FALSE , fig.cap="Plot for $t_0\\in\\{1, \\ldots, 3\\}$ at $Q = 0.5$", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("simulation_smooth_t0.png")) +``` +::: + +### North Central Cancer Treatment Group Lung Cancer Data {#subsec:lung} + +The North Central Cancer Treatment Group Lung Cancer Data records the +survival of patients with advanced lung cancer, along with assessments +of the patients' performance status measured by both physicians and the +patients themselves [@loprinzi1994prospective]. The original objective +of the study was to ascertain whether descriptive information from a +patient-completed questionnaire could offer prognostic insights. The +original objective of the study was to determine whether descriptive +information from a patient-completed questionnaire could provide +prognostic information. However, for this illustration, we focus on how +gender and weight loss affect the quantiles of residual life for +patients diagnosed with advanced lung cancer at different time points. 
+The lung cancer data are publicly available from the +[**survival**](https://CRAN.R-project.org/package=survival) package +[@survivalpackage] as `lung`. The following code displays the structure +of the `lung` dataset with variables of interest. + +```{r} +#| echo: true +data(cancer, package = "survival") +str(subset(lung, select = c(time, status, sex, wt.loss))) +``` + +The `lung` data contains 228 patients whose observed survival times in +days and censoring status (1 = censored, 2 = dead) are recorded in the +`time` and the `status` columns, respectively. Although the censoring +status in this dataset is not recorded in the typical 0-1 fashion, the +`Surv()` function is still applicable to create the corresponding +"`Surv`\" object. The `lung` data yields a censoring rate of $27.6\%$ +with a median survival time of 310 days. The covariates of interest are +gender (`sex = 1` if male, `sex = 2` if female) and weight loss +(`wt.loss`). In the following, we use the proposed semiparametric +quantile regression models to assess the gender and standardized weight +loss effects on different quantiles of residual life at different base +times. + +We first model the median residual life (`Q = 0.5`) when the base time +is one month (`t0 = 30`). Since the estimated median survival times for +combined lung cancers are typically less than one year, with a range of +8 to 13 months [@siegel2021cancer], setting the base time at one month +provides insight into how gender and weight loss impact the residual +time in early follow-up. In the following, we obtain the regression +coefficient estimates using the induced smoothing functions and the +corresponding variance estimate with the partial multiplier bootstrap +approach. 
+ +```{r} +#| echo: true +lung$male <- factor(lung$sex, 1:2, c("Male", "Female")) +lung$std.wt.loss <- scale(lung$wt.loss) +fit1 <- qris(Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = .5, nB = 100, + method = "smooth", se = "pmb") +summary(fit1) +``` + + +Subjects with missing values (in any of the variables relevant for the +modeling task) are automatically removed when `qris()` is called. The +estimated intercept implies that the median residual life for patients +who have survived up to 30 days is $\exp(5.5611) = 260.1$ days for a +male with an average weight loss. More interestingly, the summary shows +that the gender effect is statistically significant at the 0.05 +significance level, indicating that a female patient is expected to have +a median residual life at 30 days that is $\exp(0.4804) = 1.617$ times +that of a male patient with the same weight loss. The effect of the +weight loss is not statistically significant at the 0.05 level. In +addition to `summary()`, important statistics such as the coefficient +and variance estimates can be extracted by `S3` methods `coef()` and +`vcov()`, respectively. + +```{r} +#| echo: true +coef(fit) +``` + +Moreover, the corresponding 95% Wald-type confidence interval can be +printed by applying the `confint()` function to the '`qris`' object. + +```{r} +#| echo: true +confint(fit1) +``` + +The `update()` function can be conveniently applied to update existing +'`qris`' objects. The following examples update the `method` and `se` +arguments from `fit1`. The updated results yield similar coefficient +estimates, but the non-smooth procedure (`method = "nonsmooth"`) yields +slightly greater standard error estimates. 
+
+```{r}
+#| echo: true
+summary(fit2 <- update(fit1, method = "nonsmooth", se = "fmb"))
+```
+
+```{r}
+#| echo: true
+summary(update(fit1, method = "iterative"))
+```
+
+At a lower (`Q = 0.25`) and a higher (`Q = 0.75`) quantiles, the gender
+effect remains significant at the 0.05 significance level indicating
+female patients are associated with longer lower-quantile and
+higher-quantile residual life than male patients with the same weight
+loss. Among these models, we observed that female patients tend to have
+higher coefficient estimates when fitting higher-quantile residual life.
+While the sign of the estimated regression coefficient for weight loss
+changes to a negative value when considering the lower quantile, the
+effects remain statistically insignificant for both the lower and higher
+quantiles.
+
+```{r}
+#| echo: true
+summary(update(fit1, Q = 0.25))
+```
+
+```{r}
+#| echo: true
+summary(update(fit1, Q = 0.75))
+```
+
+We also consider the base time at six months `t0 = 180`, which enables
+us to assess gender and weight loss effects in median residual time at a
+moderate length of follow-up. The estimated effect for the gender and
+weight loss increases as $t_0$ increases from $30$ days to $180$ days
+and becomes significant at the 0.05 significance level. Additionally, the
+effect of the weight loss seems to be associated with a shorter survival
+time after $180$ days, with a $p$-value of $0.0008$.
+
+```{r}
+#| echo: true
+summary(update(fit1, t0 = 180))
+```
+
+The '`qris`' object is designed to be compatible with `S3` methods:
+`predict()` and `residuals()` functions. The following presents the
+fitted survival times for two hypothetical male and female patients with
+no weight loss, as well as the first five residual values for the
+dataset. 
+
+```{r}
+#| echo: true
+lung.new <- data.frame(male = c("Male", "Female"), std.wt.loss = 0)
+predict(fit2, newdata = lung.new)
+head(residuals(fit2), 5)
+```
+
+To better understand the covariate effects on different quantiles of
+residual time and across different base times, we plot the estimated
+regression coefficients of the intercept, sex, and weight loss in `fit1`
+and `fit2`. Figures \@ref(fig:figrealdata-smooth)
+and \@ref(fig:figrealdata-nonsmooth) display the estimated regression
+coefficients when `method = "smooth"` and `method = "nonsmooth"`,
+respectively, at different quantiles ranging from 0.2 to 0.5 at
+$t_0 = 30$ days. The `plot.qris()` function is currently not available
+for the iterative estimator. This is mainly due to an extended
+computation time involved, as indicated by our simulation results, and
+the nature of plotting that necessitates computations across various
+quantiles or base times. As expected, the two plots show very similar
+patterns. We plot the estimated regression coefficients of the
+intercept, sex, and weight loss for different quantiles in the range of
+0.2 to 0.5 at $t_0= 50$, 60, 70, and 80 days
+(Figure \@ref(fig:figrealdata-multi-quantile)), as well as for different
+base times in the range of 50 to 80 days at $\tau=0.2$, 0.3, 0.4, and
+0.5 (Figure \@ref(fig:figrealdata-multi-basetime)). The estimation
+method used is non-iterative induced smoothed estimation
+(`method = "smooth"`). In Figure \@ref(fig:figrealdata-multi-quantile),
+the estimated intercept increases as the quantile increases (for a given
+base time). The estimated slopes for sex remain largely the same, but
+those for weight loss tend to decrease slightly across different
+quantiles (for a given base time). These patterns remain consistent for
+different base times. 
In Figure \@ref(fig:figrealdata-multi-basetime),
+the estimated intercepts increase as the quantiles increase, but with a
+given quantile, they remain flat across the different base times
+considered. The estimated regression coefficients for the two covariates
+do not appear to change significantly for different base times.
+
+```{r}
+#| echo: true
+hide <- theme(legend.position = "none")
+plot(fit1, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide)
+plot(fit2, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide)
+plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = TRUE, ggextra = hide)
+plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = FALSE, ggextra = hide)
+```
+
+
+::: figure*
+```{r figrealdata-smooth, echo=FALSE , fig.cap="method = ”smooth” and se = ”pmb”", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("realdata_smooth_quantile.png"))
+```
+
+```{r figrealdata-nonsmooth, echo=FALSE , fig.cap="method = ”nonsmooth” and se = ”fmb”", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("realdata_nonsmooth_quantile.png"))
+```
+
+\
+
+```{r figrealdata-multi-quantile, echo=FALSE , fig.cap="Multiple covariate effect plot against quantile", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("realdata_multi_quantile.png"))
+```
+
+```{r figrealdata-multi-basetime, echo=FALSE , fig.cap="Multiple covariate effect plot against base time", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("realdata_multi_basetime.png"))
+```
+:::
+
+## Conclusion {#sec:conclusion}
+
+The purpose of the [**qris**](https://CRAN.R-project.org/package=qris)
+package is to provide a comprehensive tool for fitting quantile
+regression models on residual life for right-censored survival data,
+with the aim of promoting widespread dissemination and 
utilization. This +package implements one estimation method based on non-smooth estimating +functions and two estimation methods based on their induced smoothed +versions. The non-smooth estimator is calculated through $L_{1}$-type +minimization while incorporating the IPCW technique, and its variance is +calculated using full multiplier bootstrapping. The first type of the +induced smoothed estimator, a non-iterative version, directly solves +estimating functions, and its variance can be calculated using either +the full multiplier bootstrapping or the robust sandwich form with +partial multiplier bootstrapping. As evidenced by the simulation +results, this enables one to substantially reduce computing times +without sacrificing estimation accuracy and stability compared to the +original non-smooth function-based method. The iterative smoothed +estimator has an advantage in obtaining more precise estimates than its +non-iterative version, although it requires longer computing times. For +all these methods, estimates of the regression coefficients and their +variances can be calculated at user-defined quantiles and base times, as +long as they are identifiable. Additionally, the package provides +features for plotting estimates with associated 95% confidence intervals +against quantiles and base times using the generic `plot` function. +These plots visualize patterns of estimates at different quantiles and +base times, helping users to easily grasp the overall picture. The +package [**qris**](https://CRAN.R-project.org/package=qris) and its +included functions are verified through illustrations using simulated +data with interpretation of the results demonstrated through a real data +application. + +Some possible directions for extending our package are as follows. +Efforts can be made to reduce the computational burden associated with +variance estimation, which currently accounts for a significant portion +of the computing time. 
In particular, the iterative-induced smoothed
+method employs the partial multiplier bootstrap method to calculate
+variance estimates in each iteration. Since this method requires
+multiple iterations, it is crucial to explore more computationally
+efficient variance estimation procedures for each iteration to reduce
+the currently relatively longer computation time. One approach is to
+utilize a closed-form estimation of the mid-part of the sandwich-type
+variance, as discussed in @chiou2014fast [@choi2018smoothed].
+Implementing this direct variance estimation in each iteration is
+expected to further enhance computation efficiency. Another direction is
+to generalize the approaches to allow for the inclusion of sampling
+weights, which is useful for bias correction when failure time data are
+generated from non-random sampling designs, such as case-cohort designs
+[@prentice1986case; @chiou2015semiparametric]. The current estimating
+functions implemented in the
+[**qris**](https://CRAN.R-project.org/package=qris) package assume that
+the data are randomly sampled, with sampling weights set to 1. To the
+best of our knowledge, there is a lack of model-checking procedures and
+model-comparison methods specifically designed for the non-smooth
+estimator, and a logical next step would be to develop these procedures
+for subsequent integration into the package. 
+:::::::::::::::::::::::::::::::: diff --git a/_articles/RJ-2024-007/RJ-2024-007.html b/_articles/RJ-2024-007/RJ-2024-007.html new file mode 100644 index 0000000000..7779a64452 --- /dev/null +++ b/_articles/RJ-2024-007/RJ-2024-007.html @@ -0,0 +1,3441 @@ + + + + + + + + + + + + + + + + + + + + + + Fitting a Quantile Regression Model for Residual Life with the R Package qris + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    Fitting a Quantile Regression Model for Residual Life with the R Package qris

    + + + +

    In survival analysis, regression modeling has traditionally focused on +assessing covariate effects on survival times, which is defined as the +elapsed time between a baseline and event time. Nevertheless, focusing +on residual life can provide a more dynamic assessment of covariate +effects, as it offers more updated information at specific time points +between the baseline and event occurrence. Statistical methods for +fitting quantile regression models have recently been proposed, +providing favorable alternatives to modeling the mean of residual +lifetimes. Despite these progresses, the lack of computer software +that implements these methods remains an obstacle for researchers +analyzing data in practice. In this paper, we introduce an R package +qris (Kim et al. 2022), which +implements methods for fitting semiparametric quantile regression +models on residual life subject to right censoring. We demonstrate the +effectiveness and versatility of this package through comprehensive +simulation studies and a real-world data example, showcasing its +valuable contributions to survival analysis research.

    +
    + + + +
    +
    +

    1 Introduction

    +

    In the analysis of time-to-event data, standard statistical inference +procedures often focus on quantities based on failure time and its +relationship with covariates measured at baseline. However, throughout +the follow-up process, inference procedures based on residual life +become increasingly intuitive for assessing the survival of subjects and +can offer insights into the effectiveness of treatments in prolonging +the remaining lifetime. As covariates can substantially change over time +and models based solely on baseline covariates have limited potential +for long-term prognosis, there is a growing interest in modeling the +remaining lifetime of a surviving subject with updated patient +information. Many efforts have been made to model the mean residual life +including proportional mean residual life models +(Oakes and Dasu 1990, 2003; Maguluri and Zhang 1994; Chen et al. 2005), +additive mean residual life models +(Chen and Cheng 2006; Chen 2007; Zhang et al. 2010), and +proportional scaled mean residual life models (Liu and Ghosh 2008). +Given that failure times are usually right-skewed and heavy-tailed, the +mean of the residual life might not be identifiable if the follow-up +time is not sufficiently long. For this reason, quantiles, which are +robust under skewed distribution, have traditionally been used more +frequently as alternative summary measures. For example, the approach on +the semiparametric quantile regression model for continuous responses +(Koenker and Bassett Jr 1978) has been extended to uncensored failure time +data (Jung 1996; Portnoy and Koenker 1997; Wei et al. 2006) and +censored failure times data +(Ying et al. 1995; Portnoy 2003; Peng and Huang 2008; Huang 2010).

    +

    When the outcome variable is the residual life, semiparametric quantile +models that apply the inverse probability of censoring weighting (IPCW) +principle to address right-censored observations have been explored +(Jung et al. 2009; Kim et al. 2012; Li et al. 2016). These +approaches are based on non-smooth estimating functions with respect to +regression parameters, and the estimates of the regression parameters +are obtained either through zero-crossing of non-smooth estimating +functions using grid search techniques (Jung et al. 2009) or by +optimizing non-smooth objective functions with \(L_1\)-minimization +algorithms (Kim et al. 2012; Li et al. 2016). While these methods are +relatively straightforward to implement, an additional challenge lies in +standard error estimation, which necessitates the computationally +intensive use of a multiplier bootstrap method (Li et al. 2016). +Alternatively, Jung et al. (2009) and Kim et al. (2012) utilized the +minimum dispersion statistic and the empirical likelihood method, +respectively, to bypass the need to directly estimate the variance of +the regression parameter estimator for hypothesis testing and +constructing confidence intervals. The non-smooth nature of the +estimating functions in these approaches precludes the estimation of +variance using the robust sandwich-type variance estimator typically +employed in equation-based estimation methods. To lessen the associated +computational burden, an induced smoothing was proposed +(Brown and Wang 2005), which modifies the non-smooth estimating equations +into smooth ones. Leveraging the asymptotic normality of the non-smooth +estimator, the smooth estimating functions are constructed by averaging +out the random perturbations inherent in the non-smooth estimating +functions. 
The resulting estimating functions become smooth with respect +to the regression parameters, allowing for the straightforward +application of standard numerical algorithms, such as the Newton-Raphson +method. Furthermore, these smoothed estimating functions facilitate the +straightforward computation of variances using the robust sandwich-type +estimator. The induced smoothing approach has been employed in fitting +semiparametric accelerated failure time (AFT) models via the rank-based +approach +(Johnson and Strawderman 2009; Chiou et al. 2015a, 2021; Kang 2017). +Regarding quantile regression, Choi et al. (2018) considered the induced +smoothing approach under a competing-risks setting. All of these methods +are based on modeling event times. Recently, Kim et al. (2023) proposed +an induced smoothing estimator for fitting a semiparametric quantile +regression model for residual life.

    +

    The availability of published R packages for fitting quantile regression +models is somewhat limited. The rq(), nlrq(), rqss(), and crq() +functions in the package +quantreg +(Koenker 2022) are predominantly used and provide various features +for fitting linear, nonlinear, non-parametric, and censored quantile +regression models, respectively. The rq() function minimizes +non-smooth objective functions to obtain point estimates of regression +coefficients and can accommodate right-censored survival times by +incorporating weights. By redefining survival times as the remaining +lifetime at time \(t_0\), one can also obtain a non-smoothed estimator for +quantile regression models for residual life (Kim et al. 2012). On the +other hand, the nlrq() function is designed to fit a nonlinear +quantile regression model, while the rqss() function fits additive +quantile regression models with nonparametric terms, including +univariate components and bivariate components, using smoothing splines +and total variation regularization techniques +(Koenker et al. 1994; Koenker and Mizera 2004). Furthermore, the crq() +function fits a quantile regression model for censored data on the +\(\tau\)-th conditional quantile function of the response variable. +Overall, the quantreg +implements three methods for handling right-censored survival times: +Powell (1986)’s estimator, Portnoy (2003)’s estimator and +Peng and Huang (2008)’s estimator. However, none of the implemented methods +in the nlrq(), rqss(), or crq() functions are applicable for +handling censored residual life using the induced smoothing methods. The +only function that implements the induced smoothing method is the +aftsrr() function in the package +aftgee +(Chiou et al. 2021), but it is specifically designed for fitting +semiparametric AFT models, which are not directly applicable to fitting +quantile regression models.

    +

    Other R packages that can be used to fit quantile regression models for +survival data include the package +ctqr (Frumento 2021), +package Brq (Alhamzawi 2020), +package brms +(Bürkner 2018), and package +cmprskQR +(Dlugosz et al. 2019). The ctqr() function in the package +ctqr implements the +methods proposed in Frumento (2021) for right or interval-censored failure +times with left-truncation. The Bqr() function in the package +Brq implements Bayesian +methods based on the asymmetric Laplace distribution. In the package +brms, the brm() +function with the family=asym_laplace() option enables the +implementation of full Bayesian inference. The crrQR() function in the +package cmprskQR +allows fitting quantile regression models with competing risks. All of +these R packages are designed for fitting quantile regression models for +failure times defined from a baseline and are not applicable to the +residual life setting.

    +

    The recently developed R package +qris (Kim et al. 2022) provides +an efficient tool for fitting semiparametric quantile regression models +for residual life subject to right censoring. The +qris package offers three +methods for estimating the regression parameters: \(L_1\)-minimization of +non-smooth objective functions, induced smoothing with a non-iterative +approach, and an iterative procedure. For standard error estimation, the +qris package provides two +resampling-based approaches: the partial multiplier bootstrap and the +full multiplier bootstrap methods. The partial multiplier bootstrap +method utilizes the robust sandwich-type estimator by incorporating the +sample variance of perturbed estimating functions, while the full +multiplier bootstrap method is obtained by considering the sample +variance from the solutions of perturbed estimating functions. To +enhance the interpretability of results, the +qris package incorporates +graphical visualizations of covariate effects at different quantiles and +base times, utilizing the plotting environment similar to that in the +ggplot2 package +(Wickham et al. 2022), thereby allowing for extensive flexibility and +customization. The ultimate goal of creating the +qris package is to +facilitate the easy incorporation of quantile regression for residual +life into daily routines. The package +qris is available on the +Comprehensive R Archive Network (CRAN) at +https://CRAN.R-project.org/package=qris.

    +

    The rest of the article is organized as follows: Section 2 +introduces a semiparametric regression model for quantiles of residual +life and the estimation methods implemented in the package. +Section 3 provides details about computing +algorithms. Illustrations of the package using a simulated dataset and +the real data from the North Central Cancer Treatment Group are +presented in Section 4. Finally, in +Section 5, concluding remarks are provided along with +some discussions.

    +

    2 Semiparametric quantile regression for residual life

    +

    Define \(T\) as the potential failure time that is subject to right +censoring by \(C\) and \(\mathbf{X}\) as a \(p \times 1\) vector of +covariates, where \(p\) is the number of covariates, including an +intercept. The observed data consists of \(n\) independent copies of +\((Z, \delta, \mathbf{X})\), where \(Z = \min(T, C)\), +\(\delta = I(T \leq C)\), and \(I(\cdot)\) is an indicator function. We also +assume \(T\) and \(C\) are marginally independent. Define the \(\tau\)-th +quantile of the residual life at \(t_0 > 0\) as \(\theta_{\tau}(t_0)\) that +satisfies +\(P(T_i - t_0 \geq \theta_{\tau}(t_0) \ | \ T_i > t_0) = 1 - \tau\). We +consider the semiparametric quantile regression model for the residual +life (Kim et al. 2012; Kim et al. 2023). Given \(T_i > t_0\), +\[\label{qr:mod1} + \log(T_i - t_0) = \mathbf{X}_{i}^{\top}\boldsymbol{\mathbf{\beta}}_0(\tau, t_0) + \epsilon_i, i = 1, \ldots, n, %\label{qr:mod2} \tag{1}\] +where \(\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)\) is a \(p \times 1\) +vector of regression coefficients, and \(\epsilon_i\) is a random error +having zero \(\tau\)-th quantile. The quantile regression model for a +continuous response (Koenker and Bassett Jr 1978) is a special case of +Equation (1) when \(t_0 = 0\). For ease of notation, we omit +\(\tau\) and \(t_0\) in \(\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)\) and +\(\theta_{\tau}(t_0)\) and write \(\boldsymbol{\mathbf{\beta}}_0\) and +\(\theta\). We present different estimation procedures to estimate +\(\boldsymbol{\mathbf{\beta}}_0\) given \(\tau\) and \(t_0\) in the following.

    +

    Estimation using non-smooth functions

    +

    When there is no censoring, an estimator for \(\beta_0\) in +Equation (1) can be obtained by solving the estimating +equation (Kim et al. 2012), where +\[\label{eq:ns:obj1} + \frac{1}{n}\sum_{i=0}^{n}I[T_i \ge t_0] \mathbf{X}_i \left\{I\left[\log(T_i - t_0) \leq \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right] - \tau \right\} = 0. \tag{2}\] +However, Equation (2) cannot be directly used when +\(T_i - t_0\) is subject to right censoring. The IPCW technique can be +incorporated into Equation (2) to account for the right +censoring (Li et al. 2016). Specifically, in the presence of right +censoring, the estimator for \(\boldsymbol{\mathbf{\beta}}_0\) in +Equation (1) can be obtained as the root of the following +weighted estimating equations: +\[\label{eq:nsm:ipw} + U_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = \frac{1}{n}\sum_{i=1}^{n}I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0)} -\tau \right\}, \tag{3}\] +where \(\widehat{G}(\cdot)\) is the Kaplan-Meier estimate of the survival +function \(G(\cdot)\) of the censoring time \(C\) and +\(\widehat{G}(t) = \prod_{i: t_i \leq t} (1 - \sum_{j=1}^n (1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n I(Z_j \geq t_i))\). +A computational challenge arises because the exact solution to +Equation (3) might not exist due to the non-smoothness in +\(\beta\) caused by the involvement of indicator functions. When the exact +solutions do not exist, the root of Equation (3) can be +approximated by minimizing the \(L_1\)-objective function +\(L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau)\) (Li et al. 
2016), +\[\begin{aligned} + \label{l1:nsm} + \nonumber + L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/\widehat{G}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\beta \right| + \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}(Z_l)/\widehat{G}(t_0)}\right| + + \ \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l I[Z_l > t_0]\right|, +\end{aligned} \tag{4}\] +where \(M > 0\) bounds +\(\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n - \mathbf{X}_i \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/ \widehat{G}(t_0)}\right|\) +and +\(\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n 2\tau \mathbf{X}_i I[Z_i > t_0]\right|\) +from above. Numerically, the limit \(M\) is set to be an extremely large +number, and the qris() function uses \(M = 10^6\). Denote the resulting +estimator to be \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\). It +has been shown that \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\) +is consistent for \(\boldsymbol{\mathbf{\beta}}_0\) and asymptotically +normally distributed (Li et al. 2016).

    +

    Despite the well-established asymptotic properties, directly estimating +the variance of \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\) is +impractical because it involves the derivative of non-smooth functions. +A multiplier bootstrap method has typically been employed +(Li et al. 2016) to address this difficulty. The multiplier bootstrap +method considers the perturbed version of \(U_{t_0}(\beta, \tau)\), +defined as +\[\label{eq:nsm:rev} + U_{t_0}^{\ast}(\beta, \tau) = \frac{1}{n}\sum_{i=1}^{n} \eta_i I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} -\tau \right\}, \tag{5}\] +where \(\eta_i, i = 1, \ldots, n,\) are independently and identically +(iid) generated from a positive random variable with unity mean and +variance, and \(\widehat{G}^\ast(\cdot)\) is a perturbed version of +\(\widehat{G}(\cdot)\), constructed as \(\widehat{G}^\ast(t) = +\prod_{i: t_i \leq t} (1 - \sum_{j=1}^n \eta_j(1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n \eta_jI(Z_j \geq t_i))\) +for a given realization of \(\eta_i\). On the other hand, a perturbed +\(L_1\)-objective function, denoted as +\(L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)\), can be similarly +constructed, where +\[\begin{aligned} + L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right| + \nonumber \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}^{\ast}(Z_l)/\widehat{G}^{\ast}(t_0)}\right| + + \ \left| M - \beta^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l \eta_l I[Z_l > t_0]\right|. 
+\end{aligned}\] +Solving for \(U_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = 0\), or +equivalently, minimizing +\(L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)\), yields one +realization of \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\). The +multiplier bootstrap variance is computed as the sample variance of a +large number of realizations of +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\).

    +

    Estimation using induced smoothed functions

    +

    The regression coefficient in Equation (1) can be more +efficiently obtained through the induced smoothed version of +Equation (3). The induced smoothed estimating functions are +constructed by taking the expectation with respect to a mean-zero random +noise added to the regression parameters in Equation (3). +Specifically, +\[\begin{aligned} +\label{eq:is} + \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, H) & = E_w \{U_{t_0}(\boldsymbol{\mathbf{\beta}}+\mathbf{H}^{1/2}\mathbf{W}, \tau)\}\nonumber\\ + & = \frac{1}{n} \sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}}-\log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0) } -\tau \right\}, +\end{aligned} \tag{6}\] +where \(\mathbf{H} = O(n^{-1})\), \(\mathbf{W} \sim N(0, \mathbf{I}_p)\) is +a standard normal random vector, \(\mathbf{I}_p\) is the \(p \times p\) +identity matrix, and \(\Phi(\cdot)\) is the cumulative distribution +function of a standard normal random variable. A typical choice for +\(\mathbf{H}\) is to fix it at \(n^{-1}\mathbf{I}_p\), while some +alternative choices are explored in Chiou et al. (2015b). Let +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) be the solution to +\(\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = 0\). +Since Equation (6) is a smooth function in +\(\boldsymbol{\mathbf{\beta}}\), the estimator can be obtained using +standard numerical algorithms such as the Newton-Raphson method. +Moreover, the induced smoothed estimator for +\(\boldsymbol{\mathbf{\beta}}_0\) has been shown to be asymptotically +equivalent to its non-smooth counterpart (Kim et al. 2023).

    +

    Following the idea in Section 2.1, the multiplier +bootstrap procedure can be similarly employed to estimate the variance +of \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\). The perturbed +version of Equation (6) takes the form of +\[\label{eq:7} + \widetilde{U}^{\ast}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = \frac{1}{n} \sum_{i=1}^{n} \eta_i I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\widehat{G}^{\ast}(t_0) \delta_i}{\widehat{G}^{\ast}(Z_i)} -\tau \right\}. \tag{7}\] +The multiplier bootstrap procedure estimates the variance of +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) by calculating the +sample variance of a large number of realizations of +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) obtained by +repeatedly solving Equation (7).

    +

    It has been shown that the asymptotic variance +\(\mathop{\rm Var}\nolimits(\boldsymbol{\mathbf{\beta}}, \tau)\) can be +decomposed into +\(\mathbf{A}(\boldsymbol{\mathbf{\beta}})^{\top} \mathbf{V}(\boldsymbol{\mathbf{\beta}}) \mathbf{A}(\boldsymbol{\mathbf{\beta}})\) +(Kim et al. 2023), where the two components, +\(\mathbf{A}(\boldsymbol{\mathbf{\beta}})\) and +\(\mathbf{V}(\boldsymbol{\mathbf{\beta}})\), can be estimated separately. +Since Equation (6) is a smooth function in +\(\boldsymbol{\mathbf{\beta}}\), the slope matrix, +\(\mathbf{A}(\boldsymbol{\mathbf{\beta}})\), can be conveniently estimated +by differentiating +\(\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})\) +with respect to \(\boldsymbol{\mathbf{\beta}}\). The explicit form of +\(\mathbf{A}(\boldsymbol{\mathbf{\beta}})\) is as follows: +\[\begin{aligned} +\label{eq:cov:slp} + \mathbf{A}(\boldsymbol{\mathbf{\beta}}) & = \frac{\partial \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})}{\partial \boldsymbol{\mathbf{\beta}}} \nonumber \\ + & = \frac{1}{n}\sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \frac{G(t_0) \delta_i}{G(Z_i)} \phi\left(\frac{{\mathbf{X}_i}^{\top}\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{{\mathbf{X}_i}^{\top}\mathbf{H} \mathbf{X}_i}}\right)\left(\frac{-{\mathbf{X}_i}}{\sqrt{{\mathbf{X}_i}^{\top} \mathbf{H} {\mathbf{X}_i}}}\right), +\end{aligned} \tag{8}\] +where \(\phi (\cdot)\) is the density function of the standard normal +random variable.

    +

    The slope matrix, +\(\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})\), +can be evaluated directly by plugging in +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) and +\(\widehat{G}(\cdot)\). On the other hand, the variance of the estimating +function, \(\widehat{\mathbf{V}}(\boldsymbol{\mathbf{\beta}})\), can be +obtained by a computationally efficient resampling method motivated by +the multiplier bootstrap procedure in Section 2.1. +Specifically, we propose estimating +\(\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})\) +as the simple variance of a large set of realizations of the perturbed +version of +\(\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}, \tau, \mathbf{H})\) +presented in Equation (7). We refer to this procedure as the +partial multiplier bootstrapping approach because it utilizes the +perturbed estimating function, similar to the full multiplier +bootstrapping approach, but the computation of +\(\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})\) +and +\(\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})\) +does not involve the repeated solving of the perturbed estimating +equations. Thus, the partial multiplier bootstrapping approach is +expected to be computationally more efficient than the multiplier +bootstrap method. A similar procedure and its performance have been +studied in modeling failure times with semiparametric AFT models +(Chiou et al. 2014, 2021).

    +

    Iterative procedure in induced smoothing estimation

    +

    The induced estimator \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) +is obtained with a fixed \(\mathbf{H}\), as described in +Section 2.2, and its variance is estimated separately. +This estimation procedure can be viewed as a special case of the +following iterative procedure, which updates \(\mathbf{H}\) and +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) iteratively. +Specifically, the iterative algorithm utilizes the Newton-Raphson method +while sequentially updating +\(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}\) and +\(\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})\) +until convergence. Similar iterative algorithms have also been +considered previously in the induced smoothing approach for +semiparametric AFT models +(Johnson and Strawderman 2009; Chiou et al. 2014, 2015a; Choi et al. 2018). +The iterative procedure is summarized as follows:

    +
    +
    Step 1:
    +
    +

    Set the initial values +\(\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}\), +\(\widehat{\mathbf{\Sigma}}^{(0)} = \mathbf{I}_{p}\), and +\(\mathbf{H}^{(0)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(0)}\).

    +
    +
    Step 2:
    +
    +

    Given \(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}\) and +\(\mathbf{H}^{(k)}\) at the \(k\)-th step, update +\(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}\) by +\[\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}=\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)} - \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)})^{-1}\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}, \tau, \mathbf{H}^{(k)}).\]

    +
    +
    Step 3:
    +
    +

    Given \(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}\) and +\(\widehat{\mathbf{\Sigma}}^{(k)}\), update +\(\widehat{\mathbf{\Sigma}}^{(k)}\) by +\[\widehat{\mathbf{\Sigma}}^{(k+1)} = \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1} \widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau) \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1}.\]

    +
    +
    Step 4:
    +
    +

    Set \(\mathbf{H}^{(k+1)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(k+1)}\). +Repeat Steps 2, 3 and 4 until +\(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}\) and +\(\widehat{\mathbf{\Sigma}}^{(k)}\) converge.

    +
    +
    +

    The initial value, \(\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}\), could +be chosen as \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}\). We +define \(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}\) and +\(\widehat{\boldsymbol{\mathbf{\Sigma}}}_{\tiny IT}\) as the values of +\(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}\) and +\(\widehat{\mathbf{\Sigma}}^{(k)}\) at convergence, and +\(\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}) = n^{-1}\widehat{\mathbf{\Sigma}}_{\tiny IT}\). +In Step 3, +\(\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau)\) +is obtained using the partial multiplier bootstrap approach. However, +the full multiplier bootstrap approach can also be employed but would +require longer computation times.

    +

    3 Package implementation

    +

    The main function in the +qris package for +estimating the regression parameters in the quantile regression model +for residual life is the qris() function. The qris() function is +written in C++ and incorporated into R using the +Rcpp (Eddelbuettel et al. 2022a) and +RcppArmadillo +(Eddelbuettel et al. 2022b) packages. The synopsis of qris is:

    +
    +
    +
    args(qris)
    +
    +
    function (formula, data, t0 = 0, Q = 0.5, nB = 100, method = c("smooth", 
    +    "iterative", "nonsmooth"), se = c("fmb", "pmb"), init = c("rq", 
    +    "noeffect"), verbose = FALSE, control = qris.control()) 
    +NULL
    +
    +

    The required argument is formula, which specifies the quantile +regression model to be fitted using the variables in data. The +formula assumes that the response variable is a ‘Surv’ object +created by the Surv() function in the +survival package +(Therneau 2021). This formula structure is commonly adopted for +handling survival data in R, as seen in functions like survreg() and +coxph() in the +survival package. The +argument t0 specifies the base time used in defining residual life. +The default value of t0 is set to zero, in which case residual life +reduces to a failure time. The Q argument is used to specify the +target quantile of residual life to estimate, with the default value +being set to 0.5 (median). The nB argument specifies the bootstrapping +size used in standard error estimation, with the default value set to +100. The method argument specifies one of the three estimation +methods: "nonsmooth", "smooth", and "iterative", corresponding to +the estimating procedures outlined in Sections 2.1, +2.2, and 2.3, respectively. Given the point +estimates of the regression parameters, their standard errors can be +estimated using one of two implemented methods: se = "fmb" and +se = "pmb". The se = "fmb" method employs a full-multiplier +bootstrapping approach to estimate the variance by the sample variance +of large realizations of \(\widehat\beta\). The se = "pmb" method +estimates the variance using a robust sandwich variance estimator and +employs the computationally efficient partial multiplier bootstrapping +approach described in Section 2.2. The "fmb" option is +available for all three point estimation methods, whereas the "pmb" +option is not available for the "nonsmooth" point estimation method +due to the lack of a closed-form sandwich variance estimator. The init +argument allows users to specify the initial value for estimating +regression parameters by either a \(p\)-dimensional numerical vector or a +character string. 
In the latter case, the options init = "rq" and +init = "noeffect" correspond to the point estimate obtained from the +rq() function in the +quantreg package and +a \(p\)-dimensional vector of zeros, respectively. The default value for +init is init = "rq". Among the three methods implemented for point +estimation, method = "smooth" and method = "nonsmooth" are +non-iterative, in the sense that point estimation is performed +separately from the estimation of standard errors. On the other hand, +method = "iterative" calculates point estimates and the corresponding +standard error estimates simultaneously through iterative updates. When +method = "iterative", users can define specific convergence criteria +using qris.control(). The available options in qris.control() are as +follows.

    +
    +
    +
    args(qris.control)
    +
    +
    function (maxiter = 10, tol = 0.001, trace = FALSE) 
    +NULL
    +
    +

    The maxiter argument specifies the maximum number of iterations. The +default value for maxiter is ten, as the proposed algorithm typically +converges within ten steps based on our exploration. The convergence +tolerance is controlled using the tol argument, which has a default +value of 1e-3. The trace argument takes a logical value and is used +to determine whether to print the result for each iteration. The default +setting is trace = FALSE. The ‘qris’ object is fully compatible with +many of R’s generic functions, including coef(), confint(), +plot(), predict(), print(), residuals(), summary(), and +vcov().

    +

    Among the available S3 methods, a unique feature of the +qris package’s S3 plot +method, when applied to a ‘qris’ object, is its ability to +automatically update the original object by extending the range of +\(\tau\) or \(t_0\) values. This extension enables the generation of a +covariate effect plot over the newly specified values of \(\tau\) or +\(t_0\), providing a comprehensive visualization of the covariate effects +across the extended range. The S3 method for plotting a ‘qris’ +object is shown below.

    +
    +
    +
    argsAnywhere(plot.qris)
    +
    +
    function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL, byQs = FALSE, 
    +    ggextra = NULL, ...) 
    +NULL
    +
    +

    The argument x is a ‘qris’ object created using the qris() +function. The t0s and Qs arguments are numeric vectors that enable +users to specify the values of \(t_0\) or \(\tau\) for plotting the +covariate effect. If t0s and Qs are not specified, the covariate +effects are plotted against \(\tau = 0.1, 0.2, \ldots, 0.9\) at the base +time (\(t_0\)) inherited from the ‘qris’ object specified in x. The +nB argument is a numerical variable that controls the sample size for +bootstrapping, used to compute standard error estimations based on the +variance estimation specified in the original ‘qris’ object. When nB +is specified, the function calculates standard errors for all +combinations of \(t_0\) and \(\tau\) specified in t0s and Qs, computes +95% confidence intervals accordingly, and includes them in the covariate +effect plot. The vari argument is a character string that allows users +to specify the names of the covariates they want to display in the +effect plots. When the vari argument is not specified, all covariates +will be included in the plots by default. The covariate effect plot can +be plotted against the specified quantiles by setting byQs = TRUE or +against the specified base times by setting byQs = FALSE. Finally, the +ggextra argument allows users to pass additional graphical parameters +to the ggplot2 +package, offering further customization options for the plots. When the +plot() function is called, it internally invokes the qris.extend() +function to compute the covariate effects at additional values. The +syntax for the qris.extend() function is provided below:

    +
    +
    +
    args(qris.extend)
    +
    +
    function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL) 
    +NULL
    +
    +

    The arguments in qris.extend() are inherited from the arguments +specified in the plot() function. To reduce runtime when repeatedly +calling the plot(), one can calculate the desired covariate effects by +applying qris.extend() outside of plot() first and then supply the +results to plot(). This approach allows for pre-computation of the +covariate effects, making it more efficient when generating multiple +plots. Overall, the unique plotting feature in +qris provides users with +a seamless and effortless approach to conducting a comprehensive +assessment of the covariate effects across different quantiles or base +times.

    +

    4 Illustration

    +

    Simulated data

    +

    In this subsection, we present a simple simulation example to validate +the implementations in the proposed +qris package. The +simulation involves five covariates, denoted as \(X_1, \ldots, X_5\). +Among these covariates, \(X_1\) and \(X_4\) follow a standard uniform +distribution, \(X_2\) follows a binomial distribution with a success +probability of 0.5, \(X_3\) follows a standard normal distribution, and +\(X_5\) follows a standard exponential distribution. We assume that +\(X_2, X_3, X_4\), and \(X_5\) do not impact the residual life, meaning +their corresponding coefficient values \(\beta_2\), \(\beta_3\), \(\beta_4\), +and \(\beta_5\) are zero. The survival time \(T\) is generated from a +Weibull distribution with the survival function +\(S(t) = \exp\{-(\rho t)^\kappa\}\) for \(t > 0\), where \(\kappa = 2\), and +\(\rho\) is obtained by solving +\[\label{eq:sim:weibull} + \rho^{-1}\{ (\rho t_0)^\kappa - \log (1-\tau) \}^{(1/\kappa)}- t_0 = \exp\{\beta_0 + \beta_1 X_1\}, \tag{9}\] +for a specified \(t_0\) and \(\tau\). We set the intercept +\(\beta_0 = \log(5)\) and \(\beta_1 = \log(2)\) at \(t_0 = 0\). Given \(\rho\), +\(\tau\), and \(X_1\), the true values of \(\beta_0\) and \(\beta_1\) can be +obtained sequentially from Equation (9) for different +\(t_0 > 0\). In our case, the corresponding true values of \(\beta_0\) are +approximately 1.411 and 1.219 for \(t_0=1\) and 2, respectively. +Similarly, the true values of \(\beta_1\) are approximately 0.797 and +0.907 for \(t_0=1\) and 2, respectively. The closed-form expression for +generating \(T\) is then \(\{ -\log(1 - u) \}^{1/\kappa} / \rho\), where \(u\) +is a uniform random variable over \((0, 1)\). Given these specifications, +we have implemented the data.gen() function to generate simulation +data. The data.gen() function takes four arguments: n, t0, cen, +and Q, representing the sample size, \(t_0\), censoring proportion, and +\(\tau\), respectively. 
    We generate censoring times \(C\) from an +independent uniform distribution over \((0, c)\), where \(c\) is chosen to +achieve the desired censoring proportions of 10% and 30%. Using the +generated dataset, we fit the model using three different estimation +methods: induced smoothing, non-smooth, and iterative-induced smoothing. +All analyses were conducted on a 4.2 GHz quad-core Intel(R) Core(TM) i7-7700K +central processing unit (CPU) using R 4.3.0 (R Core Team 2021). The following code +demonstrates the implementation of data.gen() to generate a simulation +dataset.

    +

    The data.gen() function generates a data.frame containing seven +variables. The Time variable represents the observed survival time, +while the status variable serves as the event indicator, taking the +value 1 for observed events and 0 for censored observations. The +variables X1, \(\ldots\), X5 are the covariates. The implementation in +the data.gen() function generates the Weibull survival times using the +inverse probability integral transform technique. Alternatively, users +can use the rweibull() function with the parameters shape = 2 and +scale = 1 / rho to generate these Weibull survival times directly.

    +

    We assess the performance of the proposed implementation across various +scenarios, including three sample sizes (\(n = 200, 400, 1000\)), three +levels of \(t_0\) (\(0, 1, 2\)), two censoring proportions (10% and 30%), +and two values of \(\tau\) (0.25 and 0.50). For a given dataset, we apply +the full-multiplier bootstrapping approach with 200 bootstrap samples to +all three available estimating procedures: method = "nonsmooth", +method = "smooth", and method = "iterative". To facilitate the +evaluation process, we create the do_fmb() function to record the +coefficient estimates, standard errors, and computing times for fitting +a single simulated dataset generated from data.gen(). The following is +the implementation of the do_fmb() function and the corresponding code +to run the simulation with 200 replications. We present the code and +result of the simulation experiments conducted at three different sample +sizes, with \(t_0\) values set to 0 and 1, while holding the censoring +proportion at 30% and \(\tau\) value at 0.5. The results for other +simulation scenarios are provided in the Supplementary Materials.

    +
    +
    +
    do_fmb <- function(n, t0, cen, Q, nB) {
    +  dat <- data.gen(n, t0, cen, Q)
    +  fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5
    +  stamp <- NULL
    +  stamp[1] <- Sys.time()
    +  f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "fmb")
    +  stamp[2] <- Sys.time()
    +  f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "nonsmooth", se = "fmb")
    +  stamp[3] <- Sys.time()
    +  f3 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "fmb")
    +  stamp[4] <- Sys.time()
    +  list(smooth = c(f1$coef, f1$std),
    +       nonsmooth = c(f2$coef, f2$std),
    +       iter = c(f3$coef, f3$std),
    +       times = diff(stamp))
    +}
    +
    +B <- 200
    +set.seed(2)
    +sims0_fmb <- mapply(function(n, t0)
    +    replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)),
    +    n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F)
    +sim1_fmb <- mapply(function(n, t0)
    +    replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)),
    +    n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F)
    +
    +
    +

    Figure 1 displays violin plots that provide visualizations +of the empirical distribution of the coefficient estimates. As expected, +all three estimators exhibit small biases, which are calculated as the +difference between the point estimates (PE) and the true regression +coefficients. Furthermore, the empirical distributions of the PEs +demonstrate a normal-like shape, aligning with the asymptotic properties +of the proposed method (Li et al. 2016; Kim et al. 2023). When the +sample size is smaller (e.g., \(n = 200\) and 400), the nonsmooth +approach appears to yield slightly larger empirical standard errors +(ESE) compared to the smooth or iterative approaches. However, when +\(n = 1000\), the ESEs are similar across all approaches. On the other +hand, the comprehensive simulation results presented in Table 1 of the +Supplementary Materials confirm that all coefficient estimates closely +approximate the true regression coefficients. Moreover, the +ESEs and the averaged estimated standard errors (ASE) are in close +agreement for all scenarios, indicating the validity of the variance +estimation. Furthermore, the computation times, which are presented +separately in the upper panel of Table 1, indicate that +when employing the full multiplier bootstrapping approach, the +nonsmooth approach demonstrates a slight advantage in terms of +computational efficiency over the smooth approach, while the +iterative approach takes 5.1 to 9.5 times longer than the smooth +approach. In summary, the timing results show that the proposed method +can yield valid inference results within seconds, even with large +datasets of up to 1000 observations or when using the computationally +demanding full multiplier bootstrapping approach for variance +estimation.

    +
    +
    +
    +graphic without alt text +

    +Figure 1: \(t_0 = 0\) +

    +
    +
    +


    +

    +
    +
    +graphic without alt text +

    +Figure 2: \(t_0 = 1\) +

    +
    +
    +
    +

    When \(t_0 = 0\), the targeted semiparametric quantile regression model +for residual life simplifies to the standard quantile regression model +for survival time. In such cases, existing functions like crq() from +the quantreg package +(Koenker 2022) can be employed. A comparison between the performance +of crq() and our proposed implementation when \(t_0 = 0\) is presented +in the Supplementary Materials, where the standard errors of the crq() +are obtained from the bootstrap method with 200 bootstrap samples. +Overall, the performance of crq() is comparable to the proposed +methods in terms of bias and standard errors. However, we have +occasionally encountered situations where the crq() function fails to +converge, particularly when the sample size is large, as in the case of +\(n = 1000\). In the other extended simulation scenarios outlined in the +Supplementary Materials, which encompass various levels of \(t_0\), +censoring proportions, and \(\tau\), the proposed methods consistently +exhibit satisfactory performance across all settings.

    +

    The true potential of the proposed smooth approach lies in its +capability for efficient variance estimation through the implementation +of the partial multiplier bootstrapping approach. This approach +eliminates the need for repetitive solving of estimating equations, +resulting in improved computational efficiency in variance estimation. +To demonstrate its usefulness, we conducted a simulation using both the +smooth approach and the iterative approach with the partial multiplier +bootstrapping approach (se = "pmb"). This simulation was conducted +under the settings of \(\tau = 0.5\), \(t_0 = 0\) and \(1\), and a 30% +censoring rate. The do_pmb() function was accordingly modified as +follows.

    +
    +
    +
    do_pmb <- function(n, t0, cen, Q, nB) {
    +  dat <- data.gen(n, t0, cen, Q)
    +  fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5
    +  stamp <- NULL
    +  stamp[1] <- Sys.time()
    +  f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "pmb")
    +  stamp[2] <- Sys.time()
    +  f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "pmb")
    +  stamp[3] <- Sys.time()
    +  list(smooth = c(f1$coef, f1$std),
    +       iter = c(f2$coef, f2$std),
    +       times = diff(stamp))
    +}
    +
    +set.seed(2)
    +sims0_pmb <- mapply(function(n, t0)
    +    replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)),
    +    n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F)
    +sims1_pmb <- mapply(function(n, t0)
    +    replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)),
    +    n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F)
    +
    +
    +

    The simulation results obtained using the partial multiplier +bootstrapping approach are presented in Figure 3 and +Tables 7 – 12 in the Supplementary Materials, while the computing times +are displayed in the lower panel of Table 1. Overall, the +estimation results obtained using se = "pmb" in Figure 3 +closely resemble those in Figure 1 with se = "fmb". As +seen in Tables 7 and 8, the ESEs from the non-iterative and iterative +methods are comparable, while the ASEs slightly overestimate the ESEs +when the sample size is small. The gaps are slightly smaller for the +iterative method, as shown in some cases +(Johnson and Strawderman 2009; Kim et al. 2021). The magnitudes of the +differences are not large, and they also become smaller when the sample +size reaches \(n = 1000\). More importantly, the computing times with +se = "pmb" show significant speed improvements compared to when +se = "fmb" is used in every case; we observed up to 79% timing +improvements.

    +
    +
    +
    +graphic without alt text +

    +Figure 3: \(t_0 = 0\) +

    +
    +
    +


    +

    +
    +
    +graphic without alt text +

    +Figure 4: \(t_0 = 1\) +

    +
    +
    +
    +
    + + ++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Table 1: Runtimes (in seconds) when se = fmb and se = pmb.
    \(t_0 = 0\)\(t_0 = 1\)
    semethod20040010002004001000
    fmbSmooth0.1030.1740.4710.1060.1780.480
    Nonsmooth0.0800.1420.4720.0800.1410.468
    Iterative0.9811.5002.4100.9851.5672.882
    pmbSmooth0.0220.0520.2230.0220.0530.224
    Iterative0.2960.5801.4070.2960.5811.435
    +
    +

    After confirming the satisfactory performance of the proposed +methodologies, we now proceed to illustrate the application of the +init argument. This argument controls the initial values assigned to +the root-finding algorithm’s estimates and the plotting capacity of the +qris package. For this +illustrative example, we consider a simpler simulation scenario that +involves a single binary covariate. This simplified simulation can be +generated using the revised version of the data.gen() function +provided below.

    +
    +
    +
    ## Global parameters
    +rho0 <- .2 * sqrt(log(2))
    +rho1 <- .1 * sqrt(log(2))
    +data.gen <- function(n) {
    +    dat <- data.frame(censoring = runif(n, 0, 23.41),
    +                      Time0 = sqrt(-log(1 - runif(n))),
    +                      X = rbinom(n, 1, .5))
    +    dat$Time0 <- ifelse(dat$X > 0, dat$Time0 / rho1, dat$Time0 / rho0)
    +    dat$Time <- pmin(dat$Time0, dat$censoring)
    +    dat$status <- 1 * (dat$Time0 < dat$censoring)
    +    subset(dat, select = c(Time, status, X))
    +}
    +set.seed(10)
    +head(dat <- data.gen(200))
    +
    +
           Time status X
    +1  6.034713      1 1
    +2  7.181451      0 1
    +3  9.993908      0 1
    +4 16.225520      0 1
    +5  1.993033      0 1
    +6  5.277471      0 0
    +
    +

    The updated data.gen() function returns a data.frame comprising +three variables: Time, status, and X, representing the observed +survival time, event indicator, and binary covariate, respectively. We +will first illustrate the usage of the argument init by considering +three different initial values: init = "rq", init = c(1,1), and a +random vector init = rnorm(2), all used in conjunction with the smooth +estimator method = "smooth". The following codes provide an example +with different initial values.

    +
    +
    +
    (random <- rnorm(2))
    +
    +
    [1] 1.5025446 0.5904095
    +
    +
    f1 <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, init = "rq", nB = 0)
    +f2 <- update(f1, init = c(1, 1))
    +f3 <- update(f1, init = random)
    +all.equal(f1$coef, f2$coef)
    +
    +
    [1] TRUE
    +
    +
    all.equal(f2$coef, f3$coef)
    +
    +
    [1] TRUE
    +
    +

    The ‘qris’ object, with its call component, is compatible with the +update() function, a built-in function commonly used for updating the +attributes of an existing object without requiring redundant and +repetitive code. In the example above, we used the update() function +to modify the initial value specification in f1. We observed that +different initial values yield identical point estimates, thereby +affirming the robustness of the proposed method against fluctuations in +initial values.

    +

    The covariate effects, along with their associated 95% point-wise +confidence intervals across various quantiles or base times, can be +visually assessed by applying the generic function plot() to a +‘qris’ object. We demonstrate this feature using the following qris +fit, where the standard errors are obtained using se = "pmb", +\(t_0 = 1\), and all other parameters are set to their default values. We +update the qris fit with extended quantiles over +\({0.4, 0.5, 0.6, 0.7}\) and plot the covariate effects against these +quantiles using the plot() function.

    +
    +
    +
    fit <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, se = "pmb")
    +fit2 <- qris.extend(fit, Qs = 4:7 / 10)
    +
    +
    +

    The extended ‘qris’ fit generated by the qris.extend() function +inherits all the attributes from the original ‘qris’ object and +includes additional ggdat components. The following code compares the +components of the returned values from the extended ‘qris’ fit and the +original ‘qris’ fit.

    +
    +
    +
    class(fit2)
    +
    +
    [1] "qris"
    +
    +
    names(fit)
    +
    +
    [1] "call"        "coefficient" "data"        "formula"    
    +[5] "para"        "stderr"      "varNames"    "vcov"       
    +
    +
    setdiff(names(fit2), names(fit))
    +
    +
    [1] "ggdat"
    +
    +

    Specifically, the extended ‘qris’ fit inherits call, coefficient, +para, stderr, varNames, and vcov from the original ‘qris’ +object. The call component is the function call from the original +qris() fit, while coefficient, stderr, and vcov are used to +store the point estimates, standard error estimates, and covariance +matrix, respectively. The para component is a list containing the +parameters specified during the fitting of the quantile regression +model, and varNames is a character string representing the variable +names in the function call. The newly added value is ggdat. +The ggdat is a data frame containing covariate information generated +under the different quantiles and base times specified in the +qris.extend(). Finally, the corresponding covariate effect plot can be +generated by plotting the extended ‘qris’ fit as follows.

    +
    +

    +
    +

    The true values of \(\beta\)’s at different quantiles and base times, +computed from Equation (9), can be implemented in the +following commands.

    +
    +
    +
    ## Global parameters 
    +r <- 2:1 * sqrt(log(2)) / 10
    +k <- 2
    +trueB <- function(t0, tau) {
    +    b <- log(1 / r * ((r * t0) ^ k - log(1 - tau))^(1 / k) - t0)
    +    c(b[1], b[2] - b[1])
    +}
    +true_Q <- c(t(sapply(4:7 / 10, trueB, t0 = 1)))
    +true_t0 <- c(t(sapply(1:3, trueB, tau = .5)))
    +
    +
    +

    The following code extends the ‘ggplot’ objects generated by +plot.qris() by adding additional layers of true value curves and +incorporating various ggplot options. The resulting figures, +Figure 5 and +Figure 6, present the output based on whether +the covariate effects are plotted against quantiles or base times, +respectively. This observed trend aligns with the specifications +described in Equation (9), where increasing \(\tau\) +corresponds to an increasing \(\beta_0\) while keeping \(\rho\) and \(X\) +fixed. On the other hand, the covariate effect does not change with +quantiles but slightly increases with base times, echoing the model +specification where \(\beta_0\) is inversely related to \(t_0\) and +\(\beta_1\) increases as \(t_0\) increases.

    +
    +
    +
    library(ggplot2)
    +plot(fit2) + theme(legend.position = "bottom") + 
    +    geom_line(aes(x = Qs, y = true_Q, col = variable, linetype = "True value")) +
    +    scale_linetype_manual(name = "", values = c("True value" = "dotdash"))
    +
    + +
    +
    b <- plot(fit2, t0s = 1:3, byQs = F)
    +b + theme(legend.position = "bottom") +
    +    geom_line(aes(x = t0s, y = true_t0, col = variable,
    +                  linetype = "True value")) +
    +    scale_linetype_manual(name = "", values = c("True value" = "dotdash"))
    +
    +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 5: Plot for \(Q\in\{0.4, \ldots, 0.7\}\) at \(t_0 = 1\) +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 6: Plot for \(t_0\in\{1, \ldots, 3\}\) at \(Q = 0.5\) +

    +
    +
    +
    +

    North Central Cancer Treatment Group Lung Cancer Data

    +

    The North Central Cancer Treatment Group Lung Cancer Data records the +survival of patients with advanced lung cancer, along with assessments +of the patients’ performance status measured by both physicians and the +patients themselves (Loprinzi et al. 1994). The original objective +of the study was to ascertain whether descriptive information from a +patient-completed questionnaire could offer prognostic insights. +However, for this illustration, we focus on how +gender and weight loss affect the quantiles of residual life for +patients diagnosed with advanced lung cancer at different time points. +The lung cancer data are publicly available from the +survival package +(Therneau 2021) as lung. The following code displays the structure +of the lung dataset with variables of interest.

    +
    +
    +
    data(cancer, package = "survival")
    +str(subset(lung, select = c(time, status, sex, wt.loss)))
    +
    +
    'data.frame':   228 obs. of  4 variables:
    + $ time   : num  306 455 1010 210 883 ...
    + $ status : num  2 2 1 2 2 1 2 2 2 2 ...
    + $ sex    : num  1 1 1 1 1 1 2 2 1 1 ...
    + $ wt.loss: num  NA 15 15 11 0 0 10 1 16 34 ...
    +
    +

    The lung data contains 228 patients whose observed survival times in +days and censoring status (1 = censored, 2 = dead) are recorded in the +time and the status columns, respectively. Although the censoring +status in this dataset is not recorded in the typical 0-1 fashion, the +Surv() function is still applicable to create the corresponding +‘Surv’ object. The lung data yields a censoring rate of \(27.6\%\) +with a median survival time of 310 days. The covariates of interest are +gender (sex = 1 if male, sex = 2 if female) and weight loss +(wt.loss). In the following, we use the proposed semiparametric +quantile regression models to assess the gender and standardized weight +loss effects on different quantiles of residual life at different base +times.

    +

    We first model the median residual life (Q = 0.5) when the base time +is one month (t0 = 30). Since the estimated median survival times for +combined lung cancers are typically less than one year, with a range of +8 to 13 months (Siegel et al. 2021), setting the base time at one month +provides insight into how gender and weight loss impact the residual +time in early follow-up. In the following, we obtain the regression +coefficient estimates using the induced smoothing functions and the +corresponding variance estimate with the partial multiplier bootstrap +approach.

    +
    +
    +
    lung$male <- factor(lung$sex, 1:2, c("Male", "Female"))
    +lung$std.wt.loss <- scale(lung$wt.loss)
    +fit1 <- qris(Surv(time, status) ~ male + std.wt.loss,
    +             data = lung, t0 = 30, Q = .5, nB = 100,
    +             method = "smooth", se = "pmb")
    +summary(fit1)
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 30, Q = 0.5, nB = 100, method = "smooth", se = "pmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   5.5611    0.0950  58.539  <2e-16 ***
    +maleFemale    0.4804    0.1616   2.972  0.0030 ** 
    +std.wt.loss  -0.0731    0.0807  -0.905  0.3652    
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +

    Subjects with missing values (in any of the variables relevant for the +modeling task) are automatically removed when qris() is called. The +estimated intercept implies that the median residual life for patients +who have survived up to 30 days is \(\exp(5.5611) = 260.1\) days for a +male with an average weight loss. More interestingly, the summary shows +that the gender effect is statistically significant at the 0.05 +significance level, indicating that a female patient is expected to have +a median residual life at 30 days that is \(\exp(0.4804) = 1.617\) times +that of a male patient with the same weight loss. The effect of the +weight loss is not statistically significant at the 0.05 level. In +addition to summary(), important statistics such as the coefficient +and variance estimates can be extracted by S3 methods coef() and +vcov(), respectively.

    +
    +
    +
    coef(fit)
    +
    +
    (Intercept)           X 
    +  1.4302204   0.9322628 
    +
    +

    Moreover, the corresponding 95% Wald-type confidence interval can be +printed by applying the confint() function to the ‘qris’ object.

    +
    +
    +
    confint(fit1)
    +
    +
                     2.5 %     97.5 %
    +(Intercept)  5.3749261 5.74731362
    +maleFemale   0.1636200 0.79726457
    +std.wt.loss -0.2312535 0.08510084
    +
    +

    The update() function can be conveniently applied to update existing +‘qris’ objects. The following examples update the method and se +arguments from fit1. The updated results yield similar coefficient +estimates, but the non-smooth procedure (method = "nonsmooth") yields +slightly greater standard error estimates.

    +
    +
    +
    summary(fit2 <- update(fit1, method = "nonsmooth", se = "fmb"))
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 30, Q = 0.5, nB = 100, method = "nonsmooth", se = "fmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   5.5585    0.1025  54.249  <2e-16 ***
    +maleFemale    0.4695    0.1872   2.508  0.0122 *  
    +std.wt.loss  -0.0668    0.0944  -0.708  0.4789    
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +
    +
    +
    summary(update(fit1, method = "iterative"))
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 30, Q = 0.5, nB = 100, method = "iterative", se = "pmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   5.5599    0.0904  61.518  <2e-16 ***
    +maleFemale    0.4818    0.1786   2.698  0.0070 ** 
    +std.wt.loss  -0.0716    0.0942  -0.761  0.4467    
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +

    At a lower (Q = 0.25) and a higher (Q = 0.75) quantile, the gender effect remains significant at the 0.05 significance level, indicating that female patients are associated with longer lower-quantile and higher-quantile residual life than male patients with the same weight loss. Among these models, we observed that female patients tend to have higher coefficient estimates when fitting higher-quantile residual life. While the sign of the estimated regression coefficient for weight loss changes to a negative value when considering the lower quantile, the effects remain statistically insignificant for both the lower and higher quantiles.

    +
    +
    +
    summary(update(fit1, Q = 0.25))
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 30, Q = 0.25, nB = 100, method = "smooth", se = "pmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   4.9111    0.1000  49.115  <2e-16 ***
    +maleFemale    0.4651    0.1851   2.513  0.0120 *  
    +std.wt.loss   0.0543    0.0548   0.991  0.3218    
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +
    +
    +
    summary(update(fit1, Q = 0.75))
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 30, Q = 0.75, nB = 100, method = "smooth", se = "pmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   6.0748    0.1085  55.971  <2e-16 ***
    +maleFemale    0.5237    0.1621   3.231  0.0012 ** 
    +std.wt.loss  -0.0171    0.1083  -0.158  0.8746    
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +

    We also consider the base time at six months t0 = 180, which enables us to assess gender and weight loss effects in median residual time at a moderate length of follow-up. The estimated effects for gender and weight loss increase as \(t_0\) increases from \(30\) days to \(180\) days and become significant at the 0.05 significance level. Additionally, the effect of the weight loss seems to be associated with a shorter survival time after \(180\) days, with a \(p\)-value of \(0.0048\).

    +
    +
    +
    summary(update(fit1, t0 = 180))
    +
    +
    Call:
    +qris(formula = Surv(time, status) ~ male + std.wt.loss, data = lung, 
    +    t0 = 180, Q = 0.5, nB = 100, method = "smooth", se = "pmb")
    +
    +qris Estimator
    +            estimate std.Error z.value p.value    
    +(Intercept)   5.2243    0.0923  56.629  <2e-16 ***
    +maleFemale    0.5821    0.1975   2.948  0.0032 ** 
    +std.wt.loss  -0.2515    0.0891  -2.821  0.0048 ** 
    +---
    +Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +
    +

    The ‘qris’ object is designed to be compatible with the S3 methods predict() and residuals(). The following presents the fitted survival times for two hypothetical male and female patients with no weight loss, as well as the first five residual values for the dataset.

    +
    +
    +
    lung.new <- data.frame(male = c("Male", "Female"), std.wt.loss = 0)
    +predict(fit2, newdata = lung.new)
    +
    +
           1        2 
    +444.9026 289.4422 
    +
    +
    head(residuals(fit2), 5)
    +
    +
             1          2          3          4          5 
    + -20.86127 -575.86127  232.44474 -416.82295 -555.82295 
    +
    +

    To better understand the covariate effects on different quantiles of residual time and across different base times, we plot the estimated regression coefficients of the intercept, sex, and weight loss in fit1 and fit2. Figures 7 and 8 display the estimated regression coefficients when method = "smooth" and method = "nonsmooth", respectively, at different quantiles ranging from 0.2 to 0.5 at \(t_0 = 30\) days. The plot.qris() function is currently not available for the iterative estimator. This is mainly due to an extended computation time involved, as indicated by our simulation results, and the nature of plotting that necessitates computations across various quantiles or base times. As expected, the two plots show very similar patterns. We plot the estimated regression coefficients of the intercept, sex, and weight loss for different quantiles in the range of 0.2 to 0.5 at \(t_0= 50\), 60, 70, and 80 days (Figure 9), as well as for different base times in the range of 50 to 80 days at \(\tau=0.2\), 0.3, 0.4, and 0.5 (Figure 10). The estimation method used is non-iterative induced smoothed estimation (method = "smooth"). In Figure 9, the estimated intercept increases as the quantile increases (for a given base time). The estimated slopes for sex remain largely the same, but those for weight loss tend to decrease slightly across different quantiles (for a given base time). These patterns remain consistent for different base times. In Figure 10, the estimated intercepts increase as the quantiles increase, but with a given quantile, they remain flat across the different base times considered. The estimated regression coefficients for the two covariates do not appear to change significantly for different base times.

    +
    +
    +
    hide <- theme(legend.position = "none")
    +plot(fit1, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide)
    +
    + +
    +
    plot(fit2, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide)
    +
    + +
    +
    plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = TRUE, ggextra = hide)
    +
    + +
    +
    plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = FALSE, ggextra = hide)
    +
    +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 7: method = ”smooth” and se = ”pmb” +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 8: method = ”nonsmooth” and se = ”fmb” +

    +
    +
    +


    +

    +
    +
    +graphic without alt text +

    +Figure 9: Multiple covariate effect plot against quantiles +

    +
    +
    +
    +
    +graphic without alt text +

    +Figure 10: Multiple covariate effect plot against base time +

    +
    +
    +
    +

    5 Conclusion

    +

    The purpose of the qris +package is to provide a comprehensive tool for fitting quantile +regression models on residual life for right-censored survival data, +with the aim of promoting widespread dissemination and utilization. This +package implements one estimation method based on non-smooth estimating +functions and two estimation methods based on their induced smoothed +versions. The non-smooth estimator is calculated through \(L_{1}\)-type +minimization while incorporating the IPCW technique, and its variance is +calculated using full multiplier bootstrapping. The first type of the +induced smoothed estimator, a non-iterative version, directly solves +estimating functions, and its variance can be calculated using either +the full multiplier bootstrapping or the robust sandwich form with +partial multiplier bootstrapping. As evidenced by the simulation +results, this enables one to substantially reduce computing times +without sacrificing estimation accuracy and stability compared to the +original non-smooth function-based method. The iterative smoothed +estimator has an advantage in obtaining more precise estimates than its +non-iterative version, although it requires longer computing times. For +all these methods, estimates of the regression coefficients and their +variances can be calculated at user-defined quantiles and base times, as +long as they are identifiable. Additionally, the package provides +features for plotting estimates with associated 95% confidence intervals +against quantiles and base times using the generic plot function. +These plots visualize patterns of estimates at different quantiles and +base times, helping users to easily grasp the overall picture. The +package qris and its +included functions are verified through illustrations using simulated +data with interpretation of the results demonstrated through a real data +application.

    +

    Some possible directions for extending our package are as follows. Efforts can be made to reduce the computational burden associated with variance estimation, which currently accounts for a significant portion of the computing time. In particular, the iterative-induced smoothed method employs the partial multiplier bootstrap method to calculate variance estimates in each iteration. Since this method requires multiple iterations, it is crucial to explore more computationally efficient variance estimation procedures for each iteration to reduce the currently relatively longer computation time. One approach is to utilize a closed-form estimation of the mid-part of the sandwich-type variance, as discussed in Chiou et al. (2014) and Choi et al. (2018). Implementing this direct variance estimation in each iteration is expected to further enhance computation efficiency. Another direction is to generalize the approaches to allow for the inclusion of sampling weights, which is useful for bias correction when failure time data are generated from non-random sampling designs, such as case-cohort designs (Prentice 1986; Chiou et al. 2015a). The current estimating functions implemented in the qris package assume that the data are randomly sampled, with sampling weights set to 1. To the best of our knowledge, there is a lack of model-checking procedures and model-comparison methods specifically designed for the non-smooth estimator, and a logical next step would be to develop these procedures for subsequent integration into the package.

    +
    +
    +

    6 CRAN packages used

    +

    qris, quantreg, aftgee, ctqr, Brq, brms, cmprskQR, ggplot2, Rcpp, RcppArmadillo, survival

    +

    7 CRAN Task Views implied by cited packages

    +

    Bayesian, ChemPhys, ClinicalTrials, Econometrics, Environmetrics, HighPerformanceComputing, MetaAnalysis, MixedModels, NumericalMathematics, Optimization, Phylogenetics, ReproducibleResearch, Robust, Spatial, Survival, TeachingStatistics

    +

    8 Note

    +

    This article is converted from a Legacy LaTeX article using the +texor package. +The pdf version is the official version. To report a problem with the html, +refer to CONTRIBUTE on the R Journal homepage.

    +
    +
    +R. Alhamzawi. Brq: Bayesian analysis of quantile regression models. 2020. URL https://CRAN.R-project.org/package=Brq. R package version 3.0. +
    +
    +B. Brown and Y.-G. Wang. Standard errors and covariance matrices for smoothed rank estimators. Biometrika, 92(1): 149–158, 2005. URL https://doi.org/10.1093/biomet/92.1.149. +
    +
    +P.-C. Bürkner. Advanced Bayesian multilevel modeling with the R package brms. The R Journal, 10(1): 395–411, 2018. DOI 10.32614/RJ-2018-017. +
    +
    +Y. Q. Chen. Additive expectancy regression. Journal of the American Statistical Association, 102(477): 153–166, 2007. URL https://doi.org/10.1198/016214506000000870. +
    +
    +Y. Q. Chen and S. Cheng. Linear life expectancy regression with censored data. Biometrika, 93(2): 303–313, 2006. URL https://doi.org/10.1093/biomet/93.2.303. +
    +
    +Y. Chen, N. Jewell, X. Lei and S. Cheng. Semiparametric estimation of proportional mean residual life model in presence of censoring. Biometrics, 61(1): 170–178, 2005. URL https://doi.org/10.1111/j.0006-341X.2005.030224.x. +
    +
    +S. H. Chiou, S. Kang and J. Yan. Aftgee: Accelerated failure time model with generalized estimating equations. 2021. URL https://CRAN.R-project.org/package=aftgee. R package version 1.1.6. +
    +
    +S. H. Chiou, S. Kang and J. Yan. Fast accelerated failure time modeling for case-cohort data. Statistics and Computing, 24(4): 559–568, 2014. URL https://doi.org/10.1007/s11222-013-9388-2. +
    +
    +S. H. Chiou, S. Kang and J. Yan. Semiparametric accelerated failure time modeling for clustered failure times from stratified sampling. Journal of the American Statistical Association, 110(510): 621–629, 2015a. URL https://doi.org/10.1080/01621459.2014.917978. +
    +
    +S. Chiou, S. Kang and J. Yan. Rank-based estimating equations with general weight for accelerated failure time models: An induced smoothing approach. Statistics in Medicine, 34(9): 1495–1510, 2015b. URL https://doi.org/10.1002/sim.6415. +
    +
    +S. Choi, S. Kang and X. Huang. Smoothed quantile regression analysis of competing risks. Biometrical Journal, 60(5): 934–946, 2018. URL https://doi.org/10.1002/bimj.201700104. +
    +
    +S. Dlugosz, L. Peng, R. Li and S. Shi. cmprskQR: Analysis of competing risks using quantile regressions. 2019. URL https://CRAN.R-project.org/package=cmprskQR. R package version 0.9.2. +
    +
    +D. Eddelbuettel, R. Francois, J. Allaire, K. Ushey, Q. Kou, N. Russell, I. Ucar, D. Bates and J. Chambers. Rcpp: Seamless r and c++ integration. 2022a. URL https://CRAN.R-project.org/package=Rcpp. R package version 1.0.9. +
    +
    +D. Eddelbuettel, R. Francois, D. Bates, B. Ni and C. Sanderson. RcppArmadillo: “Rcpp” integration for the “armadillo” templated linear algebra library. 2022b. URL https://CRAN.R-project.org/package=RcppArmadillo. R package version 0.11.1.1.0. +
    +
    +P. Frumento. Ctqr: Censored and truncated quantile regression. 2021. URL https://CRAN.R-project.org/package=ctqr. R package version 2.0. +
    +
    +Y. Huang. Quantile calculus and censored regression. Annals of Statistics, 38(3): 1607, 2010. DOI 10.1214/09-AOS771. +
    +
    +L. M. Johnson and R. L. Strawderman. Induced smoothing for the semiparametric accelerated failure time model: Asymptotics and extensions to clustered data. Biometrika, 96(3): 577–590, 2009. URL https://doi.org/10.1093/biomet/asp025. +
    +
    +S.-H. Jung. Quasi-likelihood for median regression models. Journal of the American Statistical Association, 91(433): 251–257, 1996. URL https://doi.org/10.1080/01621459.1996.10476683. +
    +
    +S.-H. Jung, J.-H. Jeong and H. Bandos. Regression on quantile residual life. Biometrics, 65(4): 1203–1212, 2009. URL https://doi.org/10.1111/j.1541-0420.2009.01196.x. +
    +
    +S. Kang. Fitting semiparametric accelerated failure time models for nested case–control data. Journal of Statistical Computation and Simulation, 87(4): 652–663, 2017. URL https://doi.org/10.1080/00949655.2016.1222611. +
    +
    +K. H. Kim, D. J. Caplan and S. Kang. Smoothed quantile regression for censored residual life. Computational Statistics, 38: 1001–1022, 2023. URL https://doi.org/10.1007/s00180-022-01262-z. +
    +
    +K. H. Kim, S. Kang and S. H. Chiou. Qris: Quantile regression model for residual lifetime using an induced smoothing approach. 2022. URL https://CRAN.R-project.org/package=qris. R package version 1.0.0. +
    +
    +K. Kim, J. Ko and S. Kang. Comparison of variance estimation methods in semiparametric accelerated failure time models for multivariate failure time data. Japanese Journal of Statistics and Data Science, 4(2): 1179–1202, 2021. URL https://doi.org/10.1007/s42081-021-00126-y. +
    +
    +M.-O. Kim, M. Zhou and J.-H. Jeong. Censored quantile regression for residual lifetimes. Lifetime Data Analysis, 18(2): 177–194, 2012. URL https://doi.org/10.1007/s10985-011-9212-2. +
    +
    +R. Koenker. Quantreg: Quantile regression. 2022. URL https://CRAN.R-project.org/package=quantreg. R package version 5.87. +
    +
    +R. Koenker and G. Bassett Jr. Regression quantiles. Econometrica: Journal of the Econometric Society, 33–50, 1978. URL https://doi.org/10.2307/1913643. +
    +
    +R. Koenker and I. Mizera. Penalized triograms: Total variation regularization for bivariate smoothing. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 66(1): 145–163, 2004. URL https://doi.org/10.1111/j.1467-9868.2004.00437.x. +
    +
    +R. Koenker, P. Ng and S. Portnoy. Quantile smoothing splines. Biometrika, 81(4): 673–680, 1994. URL https://doi.org/10.1093/biomet/81.4.673. +
    +
    +R. Li, X. Huang and J. E. Cortes. Quantile residual life regression with longitudinal biomarker measurements for dynamic prediction. Journal of the Royal Statistical Society. Series C: Applied Statistics, 65(5): 755–773, 2016. URL http://www.jstor.org/stable/44681854. +
    +
    +S. Liu and S. K. Ghosh. Regression analysis of mean residual life function. North Carolina State University. Dept. of Statistics. 2008. URL https://repository.lib.ncsu.edu/bitstream/handle/1840.4/3041/mimeo2613.pdf?sequence=1. +
    +
    +C. L. Loprinzi, J. A. Laurie, H. S. Wieand, J. E. Krook, P. J. Novotny, J. W. Kugler, J. Bartel, M. Law, M. Bateman and N. E. Klatt. Prospective evaluation of prognostic variables from patient-completed questionnaires. North Central Cancer Treatment Group. Journal of Clinical Oncology, 12(3): 601–607, 1994. URL https://doi.org/10.1200/JCO.1994.12.3.601. +
    +
    +G. Maguluri and C.-H. Zhang. Estimation in the mean residual life regression model. Journal of the Royal Statistical Society: Series B (Methodological), 56(3): 477–489, 1994. URL https://doi.org/10.1111/j.2517-6161.1994.tb01994.x. +
    +
    +D. Oakes and T. Dasu. A note on residual life. Biometrika, 77(2): 409–410, 1990. URL https://doi.org/10.1093/biomet/77.2.409. +
    +
    +D. Oakes and T. Dasu. Inference for the proportional mean residual life model. Lecture Notes-Monograph Series, 105–116, 2003. URL http://www.jstor.org/stable/4356266. +
    +
    +L. Peng and Y. Huang. Survival analysis with quantile regression models. Journal of the American Statistical Association, 103(482): 637–649, 2008. URL https://doi.org/10.1198/016214508000000355. +
    +
    +S. Portnoy. Censored regression quantiles. Journal of the American Statistical Association, 98(464): 1001–1012, 2003. URL https://doi.org/10.1198/016214503000000954. +
    +
    +S. Portnoy and R. Koenker. The gaussian hare and the laplacian tortoise: Computability of squared-error versus absolute-error estimators. Statistical Science, 12(4): 279–300, 1997. URL https://doi.org/10.1214/ss/1030037960. +
    +
    +J. L. Powell. Censored regression quantiles. Journal of Econometrics, 32(1): 143–155, 1986. URL https://doi.org/10.1016/0304-4076(86)90016-3. +
    +
    +R. L. Prentice. A case-cohort design for epidemiologic cohort studies and disease prevention trials. Biometrika, 73(1): 1–11, 1986. URL https://doi.org/10.1093/biomet/73.1.1. +
    +
    +R Core Team. R: A language and environment for statistical computing. Vienna, Austria: R Foundation for Statistical Computing, 2021. URL https://www.R-project.org/. +
    +
    +R. L. Siegel, K. D. Miller, H. E. Fuchs and A. Jemal. Cancer statistics, 2021. CA: A Cancer Journal for Clinicians, 71(1): 7–33, 2021. URL https://doi.org/10.3322/caac.21654. +
    +
    +T. M. Therneau. Survival: Survival analysis. 2021. URL https://CRAN.R-project.org/package=survival. R package version 3.2-13. +
    +
    +Y. Wei, A. Pere, R. Koenker and X. He. Quantile regression methods for reference growth charts. Statistics in Medicine, 25(8): 1369–1382, 2006. URL https://doi.org/10.1002/sim.2271. +
    +
    +H. Wickham, W. Chang, L. Henry, T. L. Pedersen, K. Takahashi, C. Wilke, K. Woo, H. Yutani and D. Dunnington. ggplot2: Create elegant data visualisations using the grammar of graphics. 2022. URL https://CRAN.R-project.org/package=ggplot2. R package version 3.3.6. +
    +
    +Z. Ying, S.-H. Jung and L.-J. Wei. Survival analysis with median regression models. Journal of the American Statistical Association, 90(429): 178–184, 1995. URL https://doi.org/10.1080/01621459.1995.10476500. +
    +
    +Z. Zhang, X. Zhao and L. Sun. Goodness-of-fit tests for additive mean residual life model under right censoring. Lifetime Data Analysis, 16(3): 385–408, 2010. URL https://doi.org/10.1007/s10985-010-9152-2. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Kim, et al., "Fitting a Quantile Regression Model for Residual Life with the R Package qris", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-007,
    +  author = {Kim, Kyu Hyun and Kang, Sangwook and Chiou, Sy Han},
    +  title = {Fitting a Quantile Regression Model for Residual Life with the R Package qris},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-007},
    +  doi = {10.32614/RJ-2024-007},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {114-134}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-007/RJ-2024-007.pdf b/_articles/RJ-2024-007/RJ-2024-007.pdf new file mode 100644 index 0000000000..a5bdd1048a Binary files /dev/null and b/_articles/RJ-2024-007/RJ-2024-007.pdf differ diff --git a/_articles/RJ-2024-007/RJournal.sty b/_articles/RJ-2024-007/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-007/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-007/RJwrapper.md b/_articles/RJ-2024-007/RJwrapper.md new file mode 100644 index 0000000000..0604f20e34 --- /dev/null +++ 
b/_articles/RJ-2024-007/RJwrapper.md @@ -0,0 +1,1268 @@ +--- +abstract: | + In survival analysis, regression modeling has traditionally focused on + assessing covariate effects on survival times, which is defined as the + elapsed time between a baseline and event time. Nevertheless, focusing + on residual life can provide a more dynamic assessment of covariate + effects, as it offers more updated information at specific time points + between the baseline and event occurrence. Statistical methods for + fitting quantile regression models have recently been proposed, + providing favorable alternatives to modeling the mean of residual + lifetimes. Despite this progress, the lack of computer software + that implements these methods remains an obstacle for researchers + analyzing data in practice. In this paper, we introduce an R package + [**qris**](https://CRAN.R-project.org/package=qris) [@R:qris], which + implements methods for fitting semiparametric quantile regression + models on residual life subject to right censoring. We demonstrate the + effectiveness and versatility of this package through comprehensive + simulation studies and a real-world data example, showcasing its + valuable contributions to survival analysis research. +address: +- | + Kyu Hyun Kim\ + Department of Statistics and Data Science *and* Department of Applied + Statistics\ + Yonsei University\ + 50 Yonsei-ro, Seodaemun-gu, Seoul\ + Republic of Korea\ + [kyuhyunkim07@yonsei.ac.kr](kyuhyunkim07@yonsei.ac.kr){.uri} +- | + Sangwook Kang\ + Department of Statistics and Data Science *and* Department of Applied + Statistics\ + Yonsei University\ + 50 Yonsei-ro, Seodaemun-gu, Seoul\ + Republic of Korea\ + [kanggi1@yonsei.ac.kr](kanggi1@yonsei.ac.kr){.uri} +- | + Sy Han Chiou\ + Department of Statistics and Data Science\ + Southern Methodist University\ + P.O. 
Box 750332, Dallas, TX\ + USA\ + [schiou@smu.edu](schiou@smu.edu){.uri}\ + +author: +- Kyu Hyun Kim, Sangwook Kang, and Sy Han Chiou +bibliography: +- 2022-185_R3.bib +title: "Fitting a Quantile Regression Model for Residual Life with the R + Package [**qris**](https://CRAN.R-project.org/package=qris)" +--- + +:::::::::::::::::::::::::::::::: article +## Introduction {#sec:intro} + +In the analysis of time-to-event data, standard statistical inference +procedures often focus on quantities based on failure time and its +relationship with covariates measured at baseline. However, throughout +the follow-up process, inference procedures based on residual life +become increasingly intuitive for assessing the survival of subjects and +can offer insights into the effectiveness of treatments in prolonging +the remaining lifetime. As covariates can substantially change over time +and models based solely on baseline covariates have limited potential +for long-term prognosis, there is a growing interest in modeling the +remaining lifetime of a surviving subject with updated patient +information. Many efforts have been made to model the mean residual life +including proportional mean residual life models +[@maguluri1994estimation; @oakes1990note; @oakes2003inference; @chen2005semiparametric], +additive mean residual life models +[@chen2006linear; @chen2007additive; @zhang2010goodness], and +proportional scaled mean residual life models [@liu2008regression]. +Given that failure times are usually right-skewed and heavy-tailed, the +mean of the residual life might not be identifiable if the follow-up +time is not sufficiently long. For this reason, quantiles, which are +robust under skewed distribution, have traditionally been used more +frequently as alternative summary measures. 
For example, the approach on +the semiparametric quantile regression model for continuous responses +[@koenker1978regression] has been extended to uncensored failure time +data [@jung1996quasi; @portnoy1997gaussian; @wei2006quantile] and +censored failure times data +[@ying1995survival; @portnoy2003censored; @peng2008survival; @huang2010quantile]. + +When the outcome variable is the residual life, semiparametric quantile +models that apply the inverse probability of censoring weighting (IPCW) +principle to address right-censored observations have been explored +[@jung2009regression; @kim2012censored; @li2016quantile]. These +approaches are based on non-smooth estimating functions with respect to +regression parameters, and the estimates of the regression parameters +are obtained either through zero-crossing of non-smooth estimating +functions using grid search techniques [@jung2009regression] or by +optimizing non-smooth objective functions with $L_1$-minimization +algorithms [@kim2012censored; @li2016quantile]. While these methods are +relatively straightforward to implement, an additional challenge lies in +standard error estimation, which necessitates the computationally +intensive use of a multiplier bootstrap method [@li2016quantile]. +Alternatively, @jung2009regression and @kim2012censored utilized the +minimum dispersion statistic and the empirical likelihood method, +respectively, to bypass the need to directly estimate the variance of +the regression parameter estimator for hypothesis testing and +constructing confidence intervals. The non-smooth nature of the +estimating functions in these approaches precludes the estimation of +variance using the robust sandwich-type variance estimator typically +employed in equation-based estimation methods. To lessen the associated +computational burden, an induced smoothing was proposed +[@brown2005standard], which modifies the non-smooth estimating equations +into smooth ones. 
Leveraging the asymptotic normality of the non-smooth +estimator, the smooth estimating functions are constructed by averaging +out the random perturbations inherent in the non-smooth estimating +functions. The resulting estimating functions become smooth with respect +to the regression parameters, allowing for the straightforward +application of standard numerical algorithms, such as the Newton-Raphson +method. Furthermore, these smoothed estimating functions facilitate the +straightforward computation of variances using the robust sandwich-type +estimator. The induced smoothing approach has been employed in fitting +semiparametric accelerated failure time (AFT) models via the rank-based +approach +[@johnson2009induced; @aftgeepackage; @chiou2015semiparametric; @Kang:fitt:2016]. +Regarding quantile regression, @choi2018smoothed considered the induced +smoothing approach under a competing-risks setting. All of these methods +are based on modeling event times. Recently, @kim2023smoothed proposed +an induced smoothing estimator for fitting a semiparametric quantile +regression model for residual life. + +The availability of published R packages for fitting quantile regression +models is somewhat limited. The `rq()`, `nlrq()`, `rqss()`, and `crq()` +functions in the package +[**quantreg**](https://CRAN.R-project.org/package=quantreg) +[@quantregpackage] are predominantly used and provide various features +for fitting linear, nonlinear, non-parametric, and censored quantile +regression models, respectively. The `rq()` function minimizes +non-smooth objective functions to obtain point estimates of regression +coefficients and can accommodate right-censored survival times by +incorporating weights. By redefining survival times as the remaining +lifetime at time $t_0$, one can also obtain a non-smoothed estimator for +quantile regression models for residual life [@kim2012censored]. 
On the +other hand, the `nlrq()` function is designed to fit a nonlinear +quantile regression model, while the `rqss()` function fits additive +quantile regression models with nonparametric terms, including +univariate components and bivariate components, using smoothing splines +and total variation regularization techniques +[@koenker1994quantile; @koenker2004penalized]. Furthermore, the `crq()` +function fits a quantile regression model for censored data on the +$\tau$-th conditional quantile function of the response variable. +Overall, the [**quantreg**](https://CRAN.R-project.org/package=quantreg) +implements three methods for handling right-censored survival times: +@powell1986censored's estimator, @portnoy2003censored's estimator and +@peng2008survival's estimator. However, none of the implemented methods +in the `nlrq()`, `rqss()`, or `crq()` functions are applicable for +handling censored residual life using the induced smoothing methods. The +only function that implements the induced smoothing method is the +`aftsrr()` function in the package +[**aftgee**](https://CRAN.R-project.org/package=aftgee) +[@aftgeepackage], but it is specifically designed for fitting +semiparametric AFT models, which are not directly applicable to fitting +quantile regression models. + +Other R packages that can be used to fit quantile regression models for +survival data include the package +[**ctqr**](https://CRAN.R-project.org/package=ctqr) [@ctqrpackage], +package [**Brq**](https://CRAN.R-project.org/package=Brq) [@Brqpackage], +package [**brms**](https://CRAN.R-project.org/package=brms) +[@brmspackage], and package +[**cmprskQR**](https://CRAN.R-project.org/package=cmprskQR) +[@cmprskQRpackage]. The `ctqr()` function in the package +[**ctqr**](https://CRAN.R-project.org/package=ctqr) implements the +methods proposed in @ctqrpackage for right or interval-censored failure +times with left-truncation. 
The `Bqr()` function in the package +[**Brq**](https://CRAN.R-project.org/package=Brq) implements Bayesian +methods based on the asymmetric Laplace distribution. In the package +[**brms**](https://CRAN.R-project.org/package=brms), the `brm()` +function with the `family=asym_laplace()` option enables the +implementation of full Bayesian inference. The `crrQR()` function in the +package [**cmprskQR**](https://CRAN.R-project.org/package=cmprskQR) +allows fitting quantile regression models with competing risks. All of +these R packages are designed for fitting quantile regression models for +failure times defined from a baseline and are not applicable to the +residual life setting. + +The recently developed R package +[**qris**](https://CRAN.R-project.org/package=qris) [@R:qris] provides +an efficient tool for fitting semiparametric quantile regression models +for residual life subject to right censoring. The +[**qris**](https://CRAN.R-project.org/package=qris) package offers three +methods for estimating the regression parameters: $L_1$-minimization of +non-smooth objective functions, induced smoothing with a non-iterative +approach, and an iterative procedure. For standard error estimation, the +[**qris**](https://CRAN.R-project.org/package=qris) package provides two +resampling-based approaches: the partial multiplier bootstrap and the +full multiplier bootstrap methods. The partial multiplier bootstrap +method utilizes the robust sandwich-type estimator by incorporating the +sample variance of perturbed estimating functions, while the full +multiplier bootstrap method is obtained by considering the sample +variance from the solutions of perturbed estimating functions. 
To +enhance the interpretability of results, the +[**qris**](https://CRAN.R-project.org/package=qris) package incorporates +graphical visualizations of covariate effects at different quantiles and +base times, utilizing the plotting environment similar to that in the +[**ggplot2**](https://CRAN.R-project.org/package=ggplot2) package +[@ggplot2package], thereby allowing for extensive flexibility and +customization. The ultimate goal of creating the +[**qris**](https://CRAN.R-project.org/package=qris) package is to +facilitate the easy incorporation of quantile regression for residual +life into daily routines. The package +[**qris**](https://CRAN.R-project.org/package=qris) is available on the +Comprehensive R Archive Network (CRAN) at +. + +The rest of the article is organized as follows: Section [2](#sec:nsm) +introduces a semiparametric regression model for quantiles of residual +life and the estimation methods implemented in the package. +Section [3](#sec:implementation) provides details about computing +algorithms. Illustrations of the package using a simulated dataset and +the real data from the North Central Cancer Treatment Group are +presented in Section [4](#sec:illustration). Finally, in +Section [5](#sec:conclusion), concluding remarks are provided along with +some discussions. + +## Semiparametric quantile regression for residual life {#sec:nsm} + +Define $T$ as the potential failure time that is subject to right +censoring by $C$ and $\mathbf{X}$ as a $p \times 1$ vector of +covariates, where $p$ is the number of covariates, including an +intercept. The observed data consists of $n$ independent copies of +$(Z, \delta, \mathbf{X})$, where $Z = \min(T, C)$, +$\delta = I(T \leq C)$, and $I(\cdot)$ is an indicator function. We also +assume $T$ and $C$ are marginally independent. Define the $\tau$-th +quantile of the residual life at $t_0 > 0$ as $\theta_{\tau}(t_0)$ that +satisfies +$P(T_i - t_0 \geq \theta_{\tau}(t_0) \ | \ T_i > t_0) = 1 - \tau$. 
We +consider the semiparametric quantile regression model for the residual +life [@kim2012censored; @kim2023smoothed]. Given $T_i > t_0$, +$$\label{qr:mod1} + \log(T_i - t_0) = \mathbf{X}_{i}^{\top}\boldsymbol{\mathbf{\beta}}_0(\tau, t_0) + \epsilon_i, i = 1, \ldots, n, %\label{qr:mod2} (\#eq:qrmod1)$$ +where $\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)$ is a $p \times 1$ +vector of regression coefficients, and $\epsilon_i$ is a random error +having zero $\tau$-th quantile. The quantile regression model for a +continuous response [@koenker1978regression] is a special case of +Equation \@ref(eq:qrmod1) when $t_0 = 0$. For ease of notation, we omit +$\tau$ and $t_0$ in $\boldsymbol{\mathbf{\beta}}_0(\tau, t_0)$ and +$\theta_{\tau}(t_0)$ and write $\boldsymbol{\mathbf{\beta}}_0$ and +$\theta$. We present different estimation procedures to estimate +$\boldsymbol{\mathbf{\beta}}_0$ given $\tau$ and $t_0$ in the following. + +### Estimation using non-smooth functions {#sec:nsm:pt} + +When there is no censoring, an estimator for $\beta_0$ in +Equation \@ref(eq:qrmod1) can be obtained by solving the estimating +equation [@kim2012censored], where +$$\label{eq:ns:obj1} + \frac{1}{n}\sum_{i=1}^{n}I[T_i \ge t_0] \mathbf{X}_i \left\{I\left[\log(T_i - t_0) \leq \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right] - \tau \right\} = 0. (\#eq:nsobj1)$$ +However, Equation \@ref(eq:nsobj1) cannot be directly used when +$T_i - t_0$ is subject to right censoring. The IPCW technique can be +incorporated into Equation \@ref(eq:nsobj1) to account for the right +censoring [@li2016quantile]. 
Specifically, in the presence of right +censoring, the estimator for $\boldsymbol{\mathbf{\beta}}_0$ in +Equation \@ref(eq:qrmod1) can be obtained as the root of the following +weighted estimating equations: +$$\label{eq:nsm:ipw} + U_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = \frac{1}{n}\sum_{i=1}^{n}I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0)} -\tau \right\}, (\#eq:nsmipw)$$ +where $\widehat{G}(\cdot)$ is the Kaplan-Meier estimate of the survival +function $G(\cdot)$ of the censoring time $C$ and +$\widehat{G}(t) = \prod_{i: t_i \leq t} (1 - \sum_{j=1}^n (1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n I(Z_j \geq t_i))$. +A computational challenge arises because the exact solution to +Equation \@ref(eq:nsmipw) might not exist due to the non-smoothness in +$\beta$ caused by the involvement of indicator functions. When the exact +solutions do not exist, the root of Equation \@ref(eq:nsmipw) can be +approximated by minimizing the $L_1$-objective function +$L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau)$ [@li2016quantile], +$$\begin{aligned} + \label{l1:nsm} + \nonumber + L_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/\widehat{G}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\beta \right| + \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}(Z_l)/\widehat{G}(t_0)}\right| + + \ \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l I[Z_l > t_0]\right|, +\end{aligned} (\#eq:l1nsm)$$ +where $M > 0$ bounds +$\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n - \mathbf{X}_i \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/ \widehat{G}(t_0)}\right|$ +and +$\left| \boldsymbol{\mathbf{\beta}}^{\top}\sum_{i=1}^n 2\tau \mathbf{X}_i I[Z_i > t_0]\right|$ +from above. 
Numerically, the limit $M$ is set to be an extremely large +number, and the `qris()` function uses $M = 10^6$. Denote the resulting +estimator to be $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. It +has been shown that $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$ +is consistent for $\boldsymbol{\mathbf{\beta}}_0$ and asymptotically +normally distributed [@li2016quantile]. + +Despite the well-established asymptotic properties, directly estimating +the variance of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$ is +impractical because it involves the derivative of non-smooth functions. +A multiplier bootstrap method has typically been employed +[@li2016quantile] to address this difficulty. The multiplier bootstrap +method considers the perturbed version of $U_{t_0}(\beta, \tau)$, +defined as +$$\label{eq:nsm:rev} + U_{t_0}^{\ast}(\beta, \tau) = \frac{1}{n}\sum_{i=1}^{n} \eta_i I[Z_i \ge t_0] \mathbf{X}_i \left\{I \left[\log(Z_i - t_0) \leq \mathbf{X}_i^{\top} \boldsymbol{\mathbf{\beta}} \right]\frac{\delta_i}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} -\tau \right\}, (\#eq:nsmrev)$$ +where $\eta_i, i = 1, \ldots, n,$ are independently and identically +(iid) generated from a positive random variable with unity mean and +variance, and $\widehat{G}^\ast(\cdot)$ is a perturbed version of +$\widehat{G}(\cdot)$, constructed as $\widehat{G}^\ast(t) = +\prod_{i: t_i \leq t} (1 - \sum_{j=1}^n \eta_j(1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n \eta_jI(Z_j \geq t_i))$ +for a given realization of $\eta_i$. 
On the other hand, a perturbed +$L_1$-objective function, denoted as +$L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)$, can be similarly +constructed, where +$$\begin{aligned} + L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} \left| \log(Z_i - t_0) - \mathbf{X}_i^{\top}\boldsymbol{\mathbf{\beta}} \right| + \nonumber \\ + & \left| M - \boldsymbol{\mathbf{\beta}}^{\top}\sum_{l=1}^n - \mathbf{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}^{\ast}(Z_l)/\widehat{G}^{\ast}(t_0)}\right| + + \ \left| M - \beta^{\top}\sum_{l=1}^n 2\tau \mathbf{X}_l \eta_l I[Z_l > t_0]\right|. +\end{aligned}$$ +Solving for $U_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau) = 0$, or +equivalently, minimizing +$L_{t_0}^{\ast}(\boldsymbol{\mathbf{\beta}}, \tau)$, yields one +realization of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. The +multiplier bootstrap variance is computed as the sample variance of a +large number of realizations of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. + +### Estimation using induced smoothed functions {#sec:IS:pt} + +The regression coefficient in Equation \@ref(eq:qrmod1) can be more +efficiently obtained through the induced smoothed version of +Equation \@ref(eq:nsmipw). The induced smoothed estimating functions are +constructed by taking the expectation with respect to a mean-zero random +noise added to the regression parameters in Equation \@ref(eq:nsmipw). 
+Specifically, +$$\begin{aligned} +\label{eq:is} + \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, H) & = E_w \{U_{t_0}(\boldsymbol{\mathbf{\beta}}+\mathbf{H}^{1/2}\mathbf{W}, \tau)\}\nonumber\\ + & = \frac{1}{n} \sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}}-\log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0) } -\tau \right\}, +\end{aligned} (\#eq:is)$$ +where $\mathbf{H} = O(n^{-1})$, $\mathbf{W} \sim N(0, \mathbf{I}_p)$ is +a standard normal random vector, $\mathbf{I}_p$ is the $p \times p$ +identity matrix, and $\Phi(\cdot)$ is the cumulative distribution +function of a standard normal random variable. A typical choice for +$\mathbf{H}$ is to fix it at $n^{-1}\mathbf{I}_p$, while some +alternative choices are explored in @chiou2015rank. Let +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ be the solution to +$\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = 0$. +Since Equation \@ref(eq:is) is a smooth function in +$\boldsymbol{\mathbf{\beta}}$, the estimator can be obtained using +standard numerical algorithms such as the Newton-Raphson method. +Moreover, the induced smoothed estimator for +$\boldsymbol{\mathbf{\beta}}_0$ has been shown to be asymptotically +equivalent to its non-smooth counterpart [@kim2023smoothed]. + +Following the idea in Section [2.1](#sec:nsm:pt), the multiplier +bootstrap procedure can be similarly employed to estimate the variance +of $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$. 
The perturbed +version of Equation \@ref(eq:is) takes the form of +$$\label{eq:7} + \widetilde{U}^{\ast}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H}) = \frac{1}{n} \sum_{i=1}^{n} \eta_i I[Z_i > t_0] \mathbf{X}_i \left\{ \Phi\left(\frac{\mathbf{X}_i^\top\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{\mathbf{X}_i^{\top} \mathbf{H} \mathbf{X}_{i}}}\right)\frac{\widehat{G}^{\ast}(t_0) \delta_i}{\widehat{G}^{\ast}(Z_i)} -\tau \right\}. (\#eq:7)$$ +The multiplier bootstrap procedure estimates the variance of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ by calculating the +sample variance of a large number of realizations of +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ obtained by +repeatedly solving Equation \@ref(eq:7). + +It has been shown that the asymptotic variance +$\mathop{\rm Var}\nolimits(\boldsymbol{\mathbf{\beta}}, \tau)$ can be +decomposed into +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})^{\top} \mathbf{V}(\boldsymbol{\mathbf{\beta}}) \mathbf{A}(\boldsymbol{\mathbf{\beta}})$ +[@kim2023smoothed], where the two components, +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$ and +$\mathbf{V}(\boldsymbol{\mathbf{\beta}})$, can be estimated separately. +Since Equation \@ref(eq:is) is a smooth function in +$\boldsymbol{\mathbf{\beta}}$, the slope matrix, +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$, can be conveniently estimated +by differentiating +$\widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})$ +with respect to $\boldsymbol{\mathbf{\beta}}$. 
The explicit form of +$\mathbf{A}(\boldsymbol{\mathbf{\beta}})$ is as follows: +$$\begin{aligned} + \label{eq:cov:slp} + \mathbf{A}(\boldsymbol{\mathbf{\beta}}) & = \frac{\partial \widetilde{U}_{t_0}(\boldsymbol{\mathbf{\beta}}, \tau, \mathbf{H})}{\partial \boldsymbol{\mathbf{\beta}}} \nonumber \\ + & = \frac{1}{n}\sum_{i=1}^{n} I[Z_i > t_0] \mathbf{X}_i \frac{G(t_0) \delta_i}{G(Z_i)} \phi\left(\frac{{\mathbf{X}_i}^{\top}\boldsymbol{\mathbf{\beta}} - \log(Z_i-t_0)}{\sqrt{{\mathbf{X}_i}^{\top}\mathbf{H} \mathbf{X}_i}}\right)\left(\frac{-{\mathbf{X}_i}}{\sqrt{{\mathbf{X}_i}^{\top} \mathbf{H} {\mathbf{X}_i}}}\right), +\end{aligned} (\#eq:covslp)$$ +where $\phi (\cdot)$ is the density function of the standard normal +random variable. + +The slope matrix, +$\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$, +can be evaluated directly by plugging in +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ and +$\widehat{G}(\cdot)$. On the other hand, the variance of the estimating +function, $\widehat{\mathbf{V}}(\boldsymbol{\mathbf{\beta}})$, can be +obtained by a computationally efficient resampling method motivated by +the multiplier bootstrap procedure in Section [2.1](#sec:nsm:pt). +Specifically, we propose estimating +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +as the sample variance of a large set of realizations of the perturbed +version of +$\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}, \tau, \mathbf{H})$ +presented in Equation \@ref(eq:7). We refer to this procedure as the +partial multiplier bootstrapping approach because it utilizes the +perturbed estimating function, similar to the full multiplier +bootstrapping approach, but the computation of +$\widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +and +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +does not involve the repeated solving of the perturbed estimating +equations. 
Thus, the partial multiplier bootstrapping approach is +expected to be computationally more efficient than the multiplier +bootstrap method. A similar procedure and its performance have been +studied in modeling failure times with semiparametric AFT models +[@chiou2014fast; @aftgeepackage]. + +### Iterative procedure in induced smoothing estimation {#sec:iter} + +The induced estimator $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ +is obtained with a fixed $\mathbf{H}$, as described in +Section [2.2](#sec:IS:pt), and its variance is estimated separately. +This estimation procedure can be viewed as a special case of the +following iterative procedure, which updates $\mathbf{H}$ and +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ iteratively. +Specifically, the iterative algorithm utilizes the Newton-Raphson method +while sequentially updating +$\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS}$ and +$\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IS})$ +until convergence. Similar iterative algorithms have also been +considered previously in the induced smoothing approach for +semiparametric AFT models +[@johnson2009induced; @chiou2014fast; @chiou2015semiparametric; @choi2018smoothed]. +The iterative procedure is summarized as follows: + +**Step 1:** + +: Set the initial values + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}$, + $\widehat{\mathbf{\Sigma}}^{(0)} = \mathbf{I}_{p}$, and + $\mathbf{H}^{(0)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(0)}$. 
+ +**Step 2:** + +: Given $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and + $\mathbf{H}^{(k)}$ at the $k$-th step, update + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ by + $$\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}=\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)} - \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)})^{-1}{\widetilde{U}_{t_0}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}, \tau, \mathbf{H}^{(k)}}).$$ + +**Step 3:** + +: Given $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}$ and + $\widehat{\mathbf{\Sigma}}^{(k)}$, update + $\widehat{\mathbf{\Sigma}}^{(k)}$ by + $$\widehat{\mathbf{\Sigma}}^{(k+1)} = \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1} \widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau) \widehat{\mathbf{A}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)})^{-1}.$$ + +**Step 4:** + +: Set $\mathbf{H}^{(k+1)} = n^{-1}\widehat{\mathbf{\Sigma}}^{(k+1)}$. + Repeat Steps 2, 3 and 4 until + $\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and + $\widehat{\mathbf{\Sigma}}^{(k)}$ converge. + +The initial value, $\widehat{\boldsymbol{\mathbf{\beta}}}^{(0)}$, could +be chosen as $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny NS}$. We +define $\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}$ and +$\widehat{\boldsymbol{\mathbf{\Sigma}}}_{\tiny IT}$ as the values of +$\widehat{\boldsymbol{\mathbf{\beta}}}^{(k)}$ and +$\widehat{\mathbf{\Sigma}}^{(k)}$ at convergence, and +$\widehat{\mathop{\rm Var}\nolimits}(\widehat{\boldsymbol{\mathbf{\beta}}}_{\tiny IT}) = n^{-1}\widehat{\mathbf{\Sigma}}_{\tiny IT}$. +In Step 3, +$\widehat{\mathbf{V}}(\widehat{\boldsymbol{\mathbf{\beta}}}^{(k+1)}, \tau)$ +is obtained using the partial multiplier bootstrap approach. However, +the full multiplier bootstrap approach can also be employed but would +require longer computation times. 
+ +## Package implementation {#sec:implementation} + +The main function in the +[**qris**](https://CRAN.R-project.org/package=qris) package for +estimating the regression parameters in the quantile regression model +for residual life is the `qris()` function. The `qris()` function is +written in C++ and incorporated into R using the +[**Rcpp**](https://CRAN.R-project.org/package=Rcpp) [@Rcpppackage] and +[**RcppArmadillo**](https://CRAN.R-project.org/package=RcppArmadillo) +[@RcppArmadillopackage] packages. The synopsis of `qris` is: + +::: example +\> args(qris) function (formula, data, t0 = 0, Q = 0.5, nB = 100, method += c(\"smooth\", \"iterative\", \"nonsmooth\"), se = c(\"fmb\", \"pmb\"), +init = c(\"rq\", \"noeffect\"), verbose = FALSE, control = +qris.control()) +::: + +The required argument is `formula`, which specifies the quantile +regression model to be fitted using the variables in `data`. The +`formula` assumes that the response variable is a '`Surv`' object +created by the `Surv()` function in the +[**survival**](https://CRAN.R-project.org/package=survival) package +[@survivalpackage]. This formula structure is commonly adopted for +handling survival data in R, as seen in functions like `survreg()` and +`coxph()` in the +[**survival**](https://CRAN.R-project.org/package=survival) package. The +argument `t0` specifies the base time used in defining residual life. +The default value of `t0` is set to zero, in which case residual life +reduces to a failure time. The `Q` argument is used to specify the +target quantile of residual life to estimate, with the default value +being set to 0.5 (median). The `nB` argument specifies the bootstrapping +size used in standard error estimation, with the default value set to +100. 
The `method` argument specifies one of the three estimation +methods: `"nonsmooth"`, `"smooth"`, and `"iterative"`, corresponding to +the estimating procedures outlined in Sections [2.1](#sec:nsm:pt), +[2.2](#sec:IS:pt), and [2.3](#sec:iter), respectively. Given the point +estimates of the regression parameters, their standard errors can be +estimated using one of two implemented methods: `se = "fmb"` and +`se = "pmb"`. The `se = "fmb"` method employs a full-multiplier +bootstrapping approach to estimate the variance by the sample variance +of large realizations of $\widehat\beta$. The `se = "pmb"` method +estimates the variance using a robust sandwich variance estimator and +employs the computationally efficient partial multiplier bootstrapping +approach described in Section [2.2](#sec:IS:pt). The `"fmb"` option is +available for all three point estimation methods, whereas the `"pmb"` +option is not available for the `"nonsmooth"` point estimation method +due to the lack of a closed-form sandwich variance estimator. The `init` +argument allows users to specify the initial value for estimating +regression parameters by either a $p$-dimensional numerical vector or a +character string. In the latter case, the options `init = "rq"` and +`init = "noeffect"` correspond to the point estimate obtained from the +`rq()` function in the +[**quantreg**](https://CRAN.R-project.org/package=quantreg) package and +a $p$-dimensional vector of zeros, respectively. The default value for +`init` is `init = "rq"`. Among the three methods implemented for point +estimation, `method = "smooth"` and `method = "nonsmooth"` are +non-iterative, in the sense that point estimation is performed +separately from the estimation of standard errors. On the other hand, +`method = "iterative"` calculates point estimates and the corresponding +standard error estimates simultaneously through iterative updates. 
When +`method = "iterative"`, users can define specific convergence criteria +using `qris.control()`. The available options in `qris.control()` are as +follows. + +::: example +\> args(qris.control) function (maxiter = 10, tol = 0.001, trace = +FALSE) +::: + +The `maxiter` argument specifies the maximum number of iterations. The +default value for `maxiter` is ten, as the proposed algorithm typically +converges within ten steps based on our exploration. The convergence +tolerance is controlled using the `tol` argument, which has a default +value of `1e-3`. The `trace` argument takes a logical value and is used +to determine whether to print the result for each iteration. The default +setting is `trace = FALSE`. The '`qris`' object is fully compatible with +many of R's generic functions, including `coef()`, `confint()`, +`plot()`, `predict()`, `print()`, `residuals()`, `summary()`, and +`vcov()`. + +Among the available `S3` methods, a unique feature of the +[**qris**](https://CRAN.R-project.org/package=qris) package's `S3 plot` +method, when applied to a '`qris`' object, is its ability to +automatically update the original object by extending the range of +$\tau$ or $t_0$ values. This extension enables the generation of a +covariate effect plot over the newly specified values of $\tau$ or +$t_0$, providing a comprehensive visualization of the covariate effects +across the extended range. The `S3` method for plotting a '`qris`' +object is shown below. + +::: example +\> argsAnywhere(plot.qris) function (x, t0s = NULL, Qs = NULL, nB = +NULL, vari = NULL, byQs = FALSE, ggextra = NULL, \...) NULL +::: + +The argument `x` is a '`qris`' object created using the `qris()` +function. The `t0s` and `Qs` arguments are numeric vectors that enable +users to specify the values of $t_0$ or $\tau$ for plotting the +covariate effect. 
If `t0s` and `Qs` are not specified, the covariate
+effects are plotted against $\tau = 0.1, 0.2, \ldots, 0.9$ at the base
+time ($t_0$) inherited from the '`qris`' object specified in `x`. The
+`nB` argument is a numerical variable that controls the sample size for
+bootstrapping, used to compute standard error estimations based on the
+variance estimation specified in the original '`qris`' object. When `nB`
+is specified, the function calculates standard errors for all
+combinations of $t_0$ and $\tau$ specified in `t0s` and `Qs`, computes
+95% confidence intervals accordingly, and includes them in the covariate
+effect plot. The `vari` argument is a character string that allows users
+to specify the names of the covariates they want to display in the
+effect plots. When the `vari` argument is not specified, all covariates
+will be included in the plots by default. The covariate effect plot can
+be plotted against the specified quantiles by setting `byQs = TRUE` or
+against the specified base times by setting `byQs = FALSE`. Finally, the
+`ggextra` argument allows users to pass additional graphical parameters
+to the [**ggplot2**](https://CRAN.R-project.org/package=ggplot2)
+package, offering further customization options for the plots. When the
+`plot()` function is called, it internally invokes the `qris.extend()`
+function to compute the covariate effects at additional values. The
+syntax for the `qris.extend()` function is provided below:
+
+::: example
+\> args(qris.extend) function (x, t0s = NULL, Qs = NULL, nB = NULL, vari
+= NULL) NULL
+:::
+
+The arguments in `qris.extend()` are inherited from the arguments
+specified in the `plot()` function. To reduce runtime when repeatedly
+calling the `plot()`, one can calculate the desired covariate effects by
+applying `qris.extend()` outside of `plot()` first and then supply the
+results to `plot()`. 
This approach allows for pre-computation of the +covariate effects, making it more efficient when generating multiple +plots. Overall, the unique plotting feature in +[**qris**](https://CRAN.R-project.org/package=qris) provides users with +a seamless and effortless approach to conducting a comprehensive +assessment of the covariate effects across different quantiles or base +times. + +## Illustration {#sec:illustration} + +### Simulated data {#subsec:simulation} + +In this subsection, we present a simple simulation example to validate +the implementations in the proposed +[**qris**](https://CRAN.R-project.org/package=qris) package. The +simulation involves five covariates, denoted as $X_1, \ldots, X_5$. +Among these covariates, $X_1$ and $X_4$ follow a standard uniform +distribution, $X_2$ follows a binomial distribution with a success +probability of 0.5, $X_3$ follows a standard normal distribution, and +$X_5$ follows a standard exponential distribution. We assume that +$X_2, X_3, X_4$, and $X_5$ do not impact the residual life, meaning +their corresponding coefficient values $\beta_2$, $\beta_3$, $\beta_4$, +and $\beta_5$ are zero. The survival time $T$ is generated from a +Weibull distribution with the survival function +$S(t) = \exp\{-(\rho t)^\kappa\}$ for $t > 0$, where $\kappa = 2$, and +$\rho$ is obtained by solving +$$\label{eq:sim:weibull} + \rho^{-1}\{ (\rho t_0)^\kappa - \log (1-\tau) \}^{(1/\kappa)}- t_0 = \exp\{\beta_0 + \beta_1 X_1\}, (\#eq:simweibull)$$ +for a specified $t_0$ and $\tau$. We set the intercept +$\beta_0 = \log(5)$ and $\beta_1 = \log(2)$ at $t_0 = 0$. Given $\rho$, +$\tau$, and $X_1$, the true values of $\beta_0$ and $\beta_1$ can be +obtained sequentially from Equation \@ref(eq:simweibull) for different +$t_0 > 0$. In our case, the corresponding true values of $\beta_0$ are +approximately 1.411 and 1.219 for $t_0=1$ and 2, respectively. 
+Similarly, the true values of $\beta_1$ are approximately 0.797 and +0.907 for $t_0=1$ and 2, respectively. The closed-form expression for +generating $T$ is then $\{ -\log(1 - u) \}^{1/\kappa} / \rho$, where $u$ +is a uniform random variable over $(0, 1)$. Given these specifications, +we have implemented the `data.gen()` function to generate simulation +data. The `data.gen()` function takes four arguments: `n`, `t0`, `cen`, +and `Q`, representing the sample size, $t_0$, censoring proportion, and +$\tau$, respectively. We generate censoring times $C$ from an +independent uniform distribution over $(0, c)$, where $c$ is chosen to +achieve the desired censoring proportions of 10% and 30%. Using the +generated dataset, we fit the model using three different estimation +methods: induced smoothing, non-smooth, and iterative-induced smoothing. +All analyses were conducted on a 4.2 GHz Intel(R) quad Core(TM) i7-7700K +central processing unit (CPU) using R 4.3.0 [@r2021]. The following code +demonstrates the implementation of `data.gen()` to generate a simulation +dataset. + +The `data.gen()` function generates a `data.frame` containing seven +variables. The `Time` variable represents the observed survival time, +while the `status` variable serves as the event indicator, taking the +value 1 for observed events and 0 for censored observations. The +variables `X1`, $\ldots$, `X5` are the covariates. The implementation in +the `data.gen()` function generates the Weibull survival times using the +inverse probability integral transform technique. Alternatively, users +can use the `rweibull()` function with the parameters `shape = 2` and +`scale = 1 / rho` to generate these Weibull survival times directly. + +We assess the performance of the proposed implementation across various +scenarios, including three sample sizes ($n = 200, 400, 1000$), three +levels of $t_0$ ($0, 1, 2$), two censoring proportions (10% and 30%), +and two values of $\tau$ (0.25 and 0.50). 
For a given dataset, we apply +the full-multiplier bootstrapping approach with 200 bootstrap samples to +all three available estimating procedures: `method = "nonsmooth"`, +`method = "smooth"`, and `method = "iterative"`. To facilitate the +evaluation process, we create the `do_fmb()` function to record the +coefficient estimates, standard errors, and computing times for fitting +a single simulated dataset generated from `data.gen()`. The following is +the implementation of the `do_fmb()` function and the corresponding code +to run the simulation with 200 replications. We present the code and +result of the simulation experiments conducted at three different sample +sizes, with $t_0$ values set to 0 and 1, while holding the censoring +proportion at 30% and $\tau$ value at 0.5. The results for other +simulation scenarios are provided in the Supplementary Materials. + +::: example +\> do_fmb \<- function(n, t0, cen, Q, nB) + dat \<- data.gen(n, t0, +cen, Q) + fm \<- Surv(Time, status)   X1 + X2 + X3 + X4 + X5 + stamp \<- +NULL + stamp\[1\] \<- Sys.time() + f1 \<- qris(fm, data = dat, t0 = t0, +Q = Q, nB = nB, method = \"smooth\", se = \"fmb\") + stamp\[2\] \<- +Sys.time() + f2 \<- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method += \"nonsmooth\", se = \"fmb\") + stamp\[3\] \<- Sys.time() + f3 \<- +qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = \"iterative\", se += \"fmb\") + stamp\[4\] \<- Sys.time() + list(smooth = +c(f1$coef, f1$std), + nonsmooth = c(f2$coef, f2$std), + iter = +c(f3$coef, f3$std), + times = diff(stamp)) + \> B \<- 200 \> set.seed(2) +\> sims0_fmb \<- mapply(function(n, t0) + replicate(B, do_fmb(n, t0 = +t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, +0), SIMPLIFY = F) \> sim1_fmb \<- mapply(function(n, t0) + replicate(B, +do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, +1000), t0 = c(1, 1, 1), SIMPLIFY = F) +::: + +Figure \@ref(fig:sim1) displays violin plots that provide visualizations +of 
the empirical distribution of the coefficient estimates. As expected, +all three estimators exhibit small biases, which are calculated as the +difference between the point estimates (PE) and the true regression +coefficients. Furthermore, the empirical distributions of the PEs +demonstrate a normal-like shape, aligning with the asymptotic properties +of the proposed method [@li2016quantile; @kim2023smoothed]. When the +sample size is smaller (e.g., $n = 200$ and 400), the `nonsmooth` +approach appears to yield slightly larger empirical standard errors +(ESE) compared to the `smooth` or `iterative` approaches. However, when +$n = 1000$, the ESEs are similar across all approaches. On the other +hand, the comprehensive simulation results presented in Table 1 of the +Supplementary Materials confirm that all coefficient estimates closely +approximate the true regression coefficients. On the other hand, the +ESEs and the averaged estimated standard errors (ASE) are in close +agreement for all scenarios, indicating the validity of the variance +estimation. Furthermore, the computation times, which are presented +separately in the upper panel of Table \@ref(tab:time), indicate that +when employing the full multiplier bootstrapping approach, the +`nonsmooth` approach demonstrates a slight advantage in terms of +computational efficiency over the `smooth` approach, while the +`iterative` approach takes 5.1 to 9.5 times longer than the `smooth` +approach. In summary, the timing results show that the proposed method +can yield valid inference results within seconds, even with large +datasets of up to 1000 observations or when using the computationally +demanding full multiplier bootstrapping approach for variance +estimation. 
+ +::: figure* +```{r figsim1t0, echo=FALSE , fig.cap="t_0 = 0", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_t0_c3_Q50.png")) +``` + +\ + +```{r figsim1t1, echo=FALSE , fig.cap="t_0 = 1", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_t1_c3_Q50.png")) +``` +::: + +When $t_0 = 0$, the targeted semiparametric quantile regression model +for residual life simplifies to the standard quantile regression model +for survival time. In such cases, existing functions like `crq()` from +the [**quantreg**](https://CRAN.R-project.org/package=quantreg) package +[@quantregpackage] can be employed. A comparison between the performance +of `crq()` and our proposed implementation when $t_0 = 0$ is presented +in the Supplementary Materials, where the standard errors of the `crq()` +are obtained from the bootstrap method with 200 bootstrap samples. +Overall, the performance of `crq()` is comparable to the proposed +methods in terms of bias and standard errors. However, we have +occasionally encountered situations where the `crq()` function fails to +converge, particularly when the sample size is large, as in the case of +$n = 1000$. In the other extended simulation scenarios outlined in the +Supplementary Materials, which encompass various levels of $t_0$, +censoring proportions, and $\tau$, the proposed methods consistently +exhibit satisfactory performance across all settings. + +The true potential of the proposed smooth approach lies in its +capability for efficient variance estimation through the implementation +of the partial multiplier bootstrapping approach. This approach +eliminates the need for repetitive solving of estimating equations, +resulting in improved computational efficiency in variance estimation. 
+To demonstrate its usefulness, we conducted a simulation using both the +smooth approach and the iterative approach with the partial multiplier +bootstrapping approach (`se = "pmb"`). This simulation was conducted +under the settings of $\tau = 0.5$, $t_0 = 0$ and $1$, and a 30% +censoring rate. The `do_pmb()` function was accordingly modified as +follows. + +::: example +\> do_pmb \<- function(n, t0, cen, Q, nB) + dat \<- data.gen(n, t0, +cen, Q) + fm \<- Surv(Time, status)   X1 + X2 + X3 + X4 + X5 + stamp \<- +NULL + stamp\[1\] \<- Sys.time() + f1 \<- qris(fm, data = dat, t0 = t0, +Q = Q, nB = nB, method = \"smooth\", se = \"pmb\") + stamp\[2\] \<- +Sys.time() + f2 \<- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method += \"iterative\", se = \"pmb\") + stamp\[3\] \<- Sys.time() + list(smooth += c(f1$coef, f1$std), + iter = c(f2$coef, f2$std), + times = +diff(stamp)) + \> B \<- 200 \> set.seed(2) \> sims0_pmb \<- +mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = +.5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) +\> sims1_pmb \<- mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = +t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(1, 1, +1), SIMPLIFY = F) +::: + +The simulation results obtained using the partial multiplier +bootstrapping approach are presented in Figure \@ref(fig:sim2) and +Tables 7 -- 12 in the Supplementary Materials, while the computing times +are displayed in the lower panel of Table \@ref(tab:time). Overall, the +estimation results obtained using `se = "pmb"` in Figure \@ref(fig:sim2) +closely resemble those in Figure \@ref(fig:sim1) with `se = "fmb"`. As +seen in Tables 7 and 8, the ESEs from the non-iterative and iterative +methods are comparable, while the ASEs slightly overestimate the ESEs +when the sample size is small. The gaps are slightly smaller for the +iterative method, as shown in some cases +[@johnson2009induced; @kim2021comparison]. 
The magnitudes of the +differences are not large, and they also become smaller when the sample +size reaches $n = 1000$. More importantly, the computing times with +`se = "pmb"` show significant speed improvements compared to when +`se = "fmb"` is used in every case; we observed up to 79% timing +improvements. + +::: figure* +```{r figsim2t0, echo=FALSE , fig.cap="t_0 = 0", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_pmb_t0_c3_Q50.png")) +``` + +\ + +```{r figsim2t1, echo=FALSE , fig.cap="t_0 = 1", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="95.0%"} +knitr::include_graphics(c("vplot_pmb_t1_c3_Q50.png")) +``` +::: + +::: {#tab:time} + -------------------------------------------------------------------------------------------------------------------- + $t_0 = 0$ $t_0 = 1$ + ------------------------------------------ ----------- ----------- ------- ------- ----------- ------- ------- -- -- + l3ptr3pt)1-2l3ptr3pt)3-5 l3ptr3pt)6-8 se method 200 400 1000 200 400 1000 + + `fmb` Smooth 0.103 0.174 0.471 0.106 0.178 0.480 + + Nonsmooth 0.080 0.142 0.472 0.080 0.141 0.468 + + Iterative 0.981 1.500 2.410 0.985 1.567 2.882 + + `pmb` Smooth 0.022 0.052 0.223 0.022 0.053 0.224 + + Iterative 0.296 0.580 1.407 0.296 0.581 1.435 + -------------------------------------------------------------------------------------------------------------------- + + : Table 1: Runtimes (in seconds) when `se = fmb` and `se = pmb`. +::: + +After confirming the satisfactory performance of the proposed +methodologies, we now proceed to illustrate the application of the +`init` argument. This argument controls the initial values assigned to +the root-finding algorithm's estimates and the plotting capacity of the +[**qris**](https://CRAN.R-project.org/package=qris) package. 
For this +illustrative example, we consider a simpler simulation scenario that +involves a single binary covariate. This simplified simulation can be +generated using the revised version of the `data.gen()` function +provided below. + +::: example +\> \## Global parameters + rho0 \<- .2 \* sqrt(log(2)) + rho1 \<- .1 \* +sqrt(log(2)) \> data.gen \<- function(n) + dat \<- data.frame(censoring += runif(n, 0, 23.41), + Time0 = sqrt(-log(1 - runif(n))), + X = +rbinom(n, 1, .5)) + dat$Time0 <- ifelse(dat$X \> 0, +dat$Time0 / rho1, dat$Time0 / rho0) + dat$Time <- pmin(dat$Time0, +dat$censoring) + + dat$status \<- 1 \* (dat$Time0 < dat$censoring) + subset(dat, +select = c(Time, status, X)) + \> set.seed(10) \> head(dat \<- +data.gen(200)) Time status X 1 6.034713 1 1 2 7.181451 0 1 3 9.993908 0 +1 4 16.225520 0 1 5 1.993033 0 1 6 5.277471 0 0 +::: + +The updated `data.gen()` function returns a `data.frame` comprising +three variables: `Time`, `status`, and `X`, representing the observed +survival time, event indicator, and binary covariate, respectively. We +will first illustrate the usage of the argument `init` by considering +three different initial values: `init = "rq"`, `init = c(1,1)`, and a +random vector `init = rnorm(2)`, all used in conjunction with the smooth +estimator `method = "smooth"`. The following codes provide an example +with different initial values. + +::: example +\> (random \<- rnorm(2)) \[1\] 1.5025446 0.5904095 \> f1 \<- +qris(Surv(Time, status)   X, data = dat, t0 = 1, init = \"rq\", nB = 0) +\> f2 \<- update(f1, init = c(1, 1)) \> f3 \<- update(f1, init = random) +\> all.equal(f1$coef, f2$coef) \[1\] TRUE \> all.equal(f2$coef, f3$coef) +\[1\] TRUE +::: + +The '`qris`' object, with its `call` component, is compatible with the +`update()` function, a built-in function commonly used for updating the +attributes of an existing object without requiring redundant and +repetitive code. 
In the example above, we used the `update()` function +to modify the initial value specification in `f1`. We observed that +different initial values yield identical point estimates, thereby +affirming the robustness of the proposed method against fluctuations in +initial values. + +The covariate effects, along with their associated 95% point-wise +confidence intervals across various quantiles or base times, can be +visually assessed by applying the generic function `plot()` to a +'`qris`' object. We demonstrate this feature using the following `qris` +fit, where the standard errors are obtained using `se = "pmb"`, +$t_0 = 1$, and all other parameters are set to their default values. We +update the `qris` fit with extended quantiles over +${0.4, 0.5, 0.6, 0.7}$ and plot the covariate effects against these +quantiles using the `plot()` function. + +::: example +\> fit \<- qris(Surv(Time, status)   X, data = dat, t0 = 1, se = +\"pmb\") \> fit2 \<- qris.extend(fit, Qs = 4:7 / 10) +::: + +The extended '`qris`' fit generated by the `qris.extend()` function +inherits all the attributes from the original '`qris`' object and +includes additional `ggdat` components. The following code compares the +components of the returned values from the extended '`qris`' fit and the +original '`qris`' fit. + +::: example +\> class(fit2) \[1\] \"qris\" \> names(fit) \[1\] \"call\" +\"coefficient\" \"data\" \"formula\" \"para\" \[6\] \"stderr\" +\"varNames\" \"vcov\" \> setdiff(names(fit2), names(fit)) \[1\] +\"ggdat\" +::: + +Specifically, the extended '`qris`' fit inherits `call`, `coefficient`, +`para`, `stderr`, `varNames`, and `vcov` from the original '`qris`' +object. The `call` component is the function call from the original +`qris()` fit, while `coefficient`, `stderr`, and `vcov` are used to +store the point estimates, standard error estimates, and covariance +matrix, respectively. 
The `para` component is a list containing the
+parameters specified during the fitting of the quantile regression
+model, and `varNames` is a character string representing the variable
+names in the function call. The newly added value is `ggdat`.
+The `ggdat` is a data frame containing covariate information generated
+under the different quantiles and base times specified in the
+`qris.extend()`. Finally, the corresponding covariate effect plot can be
+generated by plotting the extended '`qris`' fit as follows.
+
+::: example
+\> plot(fit2)
+:::
+
+The true values of $\beta$'s at different quantiles and base times,
+computed from Equation \@ref(eq:simweibull), can be implemented in the
+following commands.
+
+::: example
+\> \## Global parameters \> r \<- 2:1 \* sqrt(log(2)) / 10 \> k \<- 2 \>
+\## Function to calculate true beta \> trueB \<- function(t0, tau) + b
+\<- log(1 / r \* ((r \* t0) \^ k - log(1 - tau))\^(1 / k) - t0) +
+c(b\[1\], b\[2\] - b\[1\]) + \> \## True beta calculation \> true_Q \<-
+c(t(sapply(4:7 / 10, trueB, t0 = 1))) \> true_t0 \<- c(t(sapply(1:3,
+trueB, tau = .5)))
+:::
+
+The following code extends the '`ggplot`' objects generated by
+`plot.qris()` by adding additional layers of true value curves and
+incorporating various `ggplot` options. The resulting figures,
+Figure \@ref(fig:figsimulation-quantile) and
+Figure \@ref(fig:figsimulation-t0), present the output based on whether
+the covariate effects are plotted against quantiles or base times,
+respectively. This observed trend aligns with the specifications
+described in Equation \@ref(eq:simweibull), where increasing $\tau$
+corresponds to an increasing $\beta_0$ while keeping $\rho$ and $X$
+fixed. On the other hand, the covariate effect does not change with
+quantiles but slightly increases with base times, echoing the model
+specification where $\beta_0$ is inversely related to $t_0$ and
+$\beta_1$ increases as $t_0$ increases. 
+
+::: example
+\> library(ggplot2) \> plot(fit2) + theme(legend.position =
+\"bottom\") + + geom_line(aes(x = Qs, y = true_Q, col = variable,
+linetype = \"True value\")) + + scale_linetype_manual(name = \"\",
+values = c(\"True value\" = \"dotdash\")) \> b \<- plot(fit2, t0s = 1:3,
+byQs = F) \> b + theme(legend.position = \"bottom\") + + geom_line(aes(x
+= t0s, y = true_t0, col = variable, + linetype = \"True value\")) + +
+scale_linetype_manual(name = \"\", values = c(\"True value\" =
+\"dotdash\"))
+:::
+
+::: figure*
+```{r figsimulation-quantile, echo=FALSE , fig.cap="Plot for Q\in\{0.4, \ldots, 0.7\} at t_0 = 1", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("simulation_smooth_quantile.png"))
+```
+
+```{r figsimulation-t0, echo=FALSE , fig.cap="Plot for t_0\in\{1, \ldots, 3\} at Q = 0.5", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"}
+knitr::include_graphics(c("simulation_smooth_t0.png"))
+```
+:::
+
+### North Central Cancer Treatment Group Lung Cancer Data {#subsec:lung}
+
+The North Central Cancer Treatment Group Lung Cancer Data records the
+survival of patients with advanced lung cancer, along with assessments
+of the patients' performance status measured by both physicians and the
+patients themselves [@loprinzi1994prospective]. The original objective
+of the study was to ascertain whether descriptive information from a
+patient-completed questionnaire could offer prognostic insights.
+However, for this illustration, we focus on how
+gender and weight loss affect the quantiles of residual life for
+patients diagnosed with advanced lung cancer at different time points. 
+The lung cancer data are publicly available from the +[**survival**](https://CRAN.R-project.org/package=survival) package +[@survivalpackage] as `lung`. The following code displays the structure +of the `lung` dataset with variables of interest. + +::: example +\> data(cancer, package = \"survival\") \> str(subset(lung, select = +c(time, status, sex, wt.loss))) 'data.frame': 228 obs. of 4 variables: +$time : num 306 455 1010 210 883 ...$ status : num 2 2 1 2 2 1 2 2 2 +2 \... $sex : num 1 1 1 1 1 1 2 2 1 1 ...$ wt.loss: num NA 15 15 11 +0 0 10 1 16 34 \... +::: + +The `lung` data contains 228 patients whose observed survival times in +days and censoring status (1 = censored, 2 = dead) are recorded in the +`time` and the `status` columns, respectively. Although the censoring +status in this dataset is not recorded in the typical 0-1 fashion, the +`Surv()` function is still applicable to create the corresponding +"`Surv`\" object. The `lung` data yields a censoring rate of $27.6\%$ +with a median survival time of 310 days. The covariates of interest are +gender (`sex = 1` if male, `sex = 2` if female) and weight loss +(`wt.loss`). In the following, we use the proposed semiparametric +quantile regression models to assess the gender and standardized weight +loss effects on different quantiles of residual life at different base +times. + +We first model the median residual life (`Q = 0.5`) when the base time +is one month (`t0 = 30`). Since the estimated median survival times for +combined lung cancers are typically less than one year, with a range of +8 to 13 months [@siegel2021cancer], setting the base time at one month +provides insight into how gender and weight loss impact the residual +time in early follow-up. In the following, we obtain the regression +coefficient estimates using the induced smoothing functions and the +corresponding variance estimate with the partial multiplier bootstrap +approach. 
+ +::: example +\> lung$male <- factor(lung$sex, 1:2, c(\"Male\", \"Female\")) \> +lung$std.wt.loss <- scale(lung$wt.loss) \> fit1 \<- qris(Surv(time, +status)   male + std.wt.loss, + data = lung, t0 = 30, Q = .5, nB = +100, + method = \"smooth\", se = \"pmb\") \> summary(fit1) Call: +qris(formula = Surv(time, status)   male + std.wt.loss, data = lung, t0 += 30, Q = 0.5, nB = 100, method = \"smooth\", se = \"pmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 5.5611 +0.0950 58.550 \<2e-16 \*\*\* maleFemale 0.4804 0.1805 2.661 0.0078 \*\* +std.wt.loss -0.0731 0.0837 -0.874 0.3824 --- Signif. codes: 0 '\*\*\*' +0.001 '\*\*' 0.01 '\*' 0.05 '.' 0.1 ' ' 1 +::: + +Subjects with missing values (in any of the variables relevant for the +modeling task) are automatically removed when `qris()` is called. The +estimated intercept implies that the median residual life for patients +who have survived up to 30 days is $\exp(5.5611) = 260.1$ days for a +male with an average weight loss. More interestingly, the summary shows +that the gender effect is statistically significant at the 0.05 +significance level, indicating that a female patient is expected to have +a median residual life at 30 days that is $\exp(0.4804) = 1.617$ times +that of a male patient with the same weight loss. The effect of the +weight loss is not statistically significant at the 0.05 level. In +addition to `summary()`, important statistics such as the coefficient +and variance estimates can be extracted by `S3` methods `coef()` and +`vcov()`, respectively. 
+ +::: example +\> coef(fit1) (Intercept) maleFemale std.wt.loss 5.56111984 0.48044228 +-0.07307635 \> vcov(fit1) (Intercept) maleFemale std.wt.loss (Intercept) +0.009021459 -0.010944549 -0.003074041 maleFemale -0.010944549 +0.032594288 0.002847148 std.wt.loss -0.003074041 0.002847148 0.006998314 +::: + +Moreover, the corresponding 95% Wald-type confidence interval can be +printed by applying the `confint()` function to the '`qris`' object. + +::: example +\> confint(fit1) 2.5 (Intercept) 5.3749598 5.74727989 maleFemale +0.1265926 0.83429199 std.wt.loss -0.2370390 0.09088626 +::: + +The `update()` function can be conveniently applied to update existing +'`qris`' objects. The following examples update the `method` and `se` +arguments from `fit1`. The updated results yield similar coefficient +estimates, but the non-smooth procedure (`method = "nonsmooth"`) yields +slightly greater standard error estimates. + +::: example +\> summary(fit2 \<- update(fit1, method = \"nonsmooth\", se = \"fmb\")) +Call: qris(formula = Surv(time, status)   male + std.wt.loss, data = +lung, t0 = 30, Q = 0.5, nB = 100, method = \"nonsmooth\", se = \"fmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 5.5585 +0.1132 49.106 \<2e-16 \*\*\* maleFemale 0.4695 0.2015 2.331 0.0198 \* +std.wt.loss -0.0668 0.1029 -0.650 0.5159 --- Signif. codes: 0 '\*\*\*' +0.001 '\*\*' 0.01 '\*' 0.05 '.' 0.1 ' ' 1 +::: + +::: example +\> summary(update(fit1, method = \"iterative\")) Call: qris(formula = +Surv(time, status)   male + std.wt.loss, data = lung, t0 = 30, Q = 0.5, +nB = 100, method = \"iterative\", se = \"pmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 5.5605 +0.1016 54.712 \<2e-16 \*\*\* maleFemale 0.4807 0.1626 2.957 0.0031 \*\* +std.wt.loss -0.0720 0.0903 -0.797 0.4252 --- Signif. codes: 0 '\*\*\*' +0.001 '\*\*' 0.01 '\*' 0.05 '.' 
0.1 ' ' 1 +::: + +At a lower (`Q = 0.25`) and a higher (`Q = 0.75`) quantile, the gender +effect remains significant at the 0.05 significance level, indicating that +female patients are associated with longer lower-quantile and +higher-quantile residual life than male patients with the same weight +loss. Among these models, we observed that female patients tend to have +higher coefficient estimates when fitting higher-quantile residual life. +While the sign of the estimated regression coefficient for weight loss +changes to a negative value when considering the lower quantile, the +effects remain statistically insignificant for both the lower and higher +quantiles. + +::: example +\> summary(update(fit1, Q = 0.25)) Call: qris(formula = Surv(time, +status)   male + std.wt.loss, data = lung, t0 = 30, Q = 0.25, nB = 100, +method = \"smooth\", se = \"pmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 4.9111 +0.1034 47.480 \<2e-16 \*\*\* maleFemale 0.4651 0.2041 2.279 0.0227 \* +std.wt.loss 0.0543 0.0584 0.930 0.3525 --- Signif. codes: 0 '\*\*\*' +0.001 '\*\*' 0.01 '\*' 0.05 '.' 0.1 ' ' 1 +::: + +::: example +\> summary(update(fit1, Q = 0.75)) Call: qris(formula = Surv(time, +status)   male + std.wt.loss, data = lung, t0 = 30, Q = 0.75, nB = 100, +method = \"smooth\", se = \"pmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 6.0748 +0.1063 57.126 \<2e-16 \*\*\* maleFemale 0.5237 0.1487 3.522 0.0004 +\*\*\* std.wt.loss -0.0171 0.1166 -0.147 0.8835 --- Signif. codes: 0 +'\*\*\*' 0.001 '\*\*' 0.01 '\*' 0.05 '.' 0.1 ' ' 1 +::: + +We also consider the base time at six months `t0 = 180`, which enables +us to assess gender and weight loss effects in median residual time at a +moderate length of follow-up. The estimated effect for the gender and +weight loss increases as $t_0$ increases from $30$ days to $180$ days +and becomes significant at the 0.05 significance level. 
Additionally, the +effect of the weight loss seems to be associated with a shorter survival +time after $180$ days, with a $p$-value of $0.0008$. + +::: example +\> summary(update(fit1, t0 = 180)) Call: qris(formula = Surv(time, +status)   male + std.wt.loss, data = lung, t0 = 180, Q = 0.5, nB = 100, +method = \"smooth\", se = \"pmb\") + +qris Estimator estimate std.Error z.value p.value (Intercept) 5.2243 +0.0912 57.255 \<2e-16 \*\*\* maleFemale 0.5821 0.1867 3.117 0.0018 \*\* +std.wt.loss -0.2515 0.0754 -3.337 0.0008 \*\*\* --- Signif. codes: 0 +'\*\*\*' 0.001 '\*\*' 0.01 '\*' 0.05 '.' 0.1 ' ' 1 +::: + +The '`qris`' object is designed to be compatible with `S3` methods: +`predict()` and `residuals()` functions. The following presents the +fitted survival times for two hypothetical male and female patients with +no weight loss, as well as the first five residual values for the +dataset. + +::: example +\> lung.new \<- data.frame(male = c(\"Male\", \"Female\"), std.wt.loss = +0) \> predict(fit2, newdata = lung.new) 1 2 444.9026 289.4422 \> +head(residuals(fit2), 5) 1 2 3 4 5 -20.86127 -575.86127 232.44474 +-416.82295 -555.82295 +::: + +To better understand the covariate effects on different quantiles of +residual time and across different base times, we plot the estimated +regression coefficients of the intercept, sex, and weight loss in `fit1` +and `fit2`. Figures \@ref(fig:figrealdata-smooth) +and \@ref(fig:figrealdata-nonsmooth) display the estimated regression +coefficients when `method = "smooth"` and `method = "nonsmooth"`, +respectively, at different quantiles ranging from 0.2 to 0.5 at +$t_0 = 30$ days. The `plot.qris()` function is currently not available +for the iterative estimator. This is mainly due to an extended +computation time involved, as indicated by our simulation results, and +the nature of plotting that necessitates computations across various +quantiles or base times. As expected, the two plots show very similar +patterns. 
We plot the estimated regression coefficients of the +intercept, sex, and weight loss for different quantiles in the range of +0.2 to 0.5 at $t_0= 50$, 60, 70, and 80 days +(Figure \@ref(fig:figrealdata-multi-quantile)), as well as for different +base times in the range of 50 to 80 days at $\tau=0.2$, 0.3, 0.4, and +0.5 (Figure \@ref(fig:figrealdata-multi-basetime)). The estimation +method used is non-iterative induced smoothed estimation +(`method = "smooth"`). In Figure \@ref(fig:figrealdata-multi-quantile), +the estimated intercept increases as the quantile increases (for a given +base time). The estimated slopes for sex remain largely the same, but +those for weight loss tend to decrease slightly across different +quantiles (for a given base time). These patterns remain consistent for +different base times. In Figure \@ref(fig:figrealdata-multi-basetime), +the estimated intercepts increase as the quantiles increase, but with a +given quantile, they remain flat across the different base times +considered. The estimated regression coefficients for the two covariates +do not appear to change significantly for different base times. 
+ +::: example +\> hide \<- theme(legend.position = \"none\") \> plot(fit1, Qs = 2:5 / +10, byQs = TRUE, ggextra = hide) \> plot(fit2, Qs = 2:5 / 10, byQs = +TRUE, ggextra = hide) \> plot(fit1, Qs = 2:5 / 10, t0s = 5:8 \* 10, byQs += TRUE, ggextra = hide) \> plot(fit1, Qs = 2:5 / 10, t0s = 5:8 \* 10, +byQs = FALSE, ggextra = hide) +::: + +::: figure* +```{r figrealdata-smooth, echo=FALSE , fig.cap="method = ”smooth” and se = ”pmb”", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("realdata_smooth_quantile.png")) +``` + +```{r figrealdata-nonsmooth, echo=FALSE , fig.cap="method = ”nonsmooth” and se = ”fmb”", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("realdata_nonsmooth_quantile.png")) +``` + +\ + +```{r figrealdata-multi-quantile, echo=FALSE , fig.cap="method = ”smooth” and se = ”pmb”", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("realdata_multi_quantile.png")) +``` + +```{r figrealdata-multi-basetime, echo=FALSE , fig.cap="Multiple covariate effect plot against base time", fig.alt="graphic without alt text", fig.show='hold', fig.align="center", out.width="100.0%"} +knitr::include_graphics(c("realdata_multi_basetime.png")) +``` +::: + +## Conclusion {#sec:conclusion} + +The purpose of the [**qris**](https://CRAN.R-project.org/package=qris) +package is to provide a comprehensive tool for fitting quantile +regression models on residual life for right-censored survival data, +with the aim of promoting widespread dissemination and utilization. This +package implements one estimation method based on non-smooth estimating +functions and two estimation methods based on their induced smoothed +versions. 
The non-smooth estimator is calculated through $L_{1}$-type +minimization while incorporating the IPCW technique, and its variance is +calculated using full multiplier bootstrapping. The first type of the +induced smoothed estimator, a non-iterative version, directly solves +estimating functions, and its variance can be calculated using either +the full multiplier bootstrapping or the robust sandwich form with +partial multiplier bootstrapping. As evidenced by the simulation +results, this enables one to substantially reduce computing times +without sacrificing estimation accuracy and stability compared to the +original non-smooth function-based method. The iterative smoothed +estimator has an advantage in obtaining more precise estimates than its +non-iterative version, although it requires longer computing times. For +all these methods, estimates of the regression coefficients and their +variances can be calculated at user-defined quantiles and base times, as +long as they are identifiable. Additionally, the package provides +features for plotting estimates with associated 95% confidence intervals +against quantiles and base times using the generic `plot` function. +These plots visualize patterns of estimates at different quantiles and +base times, helping users to easily grasp the overall picture. The +package [**qris**](https://CRAN.R-project.org/package=qris) and its +included functions are verified through illustrations using simulated +data with interpretation of the results demonstrated through a real data +application. + +Some possible directions for extending our package are as follows. +Efforts can be made to reduce the computational burden associated with +variance estimation, which currently accounts for a significant portion +of the computing time. In particular, the iterative-induced smoothed +method employs the partial multiplier bootstrap method to calculate +variance estimates in each iteration. 
Since this method requires +multiple iterations, it is crucial to explore more computationally +efficient variance estimation procedures for each iteration to reduce +the currently relatively longer computation time. One approach is to +utilize a closed-form estimation of the mid-part of the sandwich-type +variance, as discussed in @chiou2014fast [@choi2018smoothed]. +Implementing this direct variance estimation in each iteration is +expected to further enhance computation efficiency. Another direction is +to generalize the approaches to allow for the inclusion of sampling +weights, which is useful for bias correction when failure time data are +generated from non-random sampling designs, such as case-cohort designs +[@prentice1986case; @chiou2015semiparametric]. The current estimating +functions implemented in the +[**qris**](https://CRAN.R-project.org/package=qris) package assume that +the data are randomly sampled, with sampling weights set to 1. To the +best of our knowledge, there is a lack of model-checking procedures and +model-comparison methods specifically designed for the non-smooth +estimator, and a logical next step would be to develop these procedures +for subsequent integration into the package. 
+:::::::::::::::::::::::::::::::: diff --git a/_articles/RJ-2024-007/RJwrapper.tex b/_articles/RJ-2024-007/RJwrapper.tex new file mode 100644 index 0000000000..34ce5a47a6 --- /dev/null +++ b/_articles/RJ-2024-007/RJwrapper.tex @@ -0,0 +1,99 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + +%\usepackage{orcidlink,thumbpdf,lmodern} +\usepackage{thumbpdf,lmodern} +%% another package (only for this demo article) +\usepackage{framed} +%\usepackage{subfigure} +\usepackage{pgfplots} +\usepackage{multirow} +\usepackage{amsthm} +\usepackage{soul} +\usepackage{bm} +\usepackage{alltt} +\usepackage{framed} +\usepackage{nameref} +\usepackage{graphicx} +\usepackage{subcaption} +\usepackage{caption} + +%% new custom commands +\newcommand{\class}[1]{`\code{#1}'} +\newcommand{\fct}[1]{\code{#1()}} +\newcommand{\Var}{\mathop{\rm Var}\nolimits} %Variance +\newcommand{\vect}[1]{\mathbf{#1}} +\newcommand{\matr}[1]{\mathbf{#1}} + +\newcommand{\bns}{\widehat{\bm{\beta}}_{\tiny\mbox{NS}}} +\newcommand{\bis}{\widehat{\bm{\beta}}_{\tiny\mbox{IS}}} +\newcommand{\bit}{\widehat{\bm{\beta}}_{\tiny\mbox{IT}}} + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% For rmd prints +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother +\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}} +\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}} +\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}} + +\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}} +\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}} +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}} +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}} +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}} 
+\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}} +\let\hlipl\hlkwb +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{ + \def\at@end@of@kframe{} + \ifinner\ifhmode + \def\at@end@of@kframe{\end{minipage}} +\begin{minipage}{\columnwidth} + \fi\fi + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth} + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}} +{\par\unskip\endMakeFramed + \at@end@of@kframe} +\makeatother +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} +\newenvironment{knitrout}{}{} + +%% load any required packages FOLLOWING this line + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{114} + +%% replace RJtemplate with your article +\begin{article} + \input{2022-185_R3} +\end{article} + +\end{document} diff --git a/_articles/RJ-2024-007/figures/realdata_multi_basetime.pdf b/_articles/RJ-2024-007/figures/realdata_multi_basetime.pdf new file mode 100644 index 0000000000..11cebcd2c4 Binary files /dev/null and b/_articles/RJ-2024-007/figures/realdata_multi_basetime.pdf differ diff --git a/_articles/RJ-2024-007/figures/realdata_multi_quantile.pdf b/_articles/RJ-2024-007/figures/realdata_multi_quantile.pdf new file mode 100644 index 0000000000..7c27c5ed91 Binary files /dev/null and b/_articles/RJ-2024-007/figures/realdata_multi_quantile.pdf differ diff --git a/_articles/RJ-2024-007/figures/realdata_nonsmooth_quantile.pdf b/_articles/RJ-2024-007/figures/realdata_nonsmooth_quantile.pdf new file mode 100644 index 0000000000..ed9e860b67 Binary files /dev/null and b/_articles/RJ-2024-007/figures/realdata_nonsmooth_quantile.pdf differ 
diff --git a/_articles/RJ-2024-007/figures/realdata_smooth_quantile.pdf b/_articles/RJ-2024-007/figures/realdata_smooth_quantile.pdf new file mode 100644 index 0000000000..79dd9c51f5 Binary files /dev/null and b/_articles/RJ-2024-007/figures/realdata_smooth_quantile.pdf differ diff --git a/_articles/RJ-2024-007/figures/simulation_smooth_quantile.pdf b/_articles/RJ-2024-007/figures/simulation_smooth_quantile.pdf new file mode 100644 index 0000000000..30b4dcbd05 Binary files /dev/null and b/_articles/RJ-2024-007/figures/simulation_smooth_quantile.pdf differ diff --git a/_articles/RJ-2024-007/figures/simulation_smooth_t0.pdf b/_articles/RJ-2024-007/figures/simulation_smooth_t0.pdf new file mode 100644 index 0000000000..51c5c5dc18 Binary files /dev/null and b/_articles/RJ-2024-007/figures/simulation_smooth_t0.pdf differ diff --git a/_articles/RJ-2024-007/figures/vplot_pmb_t0_c3_Q50.pdf b/_articles/RJ-2024-007/figures/vplot_pmb_t0_c3_Q50.pdf new file mode 100644 index 0000000000..f7ddcc9ff3 Binary files /dev/null and b/_articles/RJ-2024-007/figures/vplot_pmb_t0_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/figures/vplot_pmb_t1_c3_Q50.pdf b/_articles/RJ-2024-007/figures/vplot_pmb_t1_c3_Q50.pdf new file mode 100644 index 0000000000..cd8fed6efc Binary files /dev/null and b/_articles/RJ-2024-007/figures/vplot_pmb_t1_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/figures/vplot_t0_c3_Q50.pdf b/_articles/RJ-2024-007/figures/vplot_t0_c3_Q50.pdf new file mode 100644 index 0000000000..3e9167526d Binary files /dev/null and b/_articles/RJ-2024-007/figures/vplot_t0_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/figures/vplot_t1_c3_Q50.pdf b/_articles/RJ-2024-007/figures/vplot_t1_c3_Q50.pdf new file mode 100644 index 0000000000..c10c772950 Binary files /dev/null and b/_articles/RJ-2024-007/figures/vplot_t1_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/qris.R b/_articles/RJ-2024-007/qris.R new file mode 100644 index 0000000000..4c28c15da5 --- /dev/null +++ 
b/_articles/RJ-2024-007/qris.R @@ -0,0 +1,241 @@ +## ######################################### +## Required packages +## ######################################### + +library(qris) +library(ggplot2) +library(knitr) +library(kableExtra) + +## ######################################### +## Section: Package implementation +## ######################################### + +## Introducing new functions +args(qris) +args(qris.control) +argsAnywhere(plot.qris) +args(qris.extend) + +## ######################################### +## Section: Simulated data +## ######################################### + +## Data generation +data.gen <- function(n, t0, cen = .3, Q = .5) { + if (!(t0 %in% 0:2)) + stop("T0 is limited to three specific values: 0, 1, or 2.") + if (!(cen %in% c(0, .1, .3))) + stop("Censoring is limited to three specific values: 0%, 10%, or 30%.") + if (!(Q %in% c(.25, .5))) + stop("Q is limited to two specific values: 0.25, or 0.50.") + censoring <- Inf + if (t0 == 0) { + if (cen == .1) censoring <- runif(n, 0, 125.1) + if (cen == .3) censoring <- runif(n, 0, 25.49) + beta0 <- log(5); beta1 <- log(2) + } + if (t0 == 1) { + if (cen == .1) censoring <- runif(n, 0, 120.8) + if (cen == .3) censoring <- runif(n, 0, 23.41) + beta0 <- 1.410748; beta1 <- 0.7974189 + } + if (t0 == 2) { + if (cen == .1) censoring <- runif(n, 0, 120.6) + if (cen == .3) censoring <- runif(n, 0, 26.20) + beta0 <- 1.219403; beta1 <- 0.9070615 + } + dat <- data.frame(censoring, + Time0 = sqrt(-log(1 - runif(n))), + X1 = runif(n), + X2 = rbinom(n, 1, .5), + X3 = rnorm(n), + X4 = runif(n), + X5 = rexp(n, 1)) + rho <- (-log(1 - Q))^0.5 * (((exp(beta0 + beta1 * dat$X1) + t0)^2 - t0^2)^-0.5) + dat$Time0 <- dat$Time0 / rho + dat$Time <- pmin(dat$Time0, dat$censoring) + dat$status <- 1 * (dat$Time0 < dat$censoring) + subset(dat, select = c(Time, status, X1, X2, X3, X4, X5)) +} + +## Data illustration +set.seed(3) +head(data.gen(200, 0)) + +## Function to run the simulation with se = fmb + +do_fmb <- 
function(n, t0, cen, Q, nB) { + dat <- data.gen(n, t0, cen, Q) + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + stamp <- NULL + stamp[1] <- Sys.time() + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "fmb") + stamp[2] <- Sys.time() + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "nonsmooth", se = "fmb") + stamp[3] <- Sys.time() + f3 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "fmb") + stamp[4] <- Sys.time() + list(smooth = c(f1$coef, f1$std), + nonsmooth = c(f2$coef, f2$std), + iter = c(f3$coef, f3$std), + times = diff(stamp)) +} + +## Example codes to run replications +## Simulation for other scenarios are carried out separately + +B <- 200 +set.seed(2) +sims0_fmb <- mapply(function(n, t0) + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) +sim1_fmb <- mapply(function(n, t0) + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) + +## Example codes to create simulation tables +## Tables for full simulation results are created separately + +makeTab_fmb <- function(...) { + d <- rbind(...) 
+ c(colMeans(d), apply(d[,1:6], 2, sd)) +} + +tmp0_fmb <- sapply(sims0_fmb, function(s) + apply(s[1:3,], 1, do.call, what = makeTab_fmb)) + +tab0_fmb <- data.frame(t0 = rep(0, each = 18), + n = rep(c(200, 400, 1000), 1, each = 6), + b = c("$\\beta_0$", "$\\beta_1$", "$\\beta_2$", "$\\beta_3$", + "$\\beta_4$", "$\\beta_5$"), + do.call(rbind, apply(tmp0_fmb, 2, matrix, 6, simplify = F))) + +kable(tab0_fmb, digits = 3, 'latex', booktabs = T, escape = F, + col.names = c("$t_0$", "n", "$\\beta$", rep(c("PE", "ESE", "ASE"), 3)), + caption = "Result $t_0=0$ and se=fmb") %>% + add_header_above(c("", "", "", "Smooth+fmb" = 3, "Nonsmooth+fmb" = 3, "Iterative+fmb" = 3)) %>% + kable_styling() %>% + collapse_rows(columns = 1:2, valign = "top", latex_hline = "none") + +times0_fmb <- sapply(sims0_fmb, function(s) Reduce("+", s[4,]) / B) +rownames(times0_fmb) <- c("Smooth+fmb", "Nonsmooth+fmb", "Iterative+fmb") +kable(times0_fmb, digits = 3, 'latex', booktabs = T, row.names = T, + col.names = c("200", "400", "1000"), + caption = "Runtimes when $t_0=0$ and se=fmb") %>% + add_header_above(c("", "$t_0 = 0$" = 3), escape = F) + + +## Function to run the simulation with se = pmb + +do_pmb <- function(n, t0, cen, Q, nB) { + dat <- data.gen(n, t0, cen, Q) + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + stamp <- NULL + stamp[1] <- Sys.time() + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "pmb") + stamp[2] <- Sys.time() + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "pmb") + stamp[3] <- Sys.time() + list(smooth = c(f1$coef, f1$std), + iter = c(f2$coef, f2$std), + times = diff(stamp)) +} + +set.seed(2) +sims0_pmb <- mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) +sims1_pmb <- mapply(function(n, t0) + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) + +## 
Simplified simulation +rho0 <- .2 * sqrt(log(2)) +rho1 <- .1 * sqrt(log(2)) +data.gen <- function(n) { + dat <- data.frame(censoring = runif(n, 0, 23.41), + Time0 = sqrt(-log(1 - runif(n))), + X = rbinom(n, 1, .5)) + dat$Time0 <- ifelse(dat$X > 0, dat$Time0 / rho1, dat$Time0 / rho0) + dat$Time <- pmin(dat$Time0, dat$censoring) + dat$status <- 1 * (dat$Time0 < dat$censoring) + subset(dat, select = c(Time, status, X)) +} +set.seed(10) +head(dat <- data.gen(200)) + +## Initial value illustration +(random <- rnorm(2)) +f1 <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, init = "rq", nB = 0) +f2 <- update(f1, init = c(1, 1)) +f3 <- update(f1, init = random) +all.equal(f1$coef, f2$coef) +all.equal(f2$coef, f3$coef) + +## More sophisticated coefficient effect plots +fit <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, se = "pmb") +fit2 <- qris.extend(fit, Qs = 4:7 / 10) +class(fit2) +names(fit) +setdiff(names(fit2), names(fit)) +plot(fit2) + +## Prepare true values to overlay on covariate effect plots +r <- 2:1 * sqrt(log(2)) / 10 +k <- 2 +trueB <- function(t0, tau) { + b <- log(1 / r * ((r * t0) ^ k - log(1 - tau))^(1 / k) - t0) + c(b[1], b[2] - b[1]) +} +true_Q <- c(t(sapply(4:7 / 10, trueB, t0 = 1))) +true_t0 <- c(t(sapply(1:3, trueB, tau = .5))) + +## Demonstrate ggplot options +plot(fit2) + theme(legend.position = "bottom") + + geom_line(aes(x = Qs, y = true_Q, col = variable, linetype = "True value")) + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) +b <- plot(fit2, t0s = 1:3, byQs = F) +b + theme(legend.position = "bottom") + + geom_line(aes(x = t0s, y = true_t0, col = variable, + linetype = "True value")) + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) + + + +## ######################################### +## Section: Lung Cancer Data +## ######################################### +## Load data +data(cancer, package = "survival") +str(subset(lung, select = c(time, status, sex, wt.loss))) + +## Prepare data 
and fit +lung$male <- factor(lung$sex, 1:2, c("Male", "Female")) +lung$std.wt.loss <- scale(lung$wt.loss) +fit1 <- qris(Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = .5, nB = 100, + method = "smooth", se = "pmb") +summary(fit1) +coef(fit1) +vcov(fit1) +confint(fit1) + +## summaries and compare to nonsmooth fit +summary(fit2 <- update(fit1, method = "nonsmooth", se = "fmb")) +summary(update(fit1, method = "iterative")) +summary(update(fit1, Q = 0.25)) +summary(update(fit1, Q = 0.75)) +summary(update(fit1, t0 = 180)) + +## predict and residuals +lung.new <- data.frame(male = c("Male", "Female"), std.wt.loss = 0) +predict(fit2, newdata = lung.new) +head(residuals(fit2), 5) + +## plots +hide <- theme(legend.position = "none") +plot(fit1, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) +plot(fit2, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) +plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = TRUE, ggextra = hide) +plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = FALSE, ggextra = hide) diff --git a/_articles/RJ-2024-007/qris.bib b/_articles/RJ-2024-007/qris.bib new file mode 100644 index 0000000000..3fd20bd409 --- /dev/null +++ b/_articles/RJ-2024-007/qris.bib @@ -0,0 +1,802 @@ +@preamble{ " \newcommand{\noop}[1]{} " } % a do-nothing command that serves a purpose + +@article{choi2018smoothed, + title={Smoothed quantile regression analysis of competing risks}, + author={Choi, Sangbum and Kang, Sangwook and Huang, Xuelin}, + journal={Biometrical Journal}, + volume={60}, + number={5}, + pages={934--946}, + year={2018}, + url={https://doi.org/10.1002/bimj.201700104}, + publisher={Wiley Online Library} +} + +@article{fan2018quantile, + title={Quantile regression for competing risks analysis under case-cohort design}, + author={Fan, Caiyun and Ma, Huijuan and Zhou, Yong}, + journal={Journal of Statistical Computation and Simulation}, + volume={88}, + number={6}, + pages={1060--1080}, + year={2018}, + url={https://doi.org/10.1080/00949655.2017.1419352}, + 
publisher={Taylor \& Francis} +} + +@article{jung2009regression, + title={Regression on quantile residual life}, + author={Jung, Sin-Ho and Jeong, Jong-Hyeon and Bandos, Hanna}, + journal={Biometrics}, + volume= 65, + number= 4, + pages={1203--1212}, + year=2009, + url={https://doi.org/10.1111/j.1541-0420.2009.01196.x}, + publisher={Wiley Online Library} +} + +@article{kim2012censored, + title={Censored quantile regression for residual lifetimes}, + author={Kim, Mi-Ok and Zhou, Mai and Jeong, Jong-Hyeon}, + journal={Lifetime Data Analysis}, + volume=18, + number=2, + pages={177--194}, + year=2012, + url={https://doi.org/10.1007/s10985-011-9212-2}, + publisher={Springer} +} + +@article{pang2012variance, + title={Variance estimation in censored quantile regression via induced smoothing}, + author={Pang, Lei and Lu, Wenbin and Wang, Huixia Judy}, + journal={Computational Statistics \& Data Analysis}, + volume=56, + number=4, + pages={785--796}, + year=2012, + url={https://doi.org/10.1016/j.csda.2010.10.018}, + publisher={Elsevier} +} + +@article{peng2009competing, + title={Competing risks quantile regression}, + author={Peng, Limin and Fine, Jason P}, + journal={Journal of the American Statistical Association}, + volume=104, + number=488, + pages={1440--1453}, + year=2009, + url={https://doi.org/10.1198/jasa.2009.tm08228}, + publisher={Taylor \& Francis} +} + +@article{peng2008survival, + title={Survival analysis with quantile regression models}, + author={Peng, Limin and Huang, Yijian}, + journal={Journal of the American Statistical Association}, + volume={103}, + number={482}, + pages={637--649}, + year={2008}, + url={https://doi.org/10.1198/016214508000000355}, + publisher={Taylor \& Francis} +} + +@article{chiou2015semiparametric, + title={Semiparametric accelerated failure time modeling for clustered failure times from stratified sampling}, + author={Chiou, Sy Han and Kang, Sangwook and Yan, Jun}, + journal={Journal of the American Statistical Association}, + 
volume=110, + number=510, + pages={621--629}, + year=2015, + url={https://doi.org/10.1080/01621459.2014.917978}, + publisher={Taylor \& Francis} +} + +@article{brown2007induced, + title={Induced smoothing for rank regression with censored survival times}, + author={Brown, BM and Wang, You-Gan}, + journal={Statistics in Medicine}, + volume=26, + number=4, + pages={828--836}, + year=2007, + url={https://doi.org/10.1002/sim.2576}, + publisher={Wiley Online Library} +} + +@article{koenker1978regression, + title={Regression quantiles}, + author={Koenker, Roger and Bassett Jr, Gilbert}, + journal={Econometrica: Journal of the Econometric Society}, + pages={33--50}, + year=1978, + url={https://doi.org/10.2307/1913643}, + publisher={JSTOR} +} + +@book{fleming2011counting, + title={Counting Processes and Survival Analysis}, + author={Fleming, Thomas R and Harrington, David P}, + volume=169, + year=2011, + url={https://doi.org/10.1002/9781118150672}, + publisher={John Wiley \& Sons} +} + +@article{caplan2019dental, + title={Dental restoration longevity among geriatric and special needs patients}, + author={Caplan, DJ and Li, Y and Wang, W and Kang, S and Marchini, L and Cowen, HJ and Yan, J}, + journal={JDR Clinical \& Translational Research}, + volume={4}, + number={1}, + pages={41--48}, + year={2019}, + url={https://journals.sagepub.com/doi/pdf/10.1177/2380084418799083}, + publisher={SAGE Publications Sage CA: Los Angeles, CA} +} + +@Manual{quantregpackage, + title = {quantreg: Quantile regression}, + author = {Roger Koenker}, + year = {2022}, + note = {R package version 5.87}, + url = {https://CRAN.R-project.org/package=quantreg} +} + +@Manual{R:qris, + title = {qris: Quantile regression model for residual lifetime using an induced smoothing approach}, + author = {Kyu Hyun Kim and Sangwook Kang and Sy Han Chiou}, + year = {2022}, + note = {R package version 1.0.0}, + url = {https://CRAN.R-project.org/package=qris} +} + +@article{li2016quantile, + title={Quantile residual 
life regression with longitudinal biomarker measurements for dynamic prediction}, + author={Li, Ruosha and Huang, Xuelin and Cortes, Jorge E}, + journal={Journal of the Royal Statistical Society. Series C: Applied Statistics}, + volume={65}, + number={5}, + pages={755--773}, + year={2016}, + url={http://www.jstor.org/stable/44681854}, + publisher={Wiley-Blackwell} +} + +@article{chiou2015rank, + title={Rank-based estimating equations with general weight for accelerated failure time models: {A}n induced smoothing approach}, + author={Chiou, S and Kang, Sangwook and Yan, J}, + journal={Statistics in Medicine}, + volume={34}, + number={9}, + pages={1495--1510}, + year={2015}, + url={https://doi.org/10.1002/sim.6415}, + publisher={Wiley Online Library} +} + +@article{zeng2008efficient, + title={Efficient resampling methods for nonsmooth estimating functions}, + author={Zeng, Donglin and Lin, DY}, + journal={Biostatistics}, + volume={9}, + number={2}, + pages={355--363}, + year={2008}, + url={https://doi.org/10.1093/biostatistics/kxm034}, + publisher={Oxford University Press} +} + +@article{cox1972regression, + title={Regression Models and Life-Tables}, + author={Cox, David R}, + journal={Journal of the Royal Statistical Society: Series B (Methodological)}, + volume={34}, + number={2}, + pages={187--202}, + year={1972}, + url={https://doi.org/10.1111/j.2517-6161.1972.tb00899.x}, + publisher={Wiley Online Library} +} + +@article{ying1995survival, + title={Survival Analysis with Median Regression Models}, + author={Ying, Zhiliang and Jung, Sin-Ho and Wei, Lee-Jen}, + journal={Journal of the American Statistical Association}, + volume={90}, + number={429}, + pages={178--184}, + year={1995}, + url={https://doi.org/10.1080/01621459.1995.10476500}, + publisher={Taylor \& Francis} +} + +@article{portnoy2003censored, + title={Censored regression quantiles}, + author={Portnoy, Stephen}, + journal={Journal of the American Statistical Association}, + volume={98}, + number={464}, + 
pages={1001--1012}, + year={2003}, + url={https://doi.org/10.1198/016214503000000954}, + publisher={Taylor \& Francis} +} + +@article{oakes2003inference, + title={Inference for the proportional mean residual life model}, + author={Oakes, David and Dasu, Tamraparni}, + journal={Lecture Notes-Monograph Series}, + pages={105--116}, + year={2003}, + url={http://www.jstor.org/stable/4356266}, + publisher={JSTOR} +} + +@article{chen2005semiparametric, + title={Semiparametric estimation of proportional mean residual life model in presence of censoring}, + author={Chen, YQ and Jewell, NP and Lei, X and Cheng, SC}, + journal={Biometrics}, + volume={61}, + number={1}, + pages={170--178}, + year={2005}, + url={https://doi.org/10.1111/j.0006-341X.2005.030224.x}, + publisher={Wiley Online Library} +} + +@article{maguluri1994estimation, + title={Estimation in the mean residual life regression model}, + author={Maguluri, Gangaji and Zhang, Cun-Hui}, + journal={Journal of the Royal Statistical Society: Series B (Methodological)}, + volume={56}, + number={3}, + pages={477--489}, + year={1994}, + url={https://doi.org/10.1111/j.2517-6161.1994.tb01994.x}, + publisher={Wiley Online Library} +} + +@article{oakes1990note, + title={A note on residual life}, + author={Oakes, David and Dasu, Tamraparni}, + journal={Biometrika}, + volume={77}, + number={2}, + pages={409--410}, + year={1990}, + url={https://doi.org/10.1093/biomet/77.2.409}, + publisher={Oxford University Press} +} + +@article{chen2006linear, + title={Linear life expectancy regression with censored data}, + author={Chen, Ying Qing and Cheng, Seu}, + journal={Biometrika}, + volume={93}, + number={2}, + pages={303--313}, + year={2006}, + url={https://doi.org/10.1093/biomet/93.2.303}, + publisher={Oxford University Press} +} + +@article{chen2007additive, + title={Additive Expectancy Regression}, + author={Chen, Ying Qing}, + journal={Journal of the American Statistical Association}, + volume={102}, + number={477}, + 
pages={153--166}, + year={2007}, + url={https://doi.org/10.1198/016214506000000870}, + publisher={Taylor \& Francis} +} + +@article{zhang2010goodness, + title={Goodness-of-fit tests for additive mean residual life model under right censoring}, + author={Zhang, Zhigang and Zhao, Xingqiu and Sun, Liuquan}, + journal={Lifetime Data Analysis}, + volume={16}, + number={3}, + pages={385--408}, + year={2010}, + url={https://doi.org/10.1007/s10985-010-9152-2}, + publisher={Springer} +} + +@techreport{liu2008regression, + title={Regression analysis of mean residual life function}, + author={Liu, Shufang and Ghosh, Sujit K}, + year={2008}, + institution={North Carolina State University. Dept. of Statistics}, + url = {https://repository.lib.ncsu.edu/bitstream/handle/1840.4/3041/mimeo2613.pdf?sequence=1} +} + +@article{sun2009class, + title={A class of transformed mean residual life models with censored survival data}, + author={Sun, Liuquan and Zhang, Zhigang}, + journal={Journal of the American Statistical Association}, + volume={104}, + number={486}, + pages={803--815}, + year={2009}, + url={https://doi.org/10.1198/jasa.2009.0130}, + publisher={Taylor \& Francis} +} + +@article{sun2012mean, + title={Mean residual life models with time-dependent coefficients under right censoring}, + author={Sun, Liuquan and Song, Xinyuan and Zhang, Zhigang}, + journal={Biometrika}, + volume={99}, + number={1}, + pages={185--197}, + year={2012}, + url={https://doi.org/10.1093/biomet/asr065}, + publisher={Oxford University Press} +} + +@article{jung1996quasi, + title={Quasi-Likelihood for median regression models}, + author={Jung, Sin-Ho}, + journal={Journal of the American Statistical Association}, + volume={91}, + number={433}, + pages={251--257}, + year={1996}, + url={https://doi.org/10.1080/01621459.1996.10476683}, + publisher={Taylor \& Francis Group} +} + +@article{portnoy1997gaussian, + title={The Gaussian hare and the Laplacian tortoise: computability of squared-error versus 
absolute-error estimators}, + author={Portnoy, Stephen and Koenker, Roger}, + journal={Statistical Science}, + volume={12}, + number={4}, + pages={279--300}, + year={1997}, + url={https://doi.org/10.1214/ss/1030037960}, + publisher={Institute of Mathematical Statistics} +} + +@article{wei2006quantile, + title={Quantile regression methods for reference growth charts}, + author={Wei, Ying and Pere, Anneli and Koenker, Roger and He, Xuming}, + journal={Statistics in Medicine}, + volume={25}, + number={8}, + pages={1369--1382}, + year={2006}, + url={https://doi.org/10.1002/sim.2271}, + publisher={Wiley Online Library} +} + +@article{whang2006smoothed, + title={Smoothed empirical likelihood methods for quantile regression models}, + author={Whang, Yoon-Jae}, + journal={Econometric Theory}, + pages={173--205}, + year={2006}, + doi={10.1017/S0266466606060087}, + publisher={JSTOR} +} + +@article{gelfand2003bayesian, + title={Bayesian semiparametric regression for median residual life}, + author={Gelfand, Alan E and Kottas, Athanasios}, + journal={Scandinavian Journal of Statistics}, + volume={30}, + number={4}, + pages={651--665}, + year={2003}, + url={https://doi.org/10.1111/1467-9469.00356}, + publisher={Wiley Online Library} +} + +@article{wang2009locally, + title={Locally weighted censored quantile regression}, + author={Wang, Huixia Judy and Wang, Lan}, + journal={Journal of the American Statistical Association}, + volume={104}, + number={487}, + pages={1117--1128}, + year={2009}, + url={https://doi.org/10.1198/jasa.2009.tm08230}, + publisher={Taylor \& Francis} +} + +@article{huang2010quantile, + title={Quantile calculus and censored regression}, + author={Huang, Yijian}, + journal={Annals of Statistics}, + volume={38}, + number={3}, + pages={1607}, + year={2010}, + doi={10.1214/09-AOS771}, + publisher={NIH Public Access} +} + +@article{portnoy2010asymptotics, + title={Asymptotics for censored regression quantiles}, + author={Portnoy, Stephen and Lin, Guixian}, + 
journal={Journal of Nonparametric Statistics}, + volume={22}, + number={1}, + pages={115--130}, + year={2010}, + url={https://doi.org/10.1080/10485250903105009}, + publisher={Taylor \& Francis} +} + +@article{johnson2009induced, + title={Induced smoothing for the semiparametric accelerated failure time model: {A}symptotics and extensions to clustered data}, + author={Johnson, Lynn M and Strawderman, Robert L}, + journal={Biometrika}, + volume={96}, + number={3}, + pages={577--590}, + year={2009}, + url={https://doi.org/10.1093/biomet/asp025}, + publisher={Oxford University Press} +} + +@article{fu2010rank, + title={Rank regression for analysis of clustered data: {A} natural induced smoothing approach}, + author={Fu, Liya and Wang, You-Gan and Bai, Zhidong}, + journal={Computational Statistics \& Data Analysis}, + volume={54}, + number={4}, + pages={1036--1050}, + year={2010}, + url={https://doi.org/10.1016/j.csda.2009.10.015}, + publisher={Elsevier} +} + +@article{chiou2014fast, + title={Fast accelerated failure time modeling for case-cohort data}, + author={Chiou, Sy Han and Kang, Sangwook and Yan, Jun}, + journal={Statistics and Computing}, + volume={24}, + number={4}, + pages={559--568}, + year={2014}, + url={https://doi.org/10.1007/s11222-013-9388-2}, + publisher={Springer} +} + +@Manual{aftgeepackage, + title = {aftgee: Accelerated failure time model with generalized estimating equations}, + author = {Sy Han Chiou and Sangwook Kang and Jun Yan}, + year = {2021}, + note = {R package version 1.1.6}, + url = {https://CRAN.R-project.org/package=aftgee} +} + +@article{bang2002median, + title={Median regression with censored cost data}, + author={Bang, Heejung and Tsiatis, Anastasios A}, + journal={Biometrics}, + volume={58}, + number={3}, + pages={643--649}, + year={2002}, + url={https://doi.org/10.1111/j.0006-341X.2002.00643.x}, + publisher={Wiley Online Library} +} + +@Manual{r2021, + title = {R: {A} language and environment for statistical computing}, + author = 
{{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2021}, + url = {https://www.R-project.org/} + } + +@incollection{sit2017survival, + title={Survival analysis: {A} quantile perspective}, + author={Ying, Zhiliang and Sit, Tony}, + booktitle={Handbook of Quantile Regression}, + pages={89--108}, + year={2017}, + url={https://doi.org/10.1201/9781315120256}, + publisher={Chapman and Hall/CRC} +} + +@article{brown2005standard, + author={Brown, BM and Wang, You-Gan}, + title={Standard errors and covariance matrices for smoothed rank estimators}, + journal={Biometrika}, + volume={92}, + number={1}, + pages={149--158}, + year={2005}, + url={https://doi.org/10.1093/biomet/92.1.149}, + publisher={Oxford University Press} +} + +@article{kassebaum2015global, + title={Global burden of untreated caries: {A} systematic review and metaregression}, + author={Kassebaum, NJ and Bernab{\'e}, E and Dahiya, M and Bhandari, B and Murray, CJL and Marcenes, W}, + journal={Journal of Dental Research}, + volume={94}, + number={5}, + pages={650--658}, + year={2015}, + doi={10.1177/0022034515573272}, + publisher={SAGE Publications Sage CA: Los Angeles, CA} +} + +@article{lin2000fitting, + title={On fitting {C}ox's proportional hazards models to survey data}, + author={Lin, DY}, + journal={Biometrika}, + volume={87}, + number={1}, + pages={37--47}, + year={2000}, + url={https://doi.org/10.1093/biomet/87.1.37}, + publisher={Oxford University Press} +} + +@article{Kang:fitt:2016, +title={Fitting semiparametric accelerated failure time models for nested case–control data}, + author={Kang, Sangwook}, + journal={Journal of Statistical Computation and Simulation}, + volume={87}, + number={4}, + pages={652--663}, + year={2017}, + url={https://doi.org/10.1080/00949655.2016.1222611}, + publisher={Taylor \& Francis} +} + +@article{ma2010semiparametric, + title={Semiparametric median residual life model and inference}, + author={Ma, 
Yanyuan and Yin, Guosheng}, + journal={The Canadian Journal of Statistics}, + volume={38}, + number={4}, + pages={665--679}, + year={2010}, + url={https://doi.org/10.1002/cjs.10076}, + publisher={Wiley Online Library} +} + +@article{zhang2015smoothed, + title={Smoothed estimator of quantile residual lifetime for right censored data}, + author={Zhang, Li and Liu, Peng and Zhou, Yong}, + journal={Journal of Systems Science and Complexity}, + volume={28}, + number={6}, + pages={1374--1388}, + year={2015}, + url={https://doi.org/10.1007/s11424-015-3067-7}, + publisher={Springer} +} + +@Manual{Brqpackage, + title = {Brq: Bayesian Analysis of Quantile Regression Models}, + author = {Rahim Alhamzawi}, + year = {2020}, + note = {R package version 3.0}, + url = {https://CRAN.R-project.org/package=Brq} +} + +@Manual{cmprskQRpackage, + title = {cmprskQR: Analysis of competing risks using quantile regressions}, + author = {Stephan Dlugosz and Limin Peng and Ruosha Li and Shuolin Shi}, + year = {2019}, + note = {R package version 0.9.2}, + url = {https://CRAN.R-project.org/package=cmprskQR} +} + +@article{loprinzi1994prospective, + title={Prospective evaluation of prognostic variables from patient-completed questionnaires. 
{N}orth {C}entral {C}ancer {T}reatment {G}roup.}, + author={Loprinzi, Charles Lawrence and Laurie, John A and Wieand, H Sam and Krook, James E and Novotny, Paul J and Kugler, John W and Bartel, Joan and Law, Marlys and Bateman, Marilyn and Klatt, Nancy E}, + journal={Journal of Clinical Oncology}, + volume={12}, + number={3}, + pages={601--607}, + year={1994}, + url={https://doi.org/10.1200/JCO.1994.12.3.601} +} + +@article{kim2023smoothed, + title={Smoothed quantile regression for censored residual life}, + author={Kim, Kyu Hyun and Caplan, Daniel J and Kang, Sangwook}, + journal={Computational Statistics}, + volume={38}, + pages={1001--1022}, + year={2023}, + url = {https://doi.org/10.1007/s00180-022-01262-z} +} + +@article{jin2001simple, + title={A simple resampling method by perturbing the minimand}, + author={Jin, Zhezhen and Ying, Zhiliang and Wei, LJ}, + journal={Biometrika}, + volume={88}, + number={2}, + pages={381--390}, + year={2001}, + url={https://doi.org/10.1093/biomet/88.2.381}, + publisher={Oxford University Press} +} + +@article{jin2003rank, + title={Rank-based inference for the accelerated failure time model}, + author={Jin, Zhezhen and Lin, DY and Wei, LJ and Ying, Zhiliang}, + journal={Biometrika}, + volume={90}, + number={2}, + pages={341--353}, + year={2003}, + url={https://doi.org/10.1093/biomet/90.2.341}, + publisher={Oxford University Press} +} + +@article{zhou2006simple, + title={A simple censored median regression estimator}, + author={Zhou, Lingzhi}, + journal={Statistica Sinica}, + pages={1043--1058}, + year={2006}, + url={https://www.jstor.org/stable/24307586}, + publisher={JSTOR} +} + +@article{powell1986censored, + title={Censored regression quantiles}, + author={Powell, James L}, + journal={Journal of Econometrics}, + volume={32}, + number={1}, + pages={143--155}, + year={1986}, + url={https://doi.org/10.1016/0304-4076(86)90016-3}, + publisher={Elsevier} +} + +@Manual{ctqrpackage, + title = {ctqr: {C}ensored and truncated quantile 
regression}, + author = {Paolo Frumento}, + year = {2021}, + note = {R package version 2.0}, + url = {https://CRAN.R-project.org/package=ctqr} +} + +@article{ackerberg2012practical, + title={A practical asymptotic variance estimation for two-step semiparametric estimators}, + author={Ackerberg, Daniel and Chen, Xiaohong and Hahn, Jinyong}, + journal={Review of Economics and Statistics}, + volume={94}, + number={2}, + pages={481--498}, + year={2012}, + url={https://doi.org/10.1162/REST_a_00251}, + publisher={The MIT Press} +} + +@article{kim2021comparison, + title={Comparison of variance estimation methods in semiparametric accelerated failure time models for multivariate failure time data}, + author={Kim, Kyuhyun and Ko, Jungyeol and Kang, Sangwook}, + journal={Japanese Journal of Statistics and Data Science}, + volume={4}, + number={2}, + pages={1179--1202}, + year={2021}, + url={https://doi.org/10.1007/s42081-021-00126-y}, + publisher={Springer} +} + +@Manual{Rcpppackage, + title = {Rcpp: Seamless R and C++ Integration}, + author = {Dirk Eddelbuettel and Romain Francois and JJ Allaire and Kevin Ushey and Qiang Kou and Nathan Russell and Inaki Ucar and Douglas Bates and John Chambers}, + year = {2022}, + note = {R package version 1.0.9}, + url = {https://CRAN.R-project.org/package=Rcpp} +} + +@Manual{RcppArmadillopackage, + title = {RcppArmadillo: `Rcpp' Integration for the `Armadillo' Templated Linear Algebra Library}, + author = {Dirk Eddelbuettel and Romain Francois and Doug Bates and Binxiang Ni and Conrad Sanderson}, + year = {2022}, + note = {R package version 0.11.1.1.0}, + url = {https://CRAN.R-project.org/package=RcppArmadillo} +} + +@article{siegel2021cancer, + title={Cancer statistics, 2021}, + author={Siegel, Rebecca L and Miller, Kimberly D and Fuchs, Hannah E and Jemal, Ahmedin}, + journal={CA: A Cancer Journal for Clinicians}, + volume={71}, + number={1}, + pages={7--33}, + year={2021}, + url={https://doi.org/10.3322/caac.21654}, + publisher={Wiley 
Online Library} +} + +@article{prentice1986case, + title={A case-cohort design for epidemiologic cohort studies and disease prevention trials}, + author={Prentice, Ross L}, + journal={Biometrika}, + volume={73}, + number={1}, + pages={1--11}, + year={1986}, + url={https://doi.org/10.1093/biomet/73.1.1}, + publisher={Oxford University Press} +} + +@Manual{ggplot2package, + title = {ggplot2: Create elegant data visualisations using the grammar of graphics}, + author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo and Hiroaki Yutani and Dewey Dunnington}, + year = {2022}, + note = {R package version 3.3.6}, + url = {https://CRAN.R-project.org/package=ggplot2} +} + +@article{koenker1994quantile, + title={Quantile smoothing splines}, + author={Koenker, Roger and Ng, Pin and Portnoy, Stephen}, + journal={Biometrika}, + volume={81}, + number={4}, + pages={673--680}, + year={1994}, + url={https://doi.org/10.1093/biomet/81.4.673}, + publisher={Oxford University Press} +} + +@article{koenker2004penalized, + title={Penalized triograms: {T}otal variation regularization for bivariate smoothing}, + author={Koenker, Roger and Mizera, Ivan}, + journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume={66}, + number={1}, + pages={145--163}, + year={2004}, + url={https://doi.org/10.1111/j.1467-9868.2004.00437.x}, + publisher={Wiley Online Library} +} + +@Manual{survivalpackage, + title = {survival: Survival analysis}, + author = {Terry M Therneau}, + year = {2021}, + note = {R package version 3.2-13}, + url = {https://CRAN.R-project.org/package=survival} +} + +@Article{brmspackage, + title = {Advanced {Bayesian} Multilevel Modeling with the {R} + Package {brms}}, + author = {Paul-Christian Bürkner}, + journal = {The R Journal}, + year = {2018}, + volume = {10}, + number = {1}, + pages = {395--411}, + doi = {10.32614/RJ-2018-017}, + encoding = {UTF-8} + } + 
+@article{koenker2008censored, + title={Censored quantile regression redux}, + author={Koenker, Roger}, + journal={Journal of Statistical Software}, + volume={27}, + pages={1--25}, + doi={https://doi.org/10.18637/jss.v027.i06}, + year={2008} +} + diff --git a/_articles/RJ-2024-007/qris.pdf b/_articles/RJ-2024-007/qris.pdf new file mode 100644 index 0000000000..f5ca912dbe Binary files /dev/null and b/_articles/RJ-2024-007/qris.pdf differ diff --git a/_articles/RJ-2024-007/qris.tex b/_articles/RJ-2024-007/qris.tex new file mode 100644 index 0000000000..40e05a72c5 --- /dev/null +++ b/_articles/RJ-2024-007/qris.tex @@ -0,0 +1,1428 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + +%\usepackage{orcidlink,thumbpdf,lmodern} +\usepackage{thumbpdf,lmodern} +%% another package (only for this demo article) +\usepackage{framed} +%\usepackage{subfigure} +\usepackage{pgfplots} +\usepackage{multirow} +\usepackage{amsthm} +\usepackage{soul} +\usepackage{bm} +\usepackage{alltt} +\usepackage{framed} +\usepackage{nameref} +\usepackage{graphicx} +\graphicspath{{figures/}} +\usepackage{subcaption} +\usepackage{caption} + +%% new custom commands +\newcommand{\class}[1]{`\code{#1}'} +\newcommand{\fct}[1]{\code{#1()}} +\newcommand{\Var}{\mathop{\rm Var}\nolimits} %Variance +\newcommand{\vect}[1]{\mathbf{#1}} +\newcommand{\matr}[1]{\mathbf{#1}} + +\newcommand{\bns}{\widehat{\bm{\beta}}_{\tiny\mbox{NS}}} +\newcommand{\bis}{\widehat{\bm{\beta}}_{\tiny\mbox{IS}}} +\newcommand{\bit}{\widehat{\bm{\beta}}_{\tiny\mbox{IT}}} + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% For rmd prints +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother +\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}} 
+\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}} +\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}} + +\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}} +\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}} +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}} +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}} +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}} +\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}} +\let\hlipl\hlkwb +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{ + \def\at@end@of@kframe{} + \ifinner\ifhmode + \def\at@end@of@kframe{\end{minipage}} +\begin{minipage}{\columnwidth} + \fi\fi + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth} + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}} +{\par\unskip\endMakeFramed + \at@end@of@kframe} +\makeatother +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} +\newenvironment{knitrout}{}{} + +%% load any required packages FOLLOWING this line + +\begin{document} + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{XX} +\volnumber{YY} +\year{20ZZ} +\month{AAAA} + +%% replace RJtemplate with your article +\begin{article} + +\title{Fitting a Quantile Regression Model for Residual Life with the R Package {q}ris} +\author{Kyu Hyun Kim, Sangwook Kang, and Sy Han Chiou} + +\maketitle + +\abstract{ + In survival analysis, regression modeling has traditionally focused on assessing covariate effects on survival times, + which is defined as the elapsed time between a baseline and event time. 
+ Nevertheless, focusing on residual life can provide a more dynamic assessment of covariate effects, + as it offers more updated information at specific time points between the baseline and event occurrence. + Statistical methods for fitting quantile regression models have recently been proposed, + providing favorable alternatives to modeling the mean of residual lifetimes. + Despite this progress, the lack of computer software that implements these methods remains an obstacle for researchers analyzing data in practice. + In this paper, we introduce an R package {qris}, which implements methods for fitting semiparametric quantile regression models on residual life subject to right censoring. + We demonstrate the effectiveness and versatility of this package through comprehensive simulation studies and + a real-world data example, showcasing its valuable contributions to survival analysis research. +} + +\section{Introduction} \label{sec:intro} + +In the analysis of time-to-event data, standard statistical inference procedures often focus on quantities +based on failure time and its relationship with covariates measured at baseline. +However, throughout the follow-up process, +inference procedures based on residual life become increasingly intuitive for assessing the survival of subjects +and can offer insights into the effectiveness of treatments in prolonging the remaining lifetime. +As covariates can substantially change over time and +models based solely on baseline covariates have limited potential for long-term prognosis, +there is a growing interest in modeling the remaining lifetime of a surviving subject with updated patient information. 
+Many efforts have been made to model the mean residual life including proportional mean residual life models +\citep{maguluri1994estimation, oakes1990note, oakes2003inference, chen2005semiparametric}, +additive mean residual life models \citep{chen2006linear, chen2007additive, zhang2010goodness}, +and proportional scaled mean residual life models \citep{liu2008regression}. +Given that failure times are usually right-skewed and heavy-tailed, +the mean of the residual life might not be identifiable if +the follow-up time is not sufficiently long. +For this reason, quantiles, which are robust under skewed distributions, +have traditionally been used more frequently as alternative summary measures. +For example, the approach to the semiparametric quantile regression model for continuous responses \citep{koenker1978regression} has been extended to uncensored failure time data +\citep{jung1996quasi, portnoy1997gaussian, wei2006quantile} +and censored failure time data \citep{ying1995survival, portnoy2003censored, peng2008survival, huang2010quantile}. + + +When the outcome variable is the residual life, +semiparametric quantile models that apply the inverse probability of censoring weighting (IPCW) +principle to address right-censored observations have been explored +\citep{jung2009regression, kim2012censored, li2016quantile}. +These approaches are based on non-smooth estimating functions with respect to regression parameters, +and the estimates of the regression parameters are obtained either through zero-crossing of +non-smooth estimating functions using grid search techniques \citep{jung2009regression} or +by optimizing non-smooth objective functions with $L_1$-minimization algorithms \citep{kim2012censored, li2016quantile}. +While these methods are relatively straightforward to implement, +an additional challenge lies in standard error estimation, +which necessitates the computationally intensive use of a multiplier bootstrap method \citep{li2016quantile}. 
+Alternatively, \citet{jung2009regression} and \citet{kim2012censored} utilized the minimum dispersion statistic and +the empirical likelihood method, respectively, +to bypass the need to directly estimate the variance of the regression parameter estimator for +hypothesis testing and constructing confidence intervals. +The non-smooth nature of the estimating functions in these approaches +precludes the estimation of variance using the robust sandwich-type variance estimator typically employed +in equation-based estimation methods. +To lessen the associated computational burden, an induced smoothing was proposed \citep{brown2005standard}, +which modifies the non-smooth estimating equations into smooth ones. +Leveraging the asymptotic normality of the non-smooth estimator, +the smooth estimating functions are constructed by averaging out the random perturbations +inherent in the non-smooth estimating functions. +The resulting estimating functions become smooth with respect to the regression parameters, +allowing for the straightforward application of standard numerical algorithms, such as the Newton-Raphson method. +Furthermore, these smoothed estimating functions facilitate the straightforward computation of variances using +the robust sandwich-type estimator. +The induced smoothing approach has been employed in fitting semiparametric accelerated failure time (AFT) models +via the rank-based approach \citep{johnson2009induced, aftgeepackage, chiou2015semiparametric, Kang:fitt:2016}. +Regarding quantile regression, \citet{choi2018smoothed} considered the induced smoothing approach under +a competing-risks setting. All of these methods are based on modeling event times. +Recently, \citet{kim2023smoothed} proposed an induced smoothing estimator for fitting +a semiparametric quantile regression model for residual life. + + + +The availability of published R packages for fitting quantile regression models is somewhat limited. 
+The \code{rq()}, \code{nlrq()}, \code{rqss()}, and \code{crq()} functions in the package \CRANpkg{quantreg} +\citep{quantregpackage} are predominantly used and provide various features for fitting linear, +nonlinear, non-parametric, and censored quantile regression models, respectively. +The \code{rq()} function minimizes non-smooth objective functions to obtain point estimates of regression coefficients +and can accommodate right-censored survival times by incorporating weights. +By redefining survival times as the remaining lifetime at time $t_0$, +one can also obtain a non-smoothed estimator for quantile regression models for residual life \citep{kim2012censored}. +On the other hand, the \code{nlrq()} function is designed to fit a nonlinear quantile regression model, while +the \code{rqss()} function fits additive quantile regression models with +nonparametric terms, including univariate components and bivariate components, +using smoothing splines and total variation regularization techniques \citep{koenker1994quantile, koenker2004penalized}. +% On the other hand, the \code{nlrq()} function is designed to fit a nonlinear quantile regression model, +% while the \code{rqss()} function fits additive quantile regression models with nonparametric terms, +% including univariate components and bivariate components, using smoothing splines and +% total variation regularization techniques \citep{koenker1994quantile, koenker2004penalized}. +Furthermore, the \code{crq()} function fits a quantile regression model for censored data on the $\tau$-th +conditional quantile function of the response variable. +Overall, the \CRANpkg{quantreg} implements three methods for handling right-censored survival times: \citet{powell1986censored}'s estimator, +\citet{portnoy2003censored}'s estimator and \citet{peng2008survival}'s estimator. 
+However, none of the implemented methods in the \code{nlrq()}, \code{rqss()}, or \code{crq()} functions +are applicable for handling censored residual life using the induced smoothing methods. +The only function that implements the induced smoothing method is the \code{aftsrr()} function in the package +\CRANpkg{aftgee} \citep{aftgeepackage}, +but it is specifically designed for fitting semiparametric AFT models, which are not directly applicable +to fitting quantile regression models. + + +% In an effort to lessen the computational burden in handling non-smooth estimating equations, +% the \code{aftsrr()} function in package \CRANpkg{aftgee} \citep{aftgeepackage} is the only function that implements the induced smoothing method in the context of fitting semiparametric AFT models. + +Other R packages that can be used to fit quantile regression models for survival data include the package +\CRANpkg{ctqr} \citep{ctqrpackage}, package \CRANpkg{Brq} \citep{Brqpackage}, package \CRANpkg{brms} \citep{brmspackage}, +and package \CRANpkg{cmprskQR} \citep{cmprskQRpackage}. +The \code{ctqr()} function in the package \CRANpkg{ctqr} implements the methods proposed in +\citet{ctqrpackage} for right or interval-censored failure times with left-truncation. +The \code{Bqr()} function in the package \CRANpkg{Brq} implements Bayesian methods based on the +asymmetric Laplace distribution. +In the package \CRANpkg{brms}, the \code{brm()} function with the \code{family=asym\_laplace()} +option enables the implementation of full Bayesian inference. +The \code{crrQR()} function in the package \CRANpkg{cmprskQR} allows fitting quantile regression models +with competing risks. +All of these R packages are designed for fitting quantile regression models for failure times defined from a baseline +and are not applicable to the residual life setting. 
+ +% In an effort to lessen the computational burden in handling non-smooth estimating equations, +% the \code{aftsrr()} function in package \CRANpkg{aftgee} \citep{aftgeepackage} is the only function that implements the induced smoothing method in the context of fitting semiparametric AFT models. + +The recently developed R package \CRANpkg{qris} \citep{R:qris} provides an efficient tool for +fitting semiparametric quantile regression models for residual life subject to right censoring. +The \CRANpkg{qris} package offers three methods for estimating the regression parameters: +$L_1$-minimization of non-smooth objective functions, induced smoothing with a non-iterative approach, +and an iterative procedure. +For standard error estimation, the \CRANpkg{qris} package provides two resampling-based approaches: +the partial multiplier bootstrap and the full multiplier bootstrap methods. +The partial multiplier bootstrap method utilizes the robust sandwich-type estimator by +incorporating the sample variance of perturbed estimating functions, +while the full multiplier bootstrap method is obtained by considering the sample variance +from the solutions of perturbed estimating functions. +To enhance the interpretability of results, the \CRANpkg{qris} package incorporates +graphical visualizations of covariate effects at different quantiles and base times, +utilizing the plotting environment similar to that in the \CRANpkg{ggplot2} package \citep{ggplot2package}, +%the \code{ggplot} plotting environment \citep{ggplot2package}, +thereby allowing for extensive flexibility and customization. +The ultimate goal of creating the \CRANpkg{qris} package is to facilitate +the easy incorporation of quantile regression for residual life into daily routines. +The package \CRANpkg{qris} is available on the Comprehensive R Archive Network (CRAN) at +\url{https://CRAN.R-project.org/package=qris}. 
+ +The rest of the article is organized as follows: Section~\nameref{sec:nsm} introduces +a semiparametric regression model for quantiles of residual life and the estimation methods +implemented in the package. +Section~\nameref{sec:implementation} provides details about computing algorithms. +Illustrations of the package using a simulated dataset and the real data from the +North Central Cancer Treatment Group +are presented in Section~\nameref{sec:illustration}. +Finally, in Section~\nameref{sec:conclusion}, concluding remarks are provided along with some discussions. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Model %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Semiparametric quantile regression for residual life} +\label{sec:nsm} + +Define $T$ as the potential failure time that is subject to right censoring by $C$ +and $\vect{X}$ as a $p \times 1$ vector of covariates, +where $p$ is the number of covariates, including an intercept. +The observed data consists of +$n$ independent copies of $(Z, \delta, \vect{X})$, where $Z = \min(T, C)$, +$\delta = I(T \leq C)$, % is the failure indicator, +and $I(\cdot)$ is an indicator function. +We also assume $T$ and $C$ are marginally independent. +Define the $\tau$-th quantile of the residual life at $t_0 > 0$ as +$\theta_{\tau}(t_0)$ that satisfies $P(T_i - t_0 \geq \theta_{\tau}(t_0) \ | \ T_i > t_0) = 1 - \tau$. +We consider the semiparametric quantile regression model for the residual life \citep{kim2012censored, kim2023smoothed}. Given $T_i > t_0$, +\begin{equation} \label{qr:mod1} + \log(T_i - t_0) = \vect{X}_{i}^{\top}\bm{\beta}_0(\tau, t_0) + \epsilon_i, i = 1, \ldots, n, %\label{qr:mod2} +\end{equation} +where $\bm{\beta}_0(\tau, t_0)$ is a $p \times 1$ vector of regression coefficients, +and $\epsilon_i$ is a random error having zero $\tau$-th quantile. +The quantile regression model for a continuous response \citep{koenker1978regression} +is a special case of Equation~\eqref{qr:mod1} when $t_0 = 0$. 
+For ease of notation, we omit $\tau$ and $t_0$ in $\bm{\beta}_0(\tau, t_0)$ and $\theta_{\tau}(t_0)$
+and write $\bm{\beta}_0$ and $\theta$.
+We present different estimation procedures to estimate $\bm{\beta}_0$ given $\tau$ and $t_0$ in the following.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%% Non-smooth model point estimation %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Estimation using non-smooth functions} \label{sec:nsm:pt}
+
+When there is no censoring, an estimator for $\bm{\beta}_0$ in Equation~\eqref{qr:mod1}
+can be obtained by solving the estimating equation \citep{kim2012censored}, where
+\begin{equation} \label{eq:ns:obj1}
+  \frac{1}{n}\sum_{i=1}^{n}I[T_i \ge t_0] \vect{X}_i \left\{I\left[\log(T_i - t_0) \leq \vect{X}_i^{\top}\bm{\beta} \right] - \tau \right\} = 0.
+\end{equation}
+However, Equation~\eqref{eq:ns:obj1} cannot be directly used when $T_i - t_0$ is subject to right censoring.
+The IPCW technique can be incorporated into Equation~\eqref{eq:ns:obj1}
+to account for the right censoring \citep{li2016quantile}.
+Specifically, in the presence of right censoring,
+the estimator for $\bm{\beta}_0$ in Equation~\eqref{qr:mod1} can be obtained as the root of the following weighted estimating equations:
+\begin{equation} \label{eq:nsm:ipw}
+  U_{t_0}(\bm{\beta}, \tau) = \frac{1}{n}\sum_{i=1}^{n}I[Z_i \ge t_0] \vect{X}_i \left\{I \left[\log(Z_i - t_0) \leq \vect{X}_i^{\top} \bm{\beta} \right]\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0)} -\tau \right\},
+\end{equation}
+where $\widehat{G}(\cdot)$
+is the Kaplan-Meier estimate of the survival function $G(\cdot)$ of the censoring time $C$ and
+$\widehat{G}(t) = \prod_{i: t_i \leq t} (1 - \sum_{j=1}^n (1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n I(Z_j \geq t_i))$.
+A computational challenge arises because the exact solution to Equation~\eqref{eq:nsm:ipw} might not exist
+due to the non-smoothness in $\bm{\beta}$ caused by the involvement of indicator functions.
+When the exact solutions do not exist, the root of Equation~\eqref{eq:nsm:ipw} can be approximated by
+minimizing the $L_1$-objective function $L_{t_0}(\bm{\beta}, \tau)$ \citep{li2016quantile},
+\begin{align*}
+  \label{l1:nsm}
+  \nonumber
+  L_{t_0}(\bm{\beta}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/\widehat{G}(t_0)} \left| \log(Z_i - t_0) - \vect{X}_i^{\top}\bm{\beta} \right| + \\
+  & \left| M - \bm{\beta}^{\top}\sum_{l=1}^n - \vect{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}(Z_l)/\widehat{G}(t_0)}\right| +
+  \ \left| M - \bm{\beta}^{\top}\sum_{l=1}^n 2\tau \vect{X}_l I[Z_l > t_0]\right|,
+\end{align*}
+where $M > 0$ bounds
+$\left| \bm{\beta}^{\top}\sum_{i=1}^n - \vect{X}_i \frac{\delta_i I[Z_i > t_0]}{\widehat{G}(Z_i)/ \widehat{G}(t_0)}\right|$
+and $\left| \bm{\beta}^{\top}\sum_{i=1}^n 2\tau \vect{X}_i I[Z_i > t_0]\right|$ from above.
+Numerically, the limit $M$ is set to be an extremely large number, and the \code{qris()} function uses $M = 10^6$.
+Denote the resulting estimator to be $\bns$.
+It has been shown that $\bns$ is consistent for $\bm{\beta}_0$ and asymptotically normally distributed
+\citep{li2016quantile}.
+
+Despite the well-established asymptotic properties, directly estimating the variance of $\bns$ is impractical
+because it involves the derivative of non-smooth functions.
+A multiplier bootstrap method has typically been employed \citep{li2016quantile} to address this difficulty.
+The multiplier bootstrap method considers the perturbed version of $U_{t_0}(\bm{\beta}, \tau)$, defined as
+\begin{equation*}
+  \label{eq:nsm:rev}
+  U_{t_0}^{\ast}(\bm{\beta}, \tau) = \frac{1}{n}\sum_{i=1}^{n} \eta_i I[Z_i \ge t_0] \vect{X}_i \left\{I \left[\log(Z_i - t_0) \leq \vect{X}_i^{\top} \bm{\beta} \right]\frac{\delta_i}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} -\tau \right\},
+\end{equation*}
+where $\eta_i, i = 1, \ldots, n, $ are independently and identically distributed (iid),
+generated from a positive random variable with unity mean and variance,
+and $\widehat{G}^\ast(\cdot)$ is a perturbed version of $\widehat{G}(\cdot)$,
+constructed as
+$\widehat{G}^\ast(t) =
+\prod_{i: t_i \leq t} (1 - \sum_{j=1}^n \eta_j(1 - \delta_j)I(Z_j \leq t_i) / \sum_{j=1}^n \eta_jI(Z_j \geq t_i))$
+for a given realization of $\eta_i$.
+% by substituting $\sum_{j=1}^n (1-\delta_j) I(Z_j \leq t)$ in the numerator and $\sum_{j=1}^n I(Z_j \geq t)$
+% in the denominator with
+% $\sum_{j=1}^n \eta_j (1-\delta_j) I(Z_j \leq t)$ and $\sum_{j=1}^n \eta_j I(Z_j \geq t)$ given $(\eta_1, \ldots, \eta_n)$, respectively.
+On the other hand, a perturbed $L_1$-objective function, denoted as $L_{t_0}^{\ast}(\bm{\beta}, \tau)$,
+can be similarly constructed, where
+\begin{align*}
+  L_{t_0}^{\ast}(\bm{\beta}, \tau) = & \frac{1}{n}\sum_{i=1}^n \frac{\delta_i I[Z_i > t_0]}{\widehat{G}^{\ast}(Z_i)/\widehat{G}^{\ast}(t_0)} \left| \log(Z_i - t_0) - \vect{X}_i^{\top}\bm{\beta} \right| + \nonumber \\
+  & \left| M - \bm{\beta}^{\top}\sum_{l=1}^n - \vect{X}_l \frac{\delta_l I[Z_l > t_0]}{\widehat{G}^{\ast}(Z_l)/\widehat{G}^{\ast}(t_0)}\right| +
+  \ \left| M - \bm{\beta}^{\top}\sum_{l=1}^n 2\tau \vect{X}_l \eta_l I[Z_l > t_0]\right|.
+\end{align*}
+Solving for $U_{t_0}^{\ast}(\bm{\beta}, \tau) = 0$, or equivalently,
+minimizing $L_{t_0}^{\ast}(\bm{\beta}, \tau)$, yields one realization of $\bns$.
+The multiplier bootstrap variance is computed as the sample variance of
+a large number of realizations of $\bns$.
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Induced smoothing %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Estimation using induced smoothed functions} \label{sec:IS:pt} + +The regression coefficient in Equation~\eqref{qr:mod1} can be more efficiently obtained +through the induced smoothed version of Equation~\eqref{eq:nsm:ipw}. +The induced smoothed estimating functions are constructed by taking +the expectation with respect to a mean-zero random noise added to the +regression parameters in Equation~\eqref{eq:nsm:ipw}. +Specifically, +\begin{align}\label{eq:is} + \widetilde{U}_{t_0}(\bm{\beta}, \tau, H) & = E_w \{U_{t_0}(\bm{\beta}+\matr{H}^{1/2}\matr{W}, \tau)\}\nonumber\\ + & = \frac{1}{n} \sum_{i=1}^{n} I[Z_i > t_0] \vect{X}_i \left\{ \Phi\left(\frac{\vect{X}_i^\top\bm{\beta}-\log(Z_i-t_0)}{\sqrt{\vect{X}_i^{\top} \matr{H} \vect{X}_{i}}}\right)\frac{\delta_i}{\widehat{G}(Z_i)/\widehat{G}(t_0) } -\tau \right\}, +\end{align} +where $\matr{H} = O(n^{-1})$, +$\matr{W} \sim N(0, \matr{I}_p)$ is a standard normal random vector, +$\matr{I}_p$ is the $p \times p $ identity matrix, +and $\Phi(\cdot)$ is the cumulative distribution function of a standard normal random variable. +A typical choice for $\matr{H}$ is to fix it at $n^{-1}\matr{I}_p$, +while some alternative choices are explored in \citet{chiou2015rank}. +Let $\bis$ be the solution to $\widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H}) = 0$. +Since Equation~\eqref{eq:is} is a smooth function in $\bm{\beta}$, +the estimator can be obtained using standard numerical algorithms such as the Newton-Raphson method. +Moreover, the induced smoothed estimator for $\bm{\beta}_0$ has been shown to be +asymptotically equivalent to its non-smooth counterpart \citep{kim2023smoothed}. + + +Following the idea in Section~\nameref{sec:nsm:pt}, +the multiplier bootstrap procedure can be similarly employed to estimate the variance of $\bis$. 
+The perturbed version of Equation~\eqref{eq:is} takes the form of
+\begin{equation} \label{eq:7}
+  \widetilde{U}^{\ast}_{t_0}(\bm{\beta}, \tau, \matr{H}) = \frac{1}{n} \sum_{i=1}^{n} \eta_i I[Z_i > t_0] \vect{X}_i \left\{ \Phi\left(\frac{\vect{X}_i^\top\bm{\beta} - \log(Z_i-t_0)}{\sqrt{\vect{X}_i^{\top} \matr{H} \vect{X}_{i}}}\right)\frac{\widehat{G}^{\ast}(t_0) \delta_i}{\widehat{G}^{\ast}(Z_i)} -\tau \right\}.
+\end{equation}
+The multiplier bootstrap procedure estimates the variance of $\bis$ by calculating the sample variance of
+a large number of realizations of $\bis$ obtained by repeatedly solving Equation~\eqref{eq:7}.
+
+
+It has been shown that the asymptotic variance
+$\Var(\bm{\beta}, \tau)$ can be decomposed into
+$\matr{A}(\bm{\beta})^{-1} \matr{V}(\bm{\beta}) \matr{A}(\bm{\beta})^{-1}$ \citep{kim2023smoothed},
+where the two components, $\matr{A}(\bm{\beta})$ and $\matr{V}(\bm{\beta})$, can be estimated separately.
+Since Equation~\eqref{eq:is} is a smooth function in $\bm{\beta}$, the slope matrix,
+$\matr{A}(\bm{\beta})$, can be conveniently estimated by differentiating
+$\widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H})$ with respect to $\bm{\beta}$.
+The explicit form of $\matr{A}(\bm{\beta})$ is as follows:
+\begin{align} \label{eq:cov:slp}
+  \matr{A}(\bm{\beta}) & = \frac{\partial \widetilde{U}_{t_0}(\bm{\beta}, \tau, \matr{H})}{\partial \bm{\beta}} \nonumber \\
+  & = \frac{1}{n}\sum_{i=1}^{n} I[Z_i > t_0] \vect{X}_i \frac{G(t_0) \delta_i}{G(Z_i)} \phi\left(\frac{{\vect{X}_i}^{\top}\bm{\beta} - \log(Z_i-t_0)}{\sqrt{{\vect{X}_i}^{\top}\matr{H} \vect{X}_i}}\right)\left(\frac{-{\vect{X}_i}}{\sqrt{{\vect{X}_i}^{\top} \matr{H} {\vect{X}_i}}}\right),
+\end{align}
+where $\phi (\cdot)$ is the density function of the standard normal random variable.
+
+The slope matrix, $\widehat{\matr{A}}(\bis)$, can be evaluated directly
+by plugging in $\bis$ and $\widehat{G}(\cdot)$.
+On the other hand, the variance of the estimating function, +$\widehat{\matr{V}}(\bm{\beta})$, can be obtained by a computationally efficient +resampling method motivated by the multiplier bootstrap procedure in +Section~\nameref{sec:nsm:pt}. +Specifically, we propose estimating $\widehat{\matr{V}}(\bis)$ as the +simple variance of a large set of realizations of the perturbed version of +$\widetilde{U}_{t_0}(\bis, \tau, \matr{H})$ presented in Equation~\eqref{eq:7}. +We refer to this procedure as the partial multiplier bootstrapping approach +because it utilizes the perturbed estimating function, +similar to the full multiplier bootstrapping approach, +but the computation of $\widehat{\matr{A}}(\bis)$ and $\widehat{\matr{V}}(\bis)$ +does not involve the repeated solving of the perturbed estimating equations. +Thus, the partial multiplier bootstrapping approach is expected to be computationally +more efficient than the multiplier bootstrap method. +A similar procedure and its performance have been studied in modeling failure +times with semiparametric AFT models \citep{chiou2014fast,aftgeepackage}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Iteration procedure %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Iterative procedure in induced smoothing estimation} \label{sec:iter} + +The induced estimator $\bis$ is obtained with a fixed $\matr{H}$, +as described in Section~\nameref{sec:IS:pt}, and its variance is estimated separately. +This estimation procedure can be viewed as a special case of the following iterative procedure, +which updates $\matr{H}$ and $\bis$ iteratively. +Specifically, the iterative algorithm utilizes the Newton-Raphson method while sequentially updating $\bis$ +and $\widehat{\Var}(\bis)$ until convergence. +Similar iterative algorithms have also been considered previously in the induced smoothing approach +for semiparametric AFT models \citep{johnson2009induced, chiou2014fast, chiou2015semiparametric, choi2018smoothed}. 
+The iterative procedure is summarized as follows:
+\begin{description}
+\item[\bf Step 1:]
+  Set the initial values $\widehat{\bm{\beta}}^{(0)}$,
+  $\widehat{\matr{\Sigma}}^{(0)} = \matr{I}_{p}$,
+  and $\matr{H}^{(0)} = n^{-1}\widehat{\matr{\Sigma}}^{(0)}$.
+\item[\bf Step 2:]
+  Given $\widehat{\bm{\beta}}^{(k)}$ and $\matr{H}^{(k)}$ at the $k$-th step, update $\widehat{\bm{\beta}}^{(k)}$ by
+  \begin{equation*}
+    \widehat{\bm{\beta}}^{(k+1)}=\widehat{\bm{\beta}}^{(k)} - \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k)})^{-1}\widetilde{U}_{t_0}(\widehat{\bm{\beta}}^{(k)}, \tau, \matr{H}^{(k)}).
+  \end{equation*}
+\item[\bf Step 3:]
+  Given $\widehat{\bm{\beta}}^{(k+1)}$ and $\widehat{\matr{\Sigma}}^{(k)}$, update $\widehat{\matr{\Sigma}}^{(k)}$ by
+  \begin{equation*}
+    \widehat{\matr{\Sigma}}^{(k+1)} = \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k+1)})^{-1} \widehat{\matr{V}}(\widehat{\bm{\beta}}^{(k+1)}, \tau) \widehat{\matr{A}}(\widehat{\bm{\beta}}^{(k+1)})^{-1}.
+  \end{equation*}
+\item[\bf Step 4:]
+  Set $\matr{H}^{(k+1)} = n^{-1}\widehat{\matr{\Sigma}}^{(k+1)}$. Repeat Steps 2, 3 and 4 until $\widehat{\bm{\beta}}^{(k)}$ and $\widehat{\matr{\Sigma}}^{(k)}$ converge.
+\end{description}
+The initial value, $\widehat{\bm{\beta}}^{(0)}$, could be chosen as $\bns$.
+We define $\bit$ and $\widehat{\matr{\Sigma}}_{\tiny\mbox{IT}}$ as the
+values of $\widehat{\bm{\beta}}^{(k)}$ and $\widehat{\matr{\Sigma}}^{(k)}$ at convergence,
+and $\widehat{\Var}(\bit) = n^{-1}\widehat{\matr{\Sigma}}_{\tiny\mbox{IT}}$.
+In Step 3, $\widehat{\matr{V}}(\widehat{\bm{\beta}}^{(k+1)}, \tau)$
+is obtained using the partial multiplier bootstrap approach.
+However, the full multiplier bootstrap approach can also be employed
+but would require longer computation times.
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Package implementation %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Package implementation} +\label{sec:implementation} + +The main function in the \CRANpkg{qris} package for +estimating the regression parameters in the quantile regression model for residual life is the +\code{qris()} function. +The \code{qris()} function is written in C++ and incorporated into R +using the \CRANpkg{Rcpp} \citep{Rcpppackage} and \CRANpkg{RcppArmadillo} \citep{RcppArmadillopackage} packages. +The synopsis of \code{qris} is: + +\begin{example} + > args(qris) + function (formula, data, t0 = 0, Q = 0.5, nB = 100, method = c("smooth", + "iterative", "nonsmooth"), se = c("fmb", + "pmb"), init = c("rq", "noeffect"), verbose = FALSE, + control = qris.control()) +\end{example} +% \input{codes/argsqris.tex} + +The required argument is \code{formula}, +which specifies the quantile regression model to be fitted using the variables in \code{data}. +The \code{formula} assumes that the response variable is a \class{Surv} object +created by the \code{Surv()} function in the \CRANpkg{survival} package \citep{survivalpackage}. +This formula structure is commonly adopted for handling survival data in R, as seen in functions +like \code{survreg()} and \code{coxph()} in the \CRANpkg{survival} package. +The argument \code{t0} specifies the base time used in defining residual life. +The default value of \code{t0} is set to zero, in which case residual life reduces to a failure time. +The \code{Q} argument is used to specify the target quantile of residual life to estimate, +with the default value being set to 0.5 (median). +The \code{nB} argument specifies the bootstrapping size used in standard error estimation, +with the default value set to 100. 
+The \code{method} argument specifies one of the three estimation methods:
+\code{"nonsmooth"}, \code{"smooth"}, and \code{"iterative"},
+corresponding to the estimating procedures outlined in Sections~\nameref{sec:nsm:pt},
+\nameref{sec:IS:pt}, and~\nameref{sec:iter}, respectively.
+Given the point estimates of the regression parameters,
+their standard errors can be estimated using one of two implemented methods:
+\code{se = "fmb"} and \code{se = "pmb"}.
+The \code{se = "fmb"} method employs a full-multiplier bootstrapping approach to
+estimate the variance by the sample variance of a large number of realizations of $\widehat{\bm{\beta}}$.
+The \code{se = "pmb"} method estimates the variance using a robust sandwich variance estimator
+and employs the computationally efficient partial multiplier bootstrapping approach described in
+Section~\nameref{sec:IS:pt}.
+The \code{"fmb"} option is available for all three point estimation methods,
+whereas the \code{"pmb"} option is not available for the \code{"nonsmooth"}
+point estimation method due to the lack of a closed-form sandwich variance estimator.
+The \code{init} argument allows users to specify the initial value for estimating regression parameters
+by either a $p$-dimensional numerical vector or a character string.
+In the latter case, the options \code{init = "rq"} and \code{init = "noeffect"} correspond to
+the point estimate obtained from the \code{rq()} function in the \CRANpkg{quantreg} package
+and a $p$-dimensional vector of zeros, respectively.
+The default value for \code{init} is \code{init = "rq"}.
+Among the three methods implemented for point estimation, \code{method = "smooth"} and
+\code{method = "nonsmooth"} are non-iterative,
+in the sense that point estimation is performed separately from the estimation of standard errors.
+On the other hand, \code{method = "iterative"} calculates point estimates and the corresponding
+standard error estimates simultaneously through iterative updates.
+When \code{method = "iterative"}, users can define specific convergence criteria using \code{qris.control()}. +The available options in \code{qris.control()} are as follows. + +\begin{example} + > args(qris.control) + function (maxiter = 10, tol = 0.001, trace = FALSE) +\end{example} +% \input{codes/argscontrol.tex} + +The \code{maxiter} argument specifies the maximum number of iterations. +The default value for \code{maxiter} is ten, +as the proposed algorithm typically converges within ten steps based on our exploration. +The convergence tolerance is controlled using the \code{tol} argument, +which has a default value of \code{1e-3}. +The \code{trace} argument takes a logical value and +is used to determine whether to print the result for each iteration. +The default setting is \code{trace = FALSE}. +The \class{qris} object is fully compatible with many of R's generic functions, +including \code{coef()}, \code{confint()}, \code{plot()}, \code{predict()}, +\code{print()}, \code{residuals()}, \code{summary()}, and \code{vcov()}. + + +Among the available \code{S3} methods, +a unique feature of the \CRANpkg{qris} package's \code{S3 plot} method, +when applied to a \class{qris} object, is its ability to automatically +update the original object by extending the range of $\tau$ or $t_0$ values. +This extension enables the generation of a covariate effect plot over the +newly specified values of $\tau$ or $t_0$, +providing a comprehensive visualization of the covariate effects across the extended range. +The \code{S3} method for plotting a \class{qris} object is shown below. +\begin{example} + > argsAnywhere(plot.qris) + function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL, byQs = FALSE, + ggextra = NULL, ...) + NULL +\end{example} +The argument \code{x} is a \class{qris} object created using the \code{qris()} function. 
+The \code{t0s} and \code{Qs} arguments are numeric vectors that enable users to specify
+the values of $t_0$ or $\tau$ for plotting the covariate effect.
+If \code{t0s} and \code{Qs} are not specified,
+the covariate effects are plotted against $\tau = 0.1, 0.2, \ldots, 0.9$
+at the base time ($t_0$) inherited from the \class{qris} object specified in \code{x}.
+The \code{nB} argument is a numerical variable that controls the sample size for bootstrapping,
+used to compute standard error estimations based on the variance estimation specified
+in the original \class{qris} object.
+When \code{nB} is specified, the function calculates standard errors
+for all combinations of $t_0$ and $\tau$ specified in \code{t0s} and \code{Qs},
+computes 95\% confidence intervals accordingly,
+and includes them in the covariate effect plot.
+The \code{vari} argument is a character string that allows users to specify the
+names of the covariates they want to display in the effect plots.
+When the \code{vari} argument is not specified,
+all covariates will be included in the plots by default.
+The covariate effect plot can be plotted against the specified quantiles by
+setting \code{byQs = TRUE} or against the specified base times by setting \code{byQs = FALSE}.
+Finally, the \code{ggextra} argument allows users to pass additional graphical parameters
+to the \CRANpkg{ggplot2} package, offering further customization options for the plots.
+When the \code{plot()} function is called, it internally invokes the \code{qris.extend()}
+function to compute the covariate effects at additional values.
+The syntax for the \code{qris.extend()} function is provided below:
+\begin{example}
+  > args(qris.extend)
+  function (x, t0s = NULL, Qs = NULL, nB = NULL, vari = NULL)
+  NULL
+\end{example}
+The arguments in \code{qris.extend()} are inherited from the arguments specified in
+the \code{plot()} function.
+To reduce runtime when repeatedly calling the \code{plot()}, +one can calculate the desired covariate effects by applying \code{qris.extend()} +outside of \code{plot()} first and then supply the results to \code{plot()}. +This approach allows for pre-computation of the covariate effects, making it more +efficient when generating multiple plots. +Overall, the unique plotting feature in \CRANpkg{qris} +provides users with a seamless and effortless approach to conducting a +comprehensive assessment of the covariate effects across different quantiles or base times. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Illustration %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%- +\section{Illustration} \label{sec:illustration} + +\subsection{Simulated data}\label{subsec:simulation} +In this subsection, we present a simple simulation example to validate the implementations in the +proposed \CRANpkg{qris} package. +The simulation involves five covariates, denoted as $X_1, \ldots, X_5$. +Among these covariates, $X_1$ and $X_4$ follow a standard uniform distribution, +$X_2$ follows a binomial distribution with a success probability of 0.5, +$X_3$ follows a standard normal distribution, and $X_5$ follows a standard exponential distribution. +We assume that $X_2, X_3, X_4$, and $X_5$ do not impact the residual life, +meaning their corresponding coefficient values $\beta_2$, $\beta_3$, $\beta_4$, and $\beta_5$ are zero. +The survival time $T$ is generated from a Weibull distribution with the survival function +$S(t) = \exp\{-(\rho t)^\kappa\}$ for $t > 0$, where $\kappa = 2$, and $\rho$ is obtained by solving +\begin{equation} \label{eq:sim:weibull} + \rho^{-1}\{ (\rho t_0)^\kappa - \log (1-\tau) \}^{(1/\kappa)}- t_0 = \exp\{\beta_0 + \beta_1 X_1\}, +\end{equation} +for a specified $t_0$ and $\tau$. +We set the intercept $\beta_0 = \log(5)$ and $\beta_1 = \log(2)$ at $t_0 = 0$. 
+Given $\rho$, $\tau$, and $X_1$, the true values of $\beta_0$ and $\beta_1$
+can be obtained sequentially from Equation~\eqref{eq:sim:weibull} for different $t_0 > 0$.
+In our case, the corresponding true values of $\beta_0$ are approximately 1.411 and 1.219 for $t_0=1$ and 2, respectively.
+Similarly, the true values of $\beta_1$ are approximately 0.797 and 0.907 for $t_0=1$ and 2, respectively.
+The closed-form expression for generating $T$ is then $\{ -\log(1 - u) \}^{1/\kappa} / \rho$,
+where $u$ is a uniform random variable over $(0, 1)$.
+Given these specifications,
+we have implemented the \code{data.gen()} function to generate simulation data.
+The \code{data.gen()} function takes four arguments:
+\code{n}, \code{t0}, \code{cen}, and \code{Q}, representing the sample size, $t_0$, censoring proportion,
+and $\tau$, respectively.
+We generate censoring times $C$ from an independent uniform distribution over $(0, c)$,
+where $c$ is chosen to achieve the desired censoring proportions of 10\% and 30\%.
+Using the generated dataset, we fit the model using three different estimation methods:
+induced smoothing, non-smooth, and iterative-induced smoothing.
+All analyses were conducted on a 4.2 GHz quad-core Intel(R) Core(TM) i7-7700K central processing unit (CPU) using R 4.3.0 \citep{r2021}.
+The following code demonstrates the implementation of \code{data.gen()} to generate a simulation dataset.
+\begin{example} + > data.gen <- function(n, t0, cen = .3, Q = .5) { + + if (!(t0 %in% 0:2)) + + stop("T0 is limited to three specific values: 0, 1, or 2.") + + if (!(cen %in% c(0, .1, .3))) + + stop("Censoring is limited to three specific values: 0%, 10%, or 30%.") + + if (!(Q %in% c(.25, .5))) + + stop("Q is limited to two specific values: 0.25, or 0.50.") + + censoring <- Inf + + if (t0 == 0) { + + if (cen == .1) censoring <- runif(n, 0, 125.1) + + if (cen == .3) censoring <- runif(n, 0, 25.49) + + beta0 <- log(5); beta1 <- log(2) + + } + + if (t0 == 1) { + + if (cen == .1) censoring <- runif(n, 0, 120.8) + + if (cen == .3) censoring <- runif(n, 0, 23.41) + + beta0 <- 1.410748; beta1 <- 0.7974189 + + } + + if (t0 == 2) { + + if (cen == .1) censoring <- runif(n, 0, 120.6) + + if (cen == .3) censoring <- runif(n, 0, 26.20) + + beta0 <- 1.219403; beta1 <- 0.9070615 + + } + + dat <- data.frame(censoring, + + Time0 = sqrt(-log(1 - runif(n))), + + X1 = runif(n), + + X2 = rbinom(n, 1, .5), + + X3 = rnorm(n), + + X4 = runif(n), + + X5 = rexp(n, 1)) + + rho <- (-log(1 - Q))^0.5 * (((exp(beta0 + beta1 * dat$X1) + t0)^2 - t0^2)^-0.5) + + dat$Time0 <- dat$Time0 / rho + + dat$Time <- pmin(dat$Time0, dat$censoring) + + dat$status <- 1 * (dat$Time0 < dat$censoring) + + subset(dat, select = c(Time, status, X1, X2, X3, X4, X5)) + + } + > set.seed(3) + > head(data.gen(200, 0)) + + Time status X1 X2 X3 X4 X5 + 1 4.283379 0 0.09137221 0 2.1638425 0.33833437 0.8751895 + 2 14.797025 1 0.81196535 1 0.8803785 0.82101134 0.3648634 + 3 5.934559 1 0.60923418 1 0.5051163 0.56536790 0.3997803 + 4 7.223266 1 0.54550179 1 0.1105902 0.32417202 1.2169470 + 5 15.128553 1 0.86115736 0 -0.2928586 0.05825095 0.1835962 + 6 5.135852 1 0.28915525 0 0.7723200 0.94126325 0.3809120 +\end{example} +% \input{codes/datagen} +The \code{data.gen()} function generates a \code{data.frame} containing seven variables. 
+The \code{Time} variable represents the observed survival time, +while the \code{status} variable serves as the event indicator, +taking the value 1 for observed events and 0 for censored observations. +The variables \code{X1}, $\ldots$, \code{X5} are the covariates. +The implementation in the \code{data.gen()} function generates the Weibull survival times +using the inverse probability integral transform technique. +Alternatively, users can use the \code{rweibull()} function with the parameters +\code{shape = 2} and \code{scale = 1 / rho} to generate these Weibull survival times directly. + +We assess the performance of the proposed implementation across various scenarios, +including three sample sizes ($n = 200, 400, 1000$), three levels of $t_0$ ($0, 1, 2$), +two censoring proportions (10\% and 30\%), and two values of $\tau$ (0.25 and 0.50). +For a given dataset, we apply the full-multiplier bootstrapping approach with 200 bootstrap samples +to all three available estimating procedures: +\code{method = "nonsmooth"}, \code{method = "smooth"}, and \code{method = "iterative"}. +To facilitate the evaluation process, we create the \code{do\_fmb()} function +to record the coefficient estimates, standard errors, +and computing times for fitting a single simulated dataset generated from \code{data.gen()}. +The following is the implementation of the \code{do\_fmb()} function and the corresponding code +to run the simulation with 200 replications. +We present the code and result of the simulation experiments conducted at three different sample sizes, +with $t_0$ values set to 0 and 1, +while holding the censoring proportion at 30\% and $\tau$ value at 0.5. +The results for other simulation scenarios are provided in the Supplementary Materials. 
+\begin{example} + > do_fmb <- function(n, t0, cen, Q, nB) { + + dat <- data.gen(n, t0, cen, Q) + + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + + stamp <- NULL + + stamp[1] <- Sys.time() + + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "fmb") + + stamp[2] <- Sys.time() + + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "nonsmooth", se = "fmb") + + stamp[3] <- Sys.time() + + f3 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "fmb") + + stamp[4] <- Sys.time() + + list(smooth = c(f1$coef, f1$std), + + nonsmooth = c(f2$coef, f2$std), + + iter = c(f3$coef, f3$std), + + times = diff(stamp)) + + } + > B <- 200 + > set.seed(2) + > sims0_fmb <- mapply(function(n, t0) + + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) + > sim1_fmb <- mapply(function(n, t0) + + replicate(B, do_fmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +\end{example} +% \input{codes/simulation_fmb} + +Figure~\ref{fig:sim1} displays violin plots that provide visualizations of the empirical +distribution of the coefficient estimates. +As expected, all three estimators exhibit small biases, +which are calculated as the difference between the point estimates (PE) and the true regression coefficients. +Furthermore, the empirical distributions of the PEs demonstrate a normal-like shape, +aligning with the asymptotic properties of the proposed method \citep{li2016quantile, kim2023smoothed}. +When the sample size is smaller (e.g., $n = 200$ and 400), +the \code{nonsmooth} approach appears to yield slightly larger empirical standard errors (ESE) +compared to the \code{smooth} or \code{iterative} approaches. +However, when $n = 1000$, the ESEs are similar across all approaches. 
+On the other hand, the comprehensive simulation results presented in Table 1 of the Supplementary Materials
+confirm that all coefficient estimates closely approximate the true regression coefficients.
+Moreover, the ESEs and the averaged estimated standard errors (ASE) are in close agreement for all scenarios,
+indicating the validity of the variance estimation.
+Furthermore, the computation times, which are presented separately in the upper panel of Table~\ref{tab:time},
+indicate that when employing the full multiplier bootstrapping approach,
+the \code{nonsmooth} approach demonstrates a slight advantage in terms of computational efficiency over the
+\code{smooth} approach, while the \code{iterative} approach takes 5.1 to 9.5 times longer than the \code{smooth} approach.
+In summary, the timing results show that the proposed method can yield valid inference results within seconds,
+even with large datasets of up to 1000 observations or
+when using the computationally demanding full multiplier bootstrapping approach for variance estimation.
+
+% As expected, all three estimators yield similar point estimates (PE),
+% empirical standard error (ESE)
+% computed as the simple standard deviation of the 200 PEs,
+% and averaged estimated standard error (ASE) computed as the average of the 200
+% full multiplier bootstrap standard errors.
+% More importantly, all PEs are close to true regression coefficients confirming the unbiasedness of the proposed estimators.
+% The ESE and ASE are in close agreement for all scenarios,
+% indicating the validity of the variance estimation.
+% On the other hand, Table~\ref{tab:time} shows that, when the full multiplier bootstrapping approach is employed (fmb), the non-smooth approach has a slight edge over the smooth approach in terms of computing time while the iterative estimator took 5.1 to 9.5 times longer than the smooth approach.
+% Overall, the timing results show the proposed method can provide valid inference results +% in seconds even with dataset size as large as 1000 and with the computational demanding +% full multiplier bootstrapping approach for variance estimation. + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{1.0\textwidth} + % \includegraphics[scale = .275]{figure/vplot_t0_c3_Q50} + %\includegraphics[width = .95\textwidth]{figure/vplot_t0_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_t0_c3_Q50} + \caption{$t_0 = 0$} + \label{fig:sim1t0} + %} + \end{subfigure} +% \hill + \\[3ex] + \begin{subfigure}[b]{1.0\textwidth} + %\includegraphics[width = .95\textwidth]{figure/vplot_t1_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_t1_c3_Q50} + \caption{$t_0 = 1$} + \label{fig:sim1t1} + %} + \end{subfigure} + \caption{\label{fig:sim1}Comparison of the \code{smooth}, \code{nonsmooth} and \code{iterative} estimators with \code{se = "fmb"} + under 30\% censoring and $\tau = 0.5$.} +\end{figure*} + +When $t_0 = 0$, the targeted semiparametric quantile regression model for residual life +simplifies to the standard quantile regression model for survival time. +In such cases, existing functions like \code{crq()} from the \CRANpkg{quantreg} package \citep{quantregpackage} +can be employed. +A comparison between the performance of \code{crq()} and our proposed implementation +when $t_0 = 0$ is presented in the Supplementary Materials, +where the standard errors of the \code{crq()} are obtained from the bootstrap method with 200 bootstrap samples. +Overall, the performance of \code{crq()} is comparable to the proposed methods in terms of bias and standard errors. +However, we have occasionally encountered situations where the \code{crq()} function fails to converge, +particularly when the sample size is large, as in the case of $n = 1000$. 
+In the other extended simulation scenarios outlined in the Supplementary Materials, +which encompass various levels of $t_0$, censoring proportions, and $\tau$, +the proposed methods consistently exhibit satisfactory performance across all settings. + +% We further extended the simulation to include different levels of $t_0$, $\tau$, +% and censoring rate, and the results of this extended simulation can also be found in the +% Supplementary Materials. + +The true potential of the proposed smooth approach lies in its capability for +efficient variance estimation through the implementation of the partial multiplier bootstrapping approach. +This approach eliminates the need for repetitive solving of estimating equations, +resulting in improved computational efficiency in variance estimation. +To demonstrate its usefulness, we conducted a simulation using both the smooth approach +and the iterative approach with the partial multiplier bootstrapping approach (\code{se = "pmb"}). +This simulation was conducted under the settings of $\tau = 0.5$, $t_0 = 0$ and $1$, +and a 30\% censoring rate. +The \code{do\_pmb()} function was accordingly modified as follows. 
+ +\begin{example} + > do_pmb <- function(n, t0, cen, Q, nB) { + + dat <- data.gen(n, t0, cen, Q) + + fm <- Surv(Time, status) ~ X1 + X2 + X3 + X4 + X5 + + stamp <- NULL + + stamp[1] <- Sys.time() + + f1 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "smooth", se = "pmb") + + stamp[2] <- Sys.time() + + f2 <- qris(fm, data = dat, t0 = t0, Q = Q, nB = nB, method = "iterative", se = "pmb") + + stamp[3] <- Sys.time() + + list(smooth = c(f1$coef, f1$std), + + iter = c(f2$coef, f2$std), + + times = diff(stamp)) + + } + > B <- 200 + > set.seed(2) + > sims0_pmb <- mapply(function(n, t0) + + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(0, 0, 0), SIMPLIFY = F) + > sims1_pmb <- mapply(function(n, t0) + + replicate(B, do_pmb(n, t0 = t0, cen = .3, Q = .5, nB = 200)), + + n = c(200, 400, 1000), t0 = c(1, 1, 1), SIMPLIFY = F) +\end{example} +% \input{codes/simulation_pmb} + +The simulation results obtained using the partial multiplier bootstrapping approach are presented in Figure~\ref{fig:sim2} +and Tables 7 -- 12 in the Supplementary Materials, +while the computing times are displayed in the lower panel of Table~\ref{tab:time}. +Overall, the estimation results obtained using \code{se = "pmb"} in Figure~\ref{fig:sim2} +closely resemble those in Figure~\ref{fig:sim1} with \code{se = "fmb"}. +As seen in Tables 7 and 8, the ESEs from the non-iterative and iterative methods are comparable, +while the ASEs slightly overestimate the ESEs when the sample size is small. +The gaps are slightly smaller for the iterative method, +as shown in some cases \citep{johnson2009induced, kim2021comparison}. +The magnitudes of the differences are not large, and they also become smaller when the sample size reaches $n = 1000$. +More importantly, the computing times with \code{se = "pmb"} show significant speed improvements compared +to when \code{se = "fmb"} is used in every case; we observed up to 79\% timing improvements. 
+ +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{1.0\textwidth} + %\subfigure[$t_0 = 0$]{ + % \includegraphics[scale = .275]{figure/vplot_t0_c3_Q50} + %\includegraphics[width = 0.95\textwidth]{figure/vplot_pmb_t0_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_pmb_t0_c3_Q50} + \caption{$t_0 = 0$} + \label{fig:sim2t0} + %} + \end{subfigure} +% \hfill + \\[3ex] + \begin{subfigure}[b]{1\textwidth} +% \subfigure[$t_0 = 1$]{ + %\includegraphics[width = 0.95\textwidth]{figure/vplot_pmb_t1_c3_Q50} + \includegraphics[width = 0.95\textwidth]{vplot_pmb_t1_c3_Q50} + \caption{$t_0 = 1$} + \label{fig:sim2t1} + %} + \end{subfigure} + \caption{\label{fig:sim2} + Comparison of the \code{smooth} and \code{iterative} estimators with \code{se = "pmb"} + under 30\% censoring and $\tau = 0.5$.} +\end{figure*} + + + +\begin{table} + \caption{\label{tab:time} Runtimes (in seconds) when \code{se = fmb} and \code{se = pmb}.} + \centering + \begin{tabular}[t]{llrrrrrrrr} + \toprule + \multicolumn{2}{c}{} & \multicolumn{3}{c}{$t_0 = 0$} & \multicolumn{3}{c}{$t_0 = 1$} \\ + \cmidrule(l{3pt}r{3pt}){1-2}\cmidrule(l{3pt}r{3pt}){3-5} \cmidrule(l{3pt}r{3pt}){6-8} + se & method & 200 & 400 & 1000 & 200 & 400 & 1000\\ + \midrule + % \multirow{3}{*}{fmb} + \code{fmb} & Smooth & 0.103 & 0.174 & 0.471 & 0.106 & 0.178 & 0.480\\ + & Nonsmooth & 0.080 & 0.142 & 0.472 & 0.080 & 0.141 & 0.468\\ + & Iterative & 0.981 & 1.500 & 2.410 & 0.985 & 1.567 & 2.882\\ + [2ex] + \code{pmb} & Smooth & 0.022 & 0.052 & 0.223 & 0.022 & 0.053 & 0.224\\ + & Iterative & 0.296 & 0.580 & 1.407 & 0.296 & 0.581 & 1.435\\ + \bottomrule + \end{tabular} +\end{table} +% \input{codes/simTime} + +After confirming the satisfactory performance of the proposed methodologies, +we now proceed to illustrate the application of the \code{init} argument. +This argument controls the initial values assigned to the root-finding algorithm's estimates and +the plotting capacity of the \CRANpkg{qris} package. 
+For this illustrative example, we consider a simpler simulation scenario that involves a single binary covariate. +This simplified simulation can be generated using the revised version of the \code{data.gen()} function provided below. + +\begin{example} + > ## Global parameters + + rho0 <- .2 * sqrt(log(2)) + + rho1 <- .1 * sqrt(log(2)) + > data.gen <- function(n) { + + dat <- data.frame(censoring = runif(n, 0, 23.41), + + Time0 = sqrt(-log(1 - runif(n))), + + X = rbinom(n, 1, .5)) + + dat$Time0 <- ifelse(dat$X > 0, dat$Time0 / rho1, dat$Time0 / rho0) + + dat$Time <- pmin(dat$Time0, dat$censoring) + + dat$status <- 1 * (dat$Time0 < dat$censoring) + + subset(dat, select = c(Time, status, X)) + + } + > set.seed(10) + > head(dat <- data.gen(200)) + Time status X + 1 6.034713 1 1 + 2 7.181451 0 1 + 3 9.993908 0 1 + 4 16.225520 0 1 + 5 1.993033 0 1 + 6 5.277471 0 0 +\end{example} + +The updated \code{data.gen()} function returns a \code{data.frame} comprising three variables: +\code{Time}, \code{status}, and \code{X}, representing the +observed survival time, event indicator, and binary covariate, respectively. +We will first illustrate the usage of the argument \code{init} by considering three different initial values: +\code{init = "rq"}, \code{init = c(1,1)}, and a random vector \code{init = rnorm(2)}, +all used in conjunction with the smooth estimator \code{method = "smooth"}. +The following codes provide an example with different initial values. 
+\begin{example}
+ > (random <- rnorm(2))
+ [1] 1.5025446 0.5904095
+ > f1 <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, init = "rq", nB = 0)
+ > f2 <- update(f1, init = c(1, 1))
+ > f3 <- update(f1, init = random)
+ > all.equal(f1$coef, f2$coef)
+ [1] TRUE
+ > all.equal(f2$coef, f3$coef)
+ [1] TRUE
+\end{example}
+
+The \class{qris} object, with its \code{call} component,
+is compatible with the \code{update()} function,
+a built-in function commonly used for updating the
+attributes of an existing object without requiring redundant and repetitive code.
+In the example above, we used the \code{update()} function to modify the initial value specification in \code{f1}.
+We observed that different initial values yield identical point estimates, thereby affirming the robustness of the proposed method against fluctuations in initial values.
+
+The covariate effects, along with their associated 95\% point-wise confidence intervals across
+various quantiles or base times, can be visually assessed by applying the generic function
+\code{plot()} to a \class{qris} object.
+We demonstrate this feature using the following \code{qris} fit,
+where the standard errors are obtained using \code{se = "pmb"}, $t_0 = 1$,
+and all other parameters are set to their default values.
+We update the \code{qris} fit with extended quantiles over $\{0.4, 0.5, 0.6, 0.7\}$ and
+plot the covariate effects against these quantiles using the \code{plot()} function.
+\begin{example}
+ > fit <- qris(Surv(Time, status) ~ X, data = dat, t0 = 1, se = "pmb")
+ > fit2 <- qris.extend(fit, Qs = 4:7 / 10)
+\end{example}
+The extended \class{qris} fit generated by the \code{qris.extend()} function inherits
+all the attributes from the original \class{qris} object and
+includes an additional \code{ggdat} component.
+The following code compares the components of the returned values from the extended \class{qris} fit
+and the original \class{qris} fit.
+\begin{example} + > class(fit2) + [1] "qris" + > names(fit) + [1] "call" "coefficient" "data" "formula" "para" + [6] "stderr" "varNames" "vcov" + > setdiff(names(fit2), names(fit)) + [1] "ggdat" +\end{example} +Specifically, the extended \class{qris} fit inherits +\code{call}, \code{coefficient}, \code{para}, \code{stderr}, \code{varNames}, and \code{vcov} +from the original \class{qris} object. +The \code{call} component is the function call from the original \code{qris()} fit, +while \code{coefficient}, \code{stderr}, and \code{vcov} are used to store the point estimates, +standard error estimates, and covariance matrix, respectively. +The \code{para} component is a list containing the parameters specified during the +fitting of the quantile regression model, and \code{varNames} is a character string +representing the variable names in the function call. +The newly added values are \code{ggdat} and \code{gg}. +The \code{ggdat} is a data frame containing covariate information generated under +the different quantiles and base times specified in the \code{qris.extend()}. +Finally, the corresponding covariate effect plot can be generated by plotting the +extended \class{qris} fit as follows. +\begin{example} + > plot(fit2) +\end{example} + +The true values of $\beta$'s at different quantiles and base times, +computed from Equation~\eqref{eq:sim:weibull}, can be implemented in the following commands. 
+\begin{example} + > ## Global parameters + > r <- 2:1 * sqrt(log(2)) / 10 + > k <- 2 + > ## Function to calculate true beta + > trueB <- function(t0, tau) { + + b <- log(1 / r * ((r * t0) ^ k - log(1 - tau))^(1 / k) - t0) + + c(b[1], b[2] - b[1]) + + } + > ## True beta calculation + > true_Q <- c(t(sapply(4:7 / 10, trueB, t0 = 1))) + > true_t0 <- c(t(sapply(1:3, trueB, tau = .5))) +\end{example} +% \input{codes/trueB} + +The following code extends the \class{ggplot} objects generated by \code{plot.qris()} +by adding additional layers of true value curves and incorporating various \code{ggplot} options. +The resulting figures, Figure~\ref{fig:simulation_quantile} and Figure~\ref{fig:simulation_t0}, +present the output based on whether the covariate effects are plotted against quantiles or base times, respectively. +This observed trend aligns with the specifications described in Equation~\eqref{eq:sim:weibull}, +where increasing $\tau$ corresponds to an increasing $\beta_0$ while keeping $\rho$ and $X$ fixed. +On the other hand, the covariate effect does not change with quantiles but slightly increases with base times, +echoing the model specification where $\beta_0$ is inversely related to $t_0$ and $\beta_1$ +%is directly proportional to $t_0$. +increases as $t_0$ increases. 
+ +\begin{example} + > library(ggplot2) + > plot(fit2) + theme(legend.position = "bottom") + + + geom_line(aes(x = Qs, y = true_Q, col = variable, linetype = "True value")) + + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) + > b <- plot(fit2, t0s = 1:3, byQs = F) + > b + theme(legend.position = "bottom") + + + geom_line(aes(x = t0s, y = true_t0, col = variable, + + linetype = "True value")) + + + scale_linetype_manual(name = "", values = c("True value" = "dotdash")) +\end{example} +% \input{codes/plot1} + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Plot for $Q\in\{0.4, \ldots, 0.7\}$ at $t_0 = 1$.]{ + \includegraphics[width = 1.0\textwidth]{simulation_smooth_quantile.pdf} + \caption{Plot for $Q\in\{0.4, \ldots, 0.7\}$ at $t_0 = 1$} + \label{fig:simulation_quantile} +% } + \end{subfigure} + \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Plot for $t_0\in\{1, \ldots, 3\}$ at $Q = 0.5$.]{ + \includegraphics[width = 1.0\textwidth]{simulation_smooth_t0.pdf} + \caption{Plot for $t_0\in\{1, \ldots, 3\}$ at $Q = 0.5$} + \label{fig:simulation_t0} +% } + \end{subfigure} + \caption{(a) Estimated effects of covariate with the associated $95\%$ pointwise confidence intervals for quantiles ranging from 0.4 to 0.7 at $t_0=1$. Red and blue solid lines are the point estimates of regression parameters for intercept and covariate X, respectively. Similarly, red and blue dotted lines are the upper and lower bounds of $95\%$ pointwise confidence intervals for intercept and covariate X, respectively. + (b) Estimated effects of covariate with the associated $95\%$ pointwise confidence intervals for base times ranging from 1 to 3 at $\tau=0.5$. Red and blue solid lines are the point estimates of regression parameters for intercept and covariate X, respectively. 
Similarly, red and blue dotted lines are the upper and lower bounds of $95\%$ pointwise confidence intervals for intercept and covariate X, respectively.}
+ \label{fig:simulation}
+\end{figure*}
+
+
+\subsection{North Central Cancer Treatment Group Lung Cancer Data} \label{subsec:lung}
+
+The North Central Cancer Treatment Group Lung Cancer Data records the survival of patients with advanced lung cancer,
+along with assessments of the patients' performance status measured by both physicians and the patients themselves
+\citep{loprinzi1994prospective}.
+The original objective of the study was to ascertain whether descriptive information from a
+patient-completed questionnaire could offer prognostic insights.
+However, for this illustration, we focus on how gender and weight loss affect the quantiles of residual life
+for patients diagnosed with advanced lung cancer at different time points.
+The lung cancer data are publicly available from the \CRANpkg{survival} package \citep{survivalpackage} as \code{lung}.
+The following code displays the structure of the \code{lung} dataset with variables of interest.
+
+\begin{example}
+ > data(cancer, package = "survival")
+ > str(subset(lung, select = c(time, status, sex, wt.loss)))
+ 'data.frame': 228 obs. of 4 variables:
+ $ time : num 306 455 1010 210 883 ...
+ $ status : num 2 2 1 2 2 1 2 2 2 2 ...
+ $ sex : num 1 1 1 1 1 1 2 2 1 1 ...
+ $ wt.loss: num NA 15 15 11 0 0 10 1 16 34 ...
+\end{example}
+% \input{codes/cancer0}
+
+The \code{lung} data contains 228 patients whose observed survival times in days and
+censoring status (1 = censored, 2 = dead) are recorded in the \code{time} and the \code{status} columns,
+respectively.
+Although the censoring status in this dataset is not recorded in the typical 0-1 fashion, +the \code{Surv()} function is still applicable to create the corresponding ``\code{Surv}" object. +The \code{lung} data yields a censoring rate of $27.6\%$ with a median survival time of 310 days. +The covariates of interest are gender (\code{sex = 1} if male, \code{sex = 2} if female) and +weight loss (\code{wt.loss}). +In the following, we use the proposed semiparametric quantile regression models to assess +the gender and standardized weight loss effects on different quantiles of residual life at different base times. + + +We first model the median residual life (\code{Q = 0.5}) when the base time is one month (\code{t0 = 30}). +Since the estimated median survival times for combined lung cancers are typically less than one year, +with a range of 8 to 13 months \citep{siegel2021cancer}, +setting the base time at one month provides insight into how gender and weight loss impact the residual time +in early follow-up. +In the following, we obtain the regression coefficient estimates using the induced smoothing functions and +the corresponding variance estimate with the partial multiplier bootstrap approach. + +\begin{example} + > lung$male <- factor(lung$sex, 1:2, c("Male", "Female")) + > lung$std.wt.loss <- scale(lung$wt.loss) + > fit1 <- qris(Surv(time, status) ~ male + std.wt.loss, + + data = lung, t0 = 30, Q = .5, nB = 100, + + method = "smooth", se = "pmb") + > summary(fit1) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5611 0.0950 58.550 <2e-16 *** + maleFemale 0.4804 0.1805 2.661 0.0078 ** + std.wt.loss -0.0731 0.0837 -0.874 0.3824 + --- + Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer1} + +Subjects with missing values (in any of the variables relevant for the modeling task) +are automatically removed when \code{qris()} is called. +The estimated intercept implies that the median residual life for patients who have survived up to 30 days +is $\exp(5.5611) = 260.1$ days for a male with an average weight loss. +More interestingly, the summary shows that the gender effect is statistically significant at the 0.05 significance level, +indicating that a female patient is expected to have a median residual life at 30 days that is $\exp(0.4804) = 1.617$ +times that of a male patient with the same weight loss. +The effect of the weight loss is not statistically significant at the 0.05 level. +In addition to \code{summary()}, important statistics such as the coefficient and variance estimates can be extracted by +\code{S3} methods \code{coef()} and \code{vcov()}, respectively. + +\begin{example} + > coef(fit1) + (Intercept) maleFemale std.wt.loss + 5.56111984 0.48044228 -0.07307635 + > vcov(fit1) + (Intercept) maleFemale std.wt.loss + (Intercept) 0.009021459 -0.010944549 -0.003074041 + maleFemale -0.010944549 0.032594288 0.002847148 + std.wt.loss -0.003074041 0.002847148 0.006998314 +\end{example} +% \input{codes/s3fit1} +Moreover, the corresponding 95\% Wald-type confidence interval can be printed by applying +the \code{confint()} function to the \class{qris} object. +\begin{example} + > confint(fit1) + 2.5 % 97.5 % + (Intercept) 5.3749598 5.74727989 + maleFemale 0.1265926 0.83429199 + std.wt.loss -0.2370390 0.09088626 +\end{example} +% \input{codes/cifit1} + +The \code{update()} function can be conveniently applied to update existing \class{qris} objects. +The following examples update the \code{method} and \code{se} arguments from \code{fit1}. 
+The updated results yield similar coefficient estimates, but the non-smooth procedure (\code{method = "nonsmooth"}) +yields slightly greater standard error estimates. +\begin{example} + > summary(fit2 <- update(fit1, method = "nonsmooth", se = "fmb")) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "nonsmooth", + se = "fmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5585 0.1132 49.106 <2e-16 *** + maleFemale 0.4695 0.2015 2.331 0.0198 * + std.wt.loss -0.0668 0.1029 -0.650 0.5159 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer2} + +\begin{example} + > summary(update(fit1, method = "iterative")) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.5, nB = 100, method = "iterative", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.5605 0.1016 54.712 <2e-16 *** + maleFemale 0.4807 0.1626 2.957 0.0031 ** + std.wt.loss -0.0720 0.0903 -0.797 0.4252 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer3} + +At a lower (\code{Q = 0.25}) and a higher (\code{Q = 0.75}) quantiles, +the gender effect remains significant at the 0.05 significance level indicating +female patients are associated with longer lower-quantile and higher-quantile residual life +than male patients with the same weight loss. +Among these models, we observed that female patients tend to have higher coefficient estimates +when fitting higher-quantile residual life. +While the sign of the estimated regression coefficient for weight loss changes to a negative value +when considering the lower quantile, the effects remain statistically insignificant for +both the lower and higher quantiles. 
+ + +\begin{example} + > summary(update(fit1, Q = 0.25)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.25, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 4.9111 0.1034 47.480 <2e-16 *** + maleFemale 0.4651 0.2041 2.279 0.0227 * + std.wt.loss 0.0543 0.0584 0.930 0.3525 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer4} +\begin{example} + > summary(update(fit1, Q = 0.75)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 30, Q = 0.75, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 6.0748 0.1063 57.126 <2e-16 *** + maleFemale 0.5237 0.1487 3.522 0.0004 *** + std.wt.loss -0.0171 0.1166 -0.147 0.8835 + --- + Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer5} + +We also consider the base time at six months \code{t0 = 180}, +which enables us to assess gender and weight loss effects in median residual time at a moderate length of follow-up. +The estimated effect for the gender and weight loss increases as $t_0$ increases from $30$ days to $180$ days and +becomes significant at the 0.05 significant level. +Additionally, the effect of the weight loss seems to be associated with a shorter survival time after +$180$ days, with a $p$-value of $0.0008$. + +\begin{example} + > summary(update(fit1, t0 = 180)) + Call: + qris(formula = Surv(time, status) ~ male + std.wt.loss, + data = lung, t0 = 180, Q = 0.5, nB = 100, method = "smooth", + se = "pmb") + + qris Estimator + estimate std.Error z.value p.value + (Intercept) 5.2243 0.0912 57.255 <2e-16 *** + maleFemale 0.5821 0.1867 3.117 0.0018 ** + std.wt.loss -0.2515 0.0754 -3.337 0.0008 *** + --- + Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +\end{example} +% \input{codes/fitcancer6} + +The \class{qris} object is designed to be compatible with \code{S3} methods: \code{predict()} and +\code{residuals()} functions. +The following presents the fitted survival times for two hypothetical male and female patients with no weight loss, +as well as the first five residual values for the dataset. + \begin{example} + > lung.new <- data.frame(male = c("Male", "Female"), std.wt.loss = 0) + > predict(fit2, newdata = lung.new) + 1 2 + 444.9026 289.4422 + > head(residuals(fit2), 5) + 1 2 3 4 5 + -20.86127 -575.86127 232.44474 -416.82295 -555.82295 + \end{example} + +To better understand the covariate effects on different quantiles of residual time and across different base times, +we plot the estimated regression coefficients of the intercept, sex, and weight loss in \code{fit1} and \code{fit2}. +Figures~\ref{fig:realdata_smooth} and~\ref{fig:realdata_nonsmooth} display the estimated regression coefficients when +\code{method = "smooth"} and \code{method = "nonsmooth"}, respectively, at +different quantiles ranging from 0.2 and 0.5 at $t_0 = 30$ days. +The \code{plot.qris()} function is currently not available for the iterative estimator. This is mainly due to an extended computation time involved, as indicated by our simulation results, and the nature of plotting that necessitates computations across various quantiles or base times. +As expected, the two plots show very similar patterns. +We plot the estimated regression coefficients of the intercept, sex, and weight loss for different quantiles in the range +of 0.2 to 0.5 at $t_0= 50$, 60, 70, and 80 days (Figure~\ref{fig:realdata_multi_quantile}), +as well as for different base times in the range of 50 to 80 days at $\tau=0.2$, 0.3, 0.4, and 0.5 (Figure~\ref{fig:realdata_multi_basetime}). +The estimation method used is non-iterative induced smoothed estimation (\code{method = "smooth"}). 
+In Figure~\ref{fig:realdata_multi_quantile}, +the estimated intercept increases as the quantile increases (for a given base time). +The estimated slopes for sex remain largely the same, +but those for weight loss tend to decrease slightly across different quantiles (for a given base time). +These patterns remain consistent for different base times. +In Figure~\ref{fig:realdata_multi_basetime}, the estimated intercepts increase as the quantiles increase, +but with a given quantile, they remain flat across the different base times considered. +The estimated regression coefficients for the two covariates do not appear to change significantly +for different base times. + +\begin{example} + > hide <- theme(legend.position = "none") + > plot(fit1, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) + > plot(fit2, Qs = 2:5 / 10, byQs = TRUE, ggextra = hide) + > plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = TRUE, ggextra = hide) + > plot(fit1, Qs = 2:5 / 10, t0s = 5:8 * 10, byQs = FALSE, ggextra = hide) +\end{example} +% \input{codes/plotcancer1} + +\begin{figure*}[ht] + \centering + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[\code{method = ''smooth''} and \code{se = ''pmb''}]{ + \includegraphics[width = 1.0\textwidth]{realdata_smooth_quantile.pdf} + \caption{\code{method = ''smooth''} and \code{se = ''pmb''}} + \label{fig:realdata_smooth} +% } + \end{subfigure} +% \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[\code{method = ''nonsmooth''} and \code{se = ''fmb''}]{ + \includegraphics[width = 1.0\textwidth]{realdata_nonsmooth_quantile.pdf} + \caption{\code{method = ''nonsmooth''} and \code{se = ''fmb''}} + \label{fig:realdata_nonsmooth} +% } + \end{subfigure} + \\[2ex] + \begin{subfigure}[b]{0.47\linewidth} +%\subfigure[Multiple covariate effect plot against quantiles]{ + \includegraphics[width = 1.0\textwidth]{realdata_multi_quantile.pdf} + \caption{\code{method = ''smooth''} and \code{se = ''pmb''}} + \label{fig:realdata_multi_quantile} +% } + 
\end{subfigure} +% \hfill + \begin{subfigure}[b]{0.47\linewidth} +% \subfigure[Multiple covariate effect plot against base time]{ + \includegraphics[width = 1.0\textwidth]{realdata_multi_basetime.pdf} + \caption{Multiple covariate effect plot against base time} + \label{fig:realdata_multi_basetime} +% } + \end{subfigure} + \caption{Green, red and blue lines are the point estimates of regression parameters for + intercept, covariate sex and covariate weight loss, respectively. Solid line and dotted line are the point estimates and the upper and lower bounds of $95\%$ pointwise confidence intervals for each regression coefficient. + (a) \code{method = "smooth"} and \code{se = "pmb"} ($\tau = 0.2, 0.3, 0.4, 0.5, t_0=30$) + (b) \code{method = "nonsmooth"} and \code{se = "fmb"} ($\tau = 0.2, 0.3, 0.4, 0.5, t_0=30$) + (c) \code{method = "smooth"} and \code{se = "pmb"} against quantiles ($\tau = 0.2, 0.3, 0.4, 0.5, t_0 = 50, 60, 70, 80$) + (d) \code{method = "smooth"} and \code{se = "pmb"} against base times ($\tau = 0.2, 0.3, 0.4, 0.5, t_0 = 50, 60, 70, 80$)} + \label{fig:realdata} +\end{figure*} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Conclusion %%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Conclusion} \label{sec:conclusion} + +The purpose of the \CRANpkg{qris} package is to provide a comprehensive tool for fitting quantile regression models on residual life for right-censored survival data, with the aim of promoting widespread dissemination and utilization. +This package implements one estimation method based on non-smooth estimating functions and two estimation methods +based on their induced smoothed versions. +The non-smooth estimator is calculated through $L_{1}$-type minimization while incorporating the IPCW technique, +and its variance is calculated using full multiplier bootstrapping. 
+The first type of the induced smoothed estimator, a non-iterative version, directly solves estimating functions, +and its variance can be calculated using either the full multiplier bootstrapping or +the robust sandwich form with partial multiplier bootstrapping. +As evidenced by the simulation results, this enables one to substantially reduce computing times without sacrificing +estimation accuracy and stability compared to the original non-smooth function-based method. +The iterative smoothed estimator has an advantage in obtaining more precise estimates than its non-iterative version, +although it requires longer computing times. For all these methods, estimates of the regression coefficients and their +variances can be calculated at user-defined quantiles and base times, as long as they are identifiable. +Additionally, the package provides features for plotting estimates with associated 95\% confidence intervals against +quantiles and base times using the generic \code{plot} function. +These plots visualize patterns of estimates at different quantiles and base times, +helping users to easily grasp the overall picture. +The package \CRANpkg{qris} and its included functions are verified through illustrations using simulated data +with interpretation of the results demonstrated through a real data application. + + +Some possible directions for extending our package are as follows. +Efforts can be made to reduce the computational burden associated with variance estimation, +which currently accounts for a significant portion of the computing time. +In particular, the iterative-induced smoothed method employs the partial multiplier bootstrap method +to calculate variance estimates in each iteration. +Since this method requires multiple iterations, it is crucial to explore more computationally efficient variance +estimation procedures for each iteration to reduce the currently relatively longer computation time. 
+One approach is to utilize a closed-form estimation of the mid-part of the sandwich-type variance,
+as discussed in \citet{chiou2014fast, choi2018smoothed}.
+Implementing this direct variance estimation in each iteration is expected to further enhance computation efficiency.
+Another direction is to generalize the approaches to allow for the inclusion of sampling weights,
+which is useful for bias correction when failure time data are generated from non-random sampling designs,
+such as case-cohort designs \citep{prentice1986case, chiou2015semiparametric}.
+% To obtain valid parameter estimates under such study designs, the incorporation of sampling weights is a standard approach.
+The current estimating functions implemented in the \CRANpkg{qris} package assume that the data are randomly sampled,
+with sampling weights set to 1.
+% Incorporation of sampling weights that can accommodate unequal probabilities of being sampled in the \code{qris} function is a natural direction of future extension.
+To the best of our knowledge, there is a lack of model-checking procedures and model-comparison methods
+specifically designed for the non-smooth estimator,
+and a logical next step would be to develop these procedures for subsequent integration into the package.
+
+
+\bibliography{qris}
+
+\address{Kyu Hyun Kim\\
+ Department of Statistics and Data Science \emph{and} Department of Applied Statistics\\
+ Yonsei University\\
+ 50 Yonsei-ro, Seodaemun-gu, Seoul\\
+ Republic of Korea\\
+ \email{kyuhyunkim07@yonsei.ac.kr}}
+
+\address{Sangwook Kang\\
+ Department of Statistics and Data Science \emph{and} Department of Applied Statistics\\
+ Yonsei University\\
+ 50 Yonsei-ro, Seodaemun-gu, Seoul\\
+ Republic of Korea\\
+ \email{kanggi1@yonsei.ac.kr}}
+
+\address{Sy Han Chiou\\
+ Department of Statistics and Data Science\\
+ Southern Methodist University\\
+ P.O. 
Box 750332, Dallas, TX\\ USA\\ + \email{schiou@smu.edu}\\ + \url{https://www.sychiou.com/}} + +\end{article} + +\end{document} diff --git a/_articles/RJ-2024-007/realdata_multi_basetime.png b/_articles/RJ-2024-007/realdata_multi_basetime.png new file mode 100644 index 0000000000..29454659b7 Binary files /dev/null and b/_articles/RJ-2024-007/realdata_multi_basetime.png differ diff --git a/_articles/RJ-2024-007/realdata_multi_quantile.png b/_articles/RJ-2024-007/realdata_multi_quantile.png new file mode 100644 index 0000000000..e0e970de43 Binary files /dev/null and b/_articles/RJ-2024-007/realdata_multi_quantile.png differ diff --git a/_articles/RJ-2024-007/realdata_nonsmooth_quantile.png b/_articles/RJ-2024-007/realdata_nonsmooth_quantile.png new file mode 100644 index 0000000000..ca90326d92 Binary files /dev/null and b/_articles/RJ-2024-007/realdata_nonsmooth_quantile.png differ diff --git a/_articles/RJ-2024-007/realdata_smooth_quantile.png b/_articles/RJ-2024-007/realdata_smooth_quantile.png new file mode 100644 index 0000000000..f695c54ce5 Binary files /dev/null and b/_articles/RJ-2024-007/realdata_smooth_quantile.png differ diff --git a/_articles/RJ-2024-007/simulation_smooth_quantile.png b/_articles/RJ-2024-007/simulation_smooth_quantile.png new file mode 100644 index 0000000000..0be5bb1cc0 Binary files /dev/null and b/_articles/RJ-2024-007/simulation_smooth_quantile.png differ diff --git a/_articles/RJ-2024-007/simulation_smooth_t0.png b/_articles/RJ-2024-007/simulation_smooth_t0.png new file mode 100644 index 0000000000..2d9458e1b7 Binary files /dev/null and b/_articles/RJ-2024-007/simulation_smooth_t0.png differ diff --git a/_articles/RJ-2024-007/supp/2022-185_Supp.pdf b/_articles/RJ-2024-007/supp/2022-185_Supp.pdf new file mode 100644 index 0000000000..361c589921 Binary files /dev/null and b/_articles/RJ-2024-007/supp/2022-185_Supp.pdf differ diff --git a/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.pdf b/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.pdf new file 
mode 100644 index 0000000000..f7ddcc9ff3 Binary files /dev/null and b/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.png b/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.png new file mode 100644 index 0000000000..6637868c68 Binary files /dev/null and b/_articles/RJ-2024-007/vplot_pmb_t0_c3_Q50.png differ diff --git a/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.pdf b/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.pdf new file mode 100644 index 0000000000..cd8fed6efc Binary files /dev/null and b/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.png b/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.png new file mode 100644 index 0000000000..514fb642b8 Binary files /dev/null and b/_articles/RJ-2024-007/vplot_pmb_t1_c3_Q50.png differ diff --git a/_articles/RJ-2024-007/vplot_t0_c3_Q50.pdf b/_articles/RJ-2024-007/vplot_t0_c3_Q50.pdf new file mode 100644 index 0000000000..3e9167526d Binary files /dev/null and b/_articles/RJ-2024-007/vplot_t0_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/vplot_t0_c3_Q50.png b/_articles/RJ-2024-007/vplot_t0_c3_Q50.png new file mode 100644 index 0000000000..5e0e7f079e Binary files /dev/null and b/_articles/RJ-2024-007/vplot_t0_c3_Q50.png differ diff --git a/_articles/RJ-2024-007/vplot_t1_c3_Q50.pdf b/_articles/RJ-2024-007/vplot_t1_c3_Q50.pdf new file mode 100644 index 0000000000..c10c772950 Binary files /dev/null and b/_articles/RJ-2024-007/vplot_t1_c3_Q50.pdf differ diff --git a/_articles/RJ-2024-007/vplot_t1_c3_Q50.png b/_articles/RJ-2024-007/vplot_t1_c3_Q50.png new file mode 100644 index 0000000000..451753b1e4 Binary files /dev/null and b/_articles/RJ-2024-007/vplot_t1_c3_Q50.png differ diff --git a/_articles/RJ-2024-008/RJ-2024-008.R b/_articles/RJ-2024-008/RJ-2024-008.R new file mode 100644 index 0000000000..0e68d5796b --- /dev/null +++ b/_articles/RJ-2024-008/RJ-2024-008.R @@ -0,0 +1,226 @@ +# Generated by `rjournal_pdf_article()` 
using `knitr::purl()`: do not edit by hand +# Please edit RJ-2024-008.Rmd to modify this file + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = FALSE, + warning = FALSE, + message = FALSE +) + +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "figures/", + dev = "png", + dpi = 150, + fig.asp = 0.8, + fig.width = 8, + fig.height = 4, + out.width = "60%", + fig.align = "center" +) + +library(kableExtra) +library(nortsTest) +library(fGarch) +library(knitr) +library(forecast) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ar =c(0.5,0.2)), + rand.gen = rbeta,shape1 = 9,shape2 = 1) + +# Asymptotic Epps test +epps.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2))) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +epps_bootstrap.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)), + rand.gen = rgamma, rate = 3, shape = 6) +# Asymptotic Lobato & Velasco +lobato.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +lobato_bootstrap.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(3468) +library(fGarch) +spec = garchSpec(model = list(alpha = 0.2, beta = 0.3)) +x = ts(garchSim(spec, n = 300)) +rp.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34)) +# Default, Psaradakis and Vavra's procedure +vavra.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- 
+vavra.test(x, normality = "cvm", seed = 298) + + +## ----echo=TRUE---------------------------------------------------------------- +set.seed(23890) +x = arima.sim(250,model = list(ar = 0.2)) +y = arima.sim(250,model = list(ar = c(0.4,0,.1))) +elbouch.test(y = y,x = x) + + +## ----tab1-static, eval = knitr::is_latex_output(),warning = FALSE------------- +load("data/r_sim.Rdata") +phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") + +r1 = results1[,2:14] +colnames(r1) = c("phi", phi, phi) + +kable(r1, "latex", booktabs = TRUE,digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) + + +## ----tab1-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +# load("data/r_sim.Rdata") +# phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") +# +# r1 = results1[,2:14] +# colnames(r1) = c("phi", phi, phi) +# +# kable(r1, "html", booktabs = TRUE, digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. 
The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +# kable_styling(latex_options = c("hold_position", "scale_down"))%>% +# add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +# pack_rows("Lobato and Velasco", 1, 5) %>% +# pack_rows("Epps", 6, 10) %>% +# pack_rows("Random Projections", 11, 15) %>% +# pack_rows("Psaradakis and Vavra", 16, 20)%>% +# pack_rows("Bootstrap Lobato", 21, 25)%>% +# pack_rows("Bootstrap Epps", 26, 30)%>% +# pack_rows("El Bouch", 31, 35) + + +## ----tab2-static, eval = knitr::is_latex_output(),warning = FALSE------------- +r2 = results2[,2:14] +colnames(r2) = c("phi", phi, phi) + +kable(r2, "latex", booktabs = TRUE, digits = 3, caption = "Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) + + +## ----tab2-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +# r2 = results2[,2:14] +# colnames(r2) = c("phi", phi, phi) +# +# kable(r2, "html", booktabs = TRUE, digits = 3, caption = "Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +# kable_styling(latex_options = c("hold_position", "scale_down"))%>% +# add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +# pack_rows("Lobato and Velasco", 1, 5) %>% +# pack_rows("Epps", 6, 10) %>% +# pack_rows("Random Projections", 11, 15) %>% +# pack_rows("Psaradakis and Vavra", 16, 20)%>% +# pack_rows("Bootstrap Lobato", 21, 25)%>% +# pack_rows("Bootstrap Epps", 26, 30)%>% +# pack_rows("El Bouch", 31, 35) + + +## ----tab3-static, eval = knitr::is_latex_output(),warning = FALSE------------- +load("data/runtime.Rdata") + +kable(runtime, "latex", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") + + +## ----tab3-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +# load("data/runtime.Rdata") +# +# kable(runtime,"html", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") + + +## ----fig1-static, fig.cap = "Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. 
Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.", eval = knitr::is_latex_output(), fig.alt="(ref:demo-caption1)", out.width = "75%"---- +library(astsa) +g1 = autoplot(cardox, main = "CO2 levels at Mauna Loa", + xlab = "years", ylab = "CO2 (ppm)") +g2 = autoplot(forecast(ets(cardox), h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: CO2 Levels at Mauna Loa") +cowplot::plot_grid(g1,g2,ncol = 2) + + +## ----fig1-interactive, echo = knitr::is_html_output(), eval = knitr::is_html_output(),fig.cap="CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality."---- +# library(astsa) +# +# autoplot(cardox, main = "Carbon Dioxide levels at Mauna Loa", +# xlab = "years", ylab = "CO2 (ppm)") + + +## ----echo = TRUE-------------------------------------------------------------- +library(forecast) +library(astsa) +model = ets(cardox) +summary(model) + + +## ----echo = TRUE, eval = FALSE------------------------------------------------ +# check_residuals(model,unit_root = "adf",normality = "rp", +# plot = TRUE) + + +## ----echo = FALSE, eval = TRUE------------------------------------------------ +check_residuals(model,unit_root = "adf",normality = "rp", plot = FALSE) + + +## ----fig2-interactive, eval = knitr::is_html_output(), fig.cap = "Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. 
The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity."---- +# check_plot(model) + + +## ----fig2-static, fig.cap = "Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.", eval = knitr::is_latex_output(), fig.alt= "(ref:demo-caption2)", out.width = "100%"---- +check_plot(model) + + +## ----fig3-dynamic, echo = knitr::is_html_output(), eval = knitr::is_html_output(), fig.cap = "Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour."---- +# autoplot(forecast(model,h = 12),include = 100, +# xlab = "years",ylab = "CO2 (ppm)", +# main = "Forecast: Carbon Dioxide Levels at Mauna Loa") + + +## ----echo = knitr::is_latex_output(), eval = FALSE, fig.cap = "(ref:demo-caption3)"---- +# autoplot(forecast(model,h = 12),include = 100, +# xlab = "years",ylab = "CO2 (ppm)", +# main = "Forecast: Carbon Dioxide Levels at Mauna Loa") + + +## ----echo = TRUE, eval = FALSE------------------------------------------------ +# if (!requireNamespace("remotes")) install.packages("remotes") +# remotes::install_github("asael697/nortsTest",dependencies = TRUE) + diff --git a/_articles/RJ-2024-008/RJ-2024-008.Rmd b/_articles/RJ-2024-008/RJ-2024-008.Rmd new file mode 100644 index 0000000000..522c123b61 --- /dev/null +++ b/_articles/RJ-2024-008/RJ-2024-008.Rmd @@ -0,0 +1,686 @@ +--- +title: 'nortsTest: An R Package for Assessing Normality of Stationary Processes' +date: '2025-01-10' +abstract: | + Normality is the 
central assumption for analyzing dependent data in several time series models, and the literature has widely studied normality tests. However, the implementations of these tests are limited. The nortsTest package is dedicated to filling this void. The package performs the asymptotic and bootstrap versions of the tests of Epps and Lobato and Velasco and the tests of Psaradakis and Vavra, random projections and El Bouch for normality of stationary processes. These tests are for univariate stationary processes, except for the El Bouch test, which also allows bivariate stationary processes. In addition, the package offers visual diagnostics for checking stationarity and normality assumptions for the most used time series models in several R packages. This work aims to show the package's functionality, presenting each test performance with simulated examples and the package utility for model diagnostic in time series analysis.
+draft: no
+author:
+- name: Asael Alonzo Matamoros
+  affiliation: Aalto University
+  address:
+  - Department of Computer Science
+  - Espoo, Finland
+  url: https://asael697.github.io
+  email: izhar.alonzomatamoros@aalto.fi
+- name: Alicia Nieto-Reyes
+  affiliation: Universidad de Cantabria
+  address:
+  - Departamento de Matemáticas, Estadística y Computación
+  - Avd. de los Castros s/n.
39005 Santander, Spain + url: https://orcid.org/0000-0002-0268-3322 + email: alicia.nieto@unican.es +- name: Claudio Agostinelli + affiliation: University of Trento + address: + - Department of Mathematics + - Via Sommarive, 14 - 38123 Povo + url: https://orcid.org/0000-0001-6702-4312 + email: claudio.agostinelli@unitn.it +type: package +output: + rjtools::rjournal_article: + self_contained: yes + toc: no +bibliography: RJreferences.bib +date_received: '2022-10-21' +volume: 16 +issue: 1 +slug: RJ-2024-008 +journal: + lastpage: 156 + firstpage: 135 + +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = FALSE, + warning = FALSE, + message = FALSE +) + +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "figures/", + dev = "png", + dpi = 150, + fig.asp = 0.8, + fig.width = 8, + fig.height = 4, + out.width = "60%", + fig.align = "center" +) + +library(kableExtra) +library(nortsTest) +library(fGarch) +library(knitr) +library(forecast) +``` + +# Introduction + +Normality (*a set of observations sampled from a Gaussian process*) is an essential assumption in various statistical models. Therefore, developing procedures for testing this assumption is a topic that has gained popularity over several years. Most existing literature and implementation is dedicated to independent and identically distributed random variables [@Dagostino1987]; no results show that these tests are consistent when applied to stationary processes. For this context, several tests have been proposed over the years, but as far as we know, no `R` package or consistent implementation exists. + +The proposed \CRANpkg{nortsTest} package provides seven test implementations to check normality of stationary processes. This work aims to present a review of these tests and introduce the package functionality. Thus, its novelty lies in being the first package and paper dedicated to the implementation of normality tests for stationary processes. 
The implemented tests are: (i) the asymptotic *Epps* test, [@epps1987] and [@nietoreyes2014], based on the characteristic function and (ii) its sieve bootstrap approximation [@psaradakis2020normality], (iii) the corrected *Skewness-Kurtosis* (SK) test implemented by @Lobato2004 as an asymptotic test and (iv) by @psaradakis2020normality with a sieve bootstrap approximation, (v) the *random projections test* proposed by @nietoreyes2014, which makes use of the tests in (i) and (iii), (vi) the *Psadarakis and Vávra test* [@vavra2017] that uses a bootstrap approximation of the @anderson1952 test statistic for stationary linear processes and (vii) a normality test by @el2022normality for multivariate dependent samples. Tests (i) to (vi) are for univariate stationary processes. + + Furthermore, we propose the `check_residual()` function for checking time-series models' assumptions. This function returns a report for stationarity, seasonality, normality tests and visual diagnostics. `check_residual()` supports models from the most used packages for time-series analysis, such as the packages \CRANpkg{forecast} [@Rob2007] and \CRANpkg{aTSA} [@aTSA] and even functions in the base `R` [@R]; for instance, it supports the `HoltWinters` (stats `R` package) function for the Holt and Winters method [@Holt2004]. In addition, the proposed \CRANpkg{nortsTest} package has already been applied in the literature, see @Nieto-Reyes:2022-1 and @Nieto-Reyes:2022-2. + +Section 2 provides the theoretical background, including preliminary concepts and results. Section 3 introduces the normality tests for stationary processes, each subsection introducing a test framework and including examples of the tests functions with simulated data. Section 4 provides numerical experiments with simulated data and a real-world application: Subsection 4.1 reports a simulation study for the implemented normality tests and Subsection 4.2 the package's functionality for model checking in a real data application. 
The *carbon dioxide* data measured in the Mauna Loa Observatory [@astsa] is analyzed using a state space model from the \CRANpkg{forecast} package, evaluating the model's assumptions using our proposed `check_residuals()` function. Section 5 discusses the package functionality and provides our conclusions. Furthermore, we mention our future intended work on the package.
+
+# Preliminary concepts
+
+This section provides some theoretical aspects of stochastic processes that are a necessary theoretical framework for the following sections. @shumway2010 and @Ts2010 give more details of the following definitions and results below.
+
+For the purpose of this work, $T$ is a set of real values denoted as time, $T \subseteq \mathbb{R},$ for instance $T=\mathbb{N}$ or $T=\mathbb{Z},$ the naturals or integer numbers respectively. We denote by $X:=\{X_t\}_{t\in T}$ a \textit{stochastic process} with $X_t$ a real random variable for each $t\in T.$ Following this notation, a \textit{time-series} is just a finite collection of ordered observations of $X$ [@shumway2010]. An important measure for a stochastic process is its mean function $\mu(t) := E[X_t]$ for each $t \in T$, where $E[\cdot]$ denotes the usual expected value of a random variable. A generalization of this measure is the k-th order centered moment function $\mu_k(t) := E[(X_t -\mu(t))^k]$ for each $t \in T$ and $k > 1;$ with the process variance function being the second order centered moment, $\sigma^2(t) := \mu_2(t)$. Other important measures are the auto-covariance and auto-correlation functions, which measure the linear dependency between two different time points of a given process. For any $t,s \in T,$ they are, respectively,
+$$
+\gamma(t,s) := E[(X_t -\mu(t))(X_s - \mu(s))] \mbox{ and } \rho(t,s) := \dfrac{\gamma(t,s)}{\sqrt{\mu_2(t)}\sqrt{\mu_2(s)}}.
+$$ +Other widely used measure functions for the analysis of processes are the skewness and kurtosis functions, defined as $s(t) := \mu_3(t)/[\mu_2(t)]^{3/2}$ and $k(t) := \mu_4(t)/[\mu_2(t)]^2$ for each $t\in T,$ respectively. + +A generally used assumption for stochastic processes is stationarity. It has a key role in forecasting procedures of classic time-series modeling [@Ts2010] or as a principal assumption in de-noising methods for signal theory [@W2006]. + +#### Definition 1 +A stochastic process $X$ is said to be \emph{strictly stationary} if, for every collection $\tau = \{t_1,t_2,\ldots, t_k\} \subset T$ and $h > 0$, the joint distribution of $\{X_t\}_{t \in \tau}$ is identical to that of $\{X_{t+h}\}_{t \in \tau}.$ + +The previous definition is strong for applications. A milder version of it, which makes use of the process' first two moments, is weak stationarity. + +#### Definition 2 +A stochastic process $X$ is said to be \emph{weakly stationary} if its mean function is constant in time, $\mu(t) = \mu$, its auto-covariance function only depends on the difference between times, $\gamma(s,t) = \sigma|t-s|$ for a $\sigma\in \mathbb{R}$, and it has a finite variance function, $\mu_2(t) = \mu_2 < \infty$. + +For the rest of this work, the term *stationary* will be used to specify a weakly stationary process. A direct consequence of the stationarity assumption is that the previous measure functions get simplified. 
Thus, given a stationary stochastic process $X,$ its mean function, $k$-th order centered moment, for $k>1,$ and auto-covariance function are respectively,
+$$
+ \mu = E[X_{t_1}]\mbox{, } \mu_k = E[(X_{t_1} -\mu)^k] \mbox{ and } \gamma(h) = E[(X_{t_1+h}-\mu)(X_{t_1}-\mu)],
+$$
+which are independent of $t_1\in T.$
+
+Given a sample $x_1, \ldots, x_n,$ $n\in\mathbb{N},$ of equally spaced observations of $X,$ their corresponding estimators, sample mean, sample $k$-th order centered moment and sample auto-covariance, are respectively
+$$
+ \widehat{\mu} := n^{-1}\sum_{i=1}^nx_i\mbox{, } \widehat{\mu}_k := n^{-1}\sum_{i=1}^n(x_i - \widehat{\mu})^k \mbox{ and }\widehat{\gamma}(h) := n^{-1}\sum_{i = 1}^{n-h}(x_{i+h} - \widehat{\mu})(x_i - \widehat{\mu}).
+$$
+
+A particular case in which stationarity implies strict stationarity is a Gaussian process.
+
+#### Definition 3
+A stochastic process $X$ is said to be a *Gaussian process* if for every finite collection $\tau = \{t_1,t_2,\ldots, t_k\} \subset T,$ the joint distribution of $\{X_t\}_{t \in \tau}$ has a multivariate normal distribution.
+
+A series of mean zero uncorrelated random variables with finite constant variance is known as *white noise*. If additionally, it is formed of independent and identically distributed (i.i.d) normal random variables, it is known as *Gaussian white noise*; which is a particular case of stationary Gaussian process. For the rest of the work, $X_t \sim N(\mu,\sigma^2)$ denotes that the random variable $X_t$ is normally distributed with mean $\mu$ and variance $\sigma^2$ and $\chi^2(v)$ denotes the Chi square distribution with $v$ degrees of freedom.
+
+Other classes of stochastic processes can be defined using collections of white noise, for instance, the linear process.
+
+#### Definition 4
+Let $X$ be a stochastic process.
$X$ is said to be *linear* if it can be written as
+$$
+X_t = \mu + \sum_{i\in\mathbb{Z}}\phi_i\epsilon_{t-i},
+$$
+where $\{\epsilon_i\}_{i\in\mathbb{Z}}$ is a collection of white noise random variables and $\{\phi_i\}_{i\in\mathbb{Z}}$ is a set of real values such that $\sum_{i\in\mathbb{Z}} |\phi_i| < \infty.$
+
+An important class of processes is the *auto-regressive moving average* ($ARMA$). @Box1990 introduced it for time series analysis and forecast, becoming very well-known in the 90s and early 21st century.
+
+#### Definition 5
+For any non-negative integers $p,q,$ a stochastic process $X$ is an $ARMA(p,q)$ process if it is a stationary process and
+\begin{equation}
+ X_t = \sum_{i=0}^p \phi_iX_{t-i} +\sum_{i=0}^q \theta_i\epsilon_{t-i}, (\#eq:ARMA)
+\end{equation}
+where $\{\phi_i\}_{i=0}^p$ and $\{\theta_i\}_{i=0}^q$ are sequences of real values with $\phi_0= 0,$ $\phi_p\neq 0,$ $\theta_0=1$ and $\theta_q\neq 0$ and $\{\epsilon_{i}\}_{i\in\mathbb{Z}}$ is a collection of white noise random variables.
+
+Particular cases of $ARMA$ processes are those known as auto-regressive ($AR(p) := ARMA(p,0)$) and moving average ($MA(q) := ARMA(0,q)$) processes. Additionally, a \emph{random walk} is a non-stationary AR(1)
+process satisfying \@ref(eq:ARMA) with $p=1,$ $\phi_1 = 1$ and $q=0.$ Several properties of an $ARMA$ process can be extracted from its structure. For that, the $AR$ and $MA$ polynomials are introduced
+$$
+ AR:\text{ } \phi(z) = 1-\sum_{i=0}^p \phi_i z^i \text{ and } MA:\text{ } \theta(z) = \sum_{i=0}^q \theta_i z^i,
+$$
+where $z$ is a complex number and, as before, $\phi_0 = 0,$ $\phi_p\neq 0,$ $\theta_0= 1$ and $\theta_q\neq 0.$ Conditions for stationarity, order selection and process behavior are properties studied from these two polynomials.
+ +For modeling volatility in financial data, @Bollerslev1986 proposed the *generalized auto-regressive conditional heteroscedastic* (GARCH) class of processes as a generalization of the *auto-regressive conditional heteroscedastic* (ARCH) processes [@engle1982]. + +#### Definition 6 +For any $p,q \in \mathbb{N}$, a stochastic process $X$ is a $GARCH(p,q)$ process if it satisfies $X_t = \mu + \sigma_{t}\epsilon_t$ with +$$ +\sigma_t^2 = \alpha_0 +\sum_{i=1}^p\alpha_i \epsilon_{t-i}^2 +\sum_{i=1}^q \beta_{i}\sigma^2_{t-i}. +$$ +$\mu$ is the process mean, $\sigma_0$ is a positive constant value, $\{\alpha_i\}_{i=1}^p$ and $\{\beta_i\}_{i=1}^q$ are non-negative sequences of real values and $\{\epsilon_{t}\}_{t \in T}$ is a collection of i.i.d. random variables. + +A more general class of processes are the *state-space models* ($SSMs$), which have gained popularity over the years because they do not impose on the process common restrictions such as linearity or stationarity and are flexible in incorporating the process different characteristics [@OBrien2010]. They are widely used for smoothing [@west2006] and forecasting [@Rob2007] in time series analysis. The main idea is to model the process dependency with two equations: the *state equation*, which models how parameters change over time, and the *innovation equation*, which models the process in terms of the parameters. Some particular SSMs that analyze the level, trend and seasonal components of the process are known as *error, trend, and seasonal* (ETS) models. There are over 32 different variations of ETS models [@Hyndman2008]. One of them is the *multiplicative error, additive trend-seasonality* $(ETS(M,A,A))$ model. 
+ +#### Definition 7 +A SSM process $X$ follows an ETS(M,A,A) model, if the process accepts +$$ +X_t = [L_{t-1} +T_{t-1} + S_{t-1}](1 + \epsilon_t) +$$ +as innovation equation and +\begin{eqnarray*}L_t &= &L_{t-1} +T_{t-1} +\alpha (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + T_t &= &T_{t-1} + \beta (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + S_t &= &S_{t-m} + \gamma (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t, +\end{eqnarray*} +as state equations. +$\alpha, \beta,\gamma \in [0,1]$, $m\in\mathbb{N}$ denotes the period of the series and $\{\epsilon_t\}$ are i.i.d normal random variables. For each $t\in\mathbb{Z},$ $L_t$, $T_t$ and $S_t$ represent respectively the level, trend and seasonal components. + +# Normality tests for stationary processes + +Extensive literature exists on goodness of fit tests for normality under the assumption of independent and identically distributed random variables, including, among others, Pearson's chi-squared test [@Pearson1895], Kolmogorov-Smirnov test [@Smirnov1948], Anderson-Darling test [@anderson1952], SK test [@jarque1980] and Shapiro-Wilk test, [@SWtest1965] and [@Royston1982]. These procedures have been widely used in many studies and applications, see @Dagostino1987 for further details. There are no results, however, showing that the above tests are consistent in the context of stationary processes, in which case the independence assumption is violated. For instance, @Gasser1975 provides a simulation study where Pearson's chi-squared test has an excessive rejection rate under the null hypothesis for dependent data. For this matter, several tests for stationary processes have been proposed over the years. A selection of which we reference here. @epps1987 provides a test based on the characteristic function, @Hinich1982 proposes a similar test based on the process' spectral density function [@Berg2010, for further insight]. 
@Gasser1975 gives a correction of the SK test, with several modifications made in @Lobato2004, @bai2005 and @MarianZach2017, which are popular in many financial applications. @Meddahi2005 constructs a test based on Stein's characterization of a Gaussian distribution. Using the random projection method [@Cuesta2007], @nietoreyes2014 build a test that upgrades the performance of @epps1987 and @Lobato2004 procedures. Furthermore, @vavra2017 adapts the @anderson1952 statistic for stationary linear processes approximating its sample distribution with a sieve bootstrap procedure. + +Despite the existing literature, consistent implementations of goodness of fit tests for normality of stationary processes in programming languages such as `R` or `Python` are limited. This is not the case for normality of independent data: the \CRANpkg{nortest} package [@nortest2015] implements tests such as Lilliefors [@Wilkinson1986], Shapiro-Francia [@Royston1993], Pearson's chi-squared, Cramér-von Mises [@vonMisses1962] and Anderson-Darling. For a multivariate counterpart, the \CRANpkg{mvnTest} package [@mvntest] implements the multivariate Shapiro-Wilk, Anderson-Darling, Cramér-von Mises, Royston [@Royston1992], Doornik and Hansen [@DH2008], Henze and Zirkler [@HZ1990] and the multivariate Chi square test [@S2_2016]. For the case of dependent data, we present here the \CRANpkg{nortsTest} package. Type within `R` `install.packages("nortsTest", dependencies = TRUE)` to install its latest released version from `CRAN`. \CRANpkg{nortsTest} performs the tests proposed in @epps1987, @Lobato2004, @psaradakis2020normality, @nietoreyes2014, @vavra2017 and @el2022normality. + +Additionally, the package offers visualization functions for descriptive time series analysis and several diagnostic methods for checking stationarity and normality assumptions for the most used time series models of several `R` packages.
To elaborate on this, Subsection 3.1 introduces the package functionality and software and Subsection 3.2 provides an overview of tests for checking stationarity and seasonality. Finally, Subsections 3.3-3.5 present a general framework of each of the implemented normality tests and their functionality by providing simulated data examples. + +## Software + +The package works as an extension of the \CRANpkg{nortest} package [@nortest2015], which performs normality tests in random samples but for independent data. The building block functions of the \CRANpkg{nortsTest} package are: + + + `epps.test()`, function that implements the test of Epps, + + + `epps_bootstrap.test()`, function that implements a bootstrap approximation of the test of Epps, + + + `lobato.test()`, function that implements the asymptotic test of Lobato and Velasco, + + + `lobato_bootstrap.test()`, function that implements a bootstrap approximation of the test of Lobato and Velasco, + + + `rp.test()`, function that implements the random projection test of Nieto-Reyes, Cuesta-Albertos and Gamboa, + + + `vavra.test()`, function that implements the test of Psaradakis and Vavra, and + + + `elbouch.test()`, function that implements the test of El Bouch, Michel and Comon. + +Each of these functions accepts a `numeric` (*numeric*) or `ts` (*time series*) class object for storing data, and returns an `htest` (*hypothesis test*) class object with the main results for the test. To guarantee the accuracy of the results, each test performs unit root tests for checking stationarity and seasonality (see Subsection 3.2) and displays a warning message if any of them is not satisfied.
+ +For visual diagnostics, the package offers different plot functions based on the \CRANpkg{ggplot2} package [@ggplot2]: the `autoplot()` function plots `numeric`, `ts` and `mts` (*multivariate time series*) classes while the `gghist()` and `ggnorm()` functions are for plotting histogram and qq-plots respectively; and on the \CRANpkg{forecast} package [@Rob2007]: `ggacf()` and `ggPacf()` for the display of the auto-correlation and partial auto-correlations functions respectively. + +Furthermore, inspired by the function `checkresiduals()` of the \CRANpkg{forecast} package, we provide the `check_residuals()` function to test the model assumptions using the estimated residuals. The upgrade of our proposal is that, besides providing plots for visual diagnosis (setting the `plot` option as `TRUE`), it does check stationarity, seasonality (*Subsection 3.2*) and normality, presenting a report of the used tests and conclusions for assessing the model's assumptions. An illustration of these functions is provided in Subsection 4.2, where we show the details of the functions and their utility for assumptions commonly checked in time series modeling. + +## Tests for stationarity + +For checking stationarity, the \CRANpkg{nortsTest} package uses \textit{unit root} and \textit{seasonal unit root} tests. These tests work similarly, checking whether a specific process follows a random walk model, which clearly is a non-stationary process. + +### Unit root tests + +A linear stochastic process $X$ that follows a random walk model is non stationary. Its AR polynomial is $\phi(z) = 1 - z$, whose solution (root) is unique and equal to one. Thus, it is common to test the non stationarity of a linear process by checking whether its AR polynomial has a unit root (a root equal to one). + +The most commonly used tests for unit root testing are *Augmented Dickey-Fuller* [@dickey1984], *Phillips-Perron* [@Perron1988], *KPSS* [@KppsI1992] and \textit{Ljung-Box} [@Box].
In particular, the *Ljung-Box* test contrasts the null auto-correlation hypothesis of identically distributed Gaussian random variables, which is equivalent to testing stationarity. The `uroot.test()` and `check_residuals()` functions perform these tests, making use of the \CRANpkg{tseries} package [@tseries]. + +### Seasonal unit root tests + +Let $X$ be a stationary process and $m$ its period. Note that for observed data, $m$ generally corresponds to the number of observations per unit of time. $X$ follows a seasonal random walk if it can be written as +$$ + X_t = X_{t-m} + \epsilon_t, +$$ +where $\epsilon_t$ is a collection of i.i.d random variables. In a similar way, the process $X$ is non-stationary if it follows a seasonal random walk. Or equivalently, $X$ is non stationary if the seasonal AR(1) polynomial ($\phi_m(z) = 1 - \phi z^m$) has a unit root. The `seasonal.test()` and `check_residuals()` functions perform the *OCSB test* [@ocsb1988] from the \CRANpkg{forecast} package and the *HEGY* [@Hegy1993] and *CH* [@ch1995] tests from the \CRANpkg{uroot} package [@uroot]. + +## Tests of Epps + +The $\chi^2$ test for normality proposed by @epps1987 compares the empirical characteristic function of the one-dimensional marginal of the process with the one of a normally distributed random variable evaluated at certain points on the real line. Several authors, including @Lobato2004, @vavra2017 and @el2022normality, point out that the greatest challenge in the Epps' test is its implementation procedure, which we address with the \CRANpkg{nortsTest} package. Other existing tests based on the empirical characteristic function of the one-dimensional marginal of the process include @hong1999hypothesis and the references therein. This test differs, however, in that it uses spectral analysis and derivatives. + +Furthermore, @meintanis2016review reviews testing procedures based on the empirical characteristic function.
There, it is commented about the random projection test [@nietoreyes2014, and here below] as a recent development of Epps' test. In fact, in @nietoreyes2014 the consistency of Epps test is improved by taking at random the elements at which the characteristic function is evaluated. Additionally, @el2022normality proposes a sieve bootstrap modification of the Epps' test. In addition to the classical asymptotic Epps' test, we include these last two approaches here, and in the package, see the Example below and the paragraph before it. Let us provide now the foundation behind the Epps' tests. + +Let $X$ be a stationary stochastic process that satisfies +\begin{equation} + \sum_{t=-\infty}^{\infty}|t|^k|\gamma(t)| <\infty \mbox{ for some } k >0. (\#eq:a) +\end{equation} +The null hypothesis is that the one-dimensional marginal distribution of $X$ is a Gaussian process. The procedure for constructing the test consists of defining a function $g$, estimating its inverse spectral matrix function, minimizing the generated quadratic function in terms of the unknown parameters of the random variable and, finally, obtaining the test statistic, which converges in distribution to a $\chi^2.$ + +Given $N \in\mathbb{N}$ with $N \geq 2,$ let +$$ +\Lambda :=\{\lambda:=(\lambda_1, \ldots, \lambda_N) \in \mathbb{R}^N: \lambda_i \leq \lambda_{i+1} \text{ and } \lambda_i > 0, \text{ for } i = 1,2,\ldots, N \}, +$$ +and $g:\mathbb{R}\times \Lambda \rightarrow \mathbb{R}^n$ be a measurable function, where +$$ + g(x,\lambda):= [\cos(\lambda_1x),\sin(\lambda_1x),\ldots,\cos(\lambda_Nx),\sin(\lambda_Nx)]. 
+$$ +Additionally, let $g_\theta:\Lambda \rightarrow \mathbb{R}^N$ be a function defined by +$$ + g_\theta(\lambda) := \left[\mbox{Re}(\Phi_\theta(\lambda_1)),\mbox{Im}(\Phi_\theta(\lambda_1)),\ldots,\mbox{Re}(\Phi_\theta(\lambda_N)),\mbox{Im}(\Phi_\theta(\lambda_N)) \right]^t, +$$ +where the $\mbox{Re}(\cdot)$ and $\mbox{Im}(\cdot)$ are the real and imaginary components of a complex number and $\Phi_\theta$ is the characteristic function of a normal random variable with parameters $\theta := (\mu,\sigma^2)\in \Theta,$ an open bounded set contained in $\mathbb{R}\times \mathbb{R}^+$. For any $\lambda\in\Lambda,$ let us also denote +$$ + \widehat{g}(\lambda) := \dfrac{1}{n}\sum_{t=1}^n [\cos(\lambda_1 x_t),\sin(\lambda_1x_t),\ldots,\cos(\lambda_N x_t),\sin(\lambda_N x_t)]^t. +$$ +Let $f(v;\theta,\lambda)$ be the spectral density matrix of $\{g(X_t,\lambda)\}_{t \in\mathbb{Z}}$ at a frequency $v.$ +Then, for $v = 0$, it can be estimated by +$$ + \widehat{f}(0;\theta,\lambda) := \dfrac{1}{2\pi n}\left(\sum_{t=1}^n \widehat{G}(x_{t,0},\lambda) +2\sum_{i=1}^{\lfloor n^{2/5}\rfloor}(1 -i/\lfloor n^{2/5} \rfloor)\sum_{t=1}^{n-i}\widehat{G}(x_{t,i},\lambda) \right), +$$ +where $\widehat{G}(x_{t,i},\lambda) = (\widehat{g}(\lambda) -g(x_{t},\lambda))(\widehat{g}(\lambda) -g(x_{t+i},\lambda))^t$ and $\lfloor \cdot \rfloor$ denotes the floor function. The test statistic general form under $H_0$ is +$$ + Q_n(\lambda) := \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +$$ +with +$$ + Q_n(\theta,\lambda):=(\widehat{g}(\lambda)-g_\theta(\lambda))^tG_n^+(\lambda)(\widehat{g}(\lambda)-g_\theta(\lambda)), +$$ +where $G^{+}_n$ is the generalized inverse of the spectral density matrix $2 \pi \widehat{f}(0;\theta,\lambda)$. 
Let +$$ + \widehat{\theta} := \arg \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +$$ +be the argument that minimizes $Q_n(\theta,\lambda)$ such that $\widehat{\theta}$ is in a neighborhood of $\widehat{\theta}_n := (\widehat{\mu},\widehat{\gamma}(0))$. To guarantee its existence and uniqueness, the following assumptions are required. We refer to them as assumption $(A.)$. + +$(A.)$ Let $\theta_0$ be the true value of $\theta$ under $H_0$, then for every $\lambda \in \Lambda$ the following conditions are satisfied. + + + $f(0;\theta,\lambda)$ is positive definite. + + + $\Phi_\theta(\lambda)$ is twice differentiable with respect to $\theta$ in a neighborhood of $\theta_0$. + + + The matrix $D(\theta_0,\lambda) = \dfrac{\partial \Phi_\theta(\lambda)}{\partial\theta |_{\theta = \theta_0}} \in \mathbb{R}^{N\times 2}$ has rank 2. + + + The set $\Theta_0(\lambda) := \{ \theta \in \Theta: \Phi_\theta(\lambda_i) = \Phi_{\theta_0}(\lambda_i), i=1, \ldots,N\}$ is a finite bounded set in $\Theta$, and $\Theta$ is a bounded subset of $\mathbb{R}\times \mathbb{R}^+$. + + + $f(0;\theta,\lambda) = f(0;\theta_0,\lambda)$ and $D(\theta_0,\lambda) = D(\theta,\lambda)$ for all $\theta \in \Theta_0(\lambda)$. + +Under these assumptions, Epps' main result is presented as follows. + +#### Theorem 1 [@epps1987, Theorem 2.1] +Let $X$ be a stationary Gaussian process such that \@ref(eq:a) and $(A.)$ are satisfied, then $nQ_n(\lambda)\to_d \chi^2(2N - 2)$ for every $\lambda \in \Lambda$. + +The current \CRANpkg{nortsTest} version uses $\Lambda := \{\verb|lambda|/\widehat{\gamma}(0)\}$ as the values to evaluate the empirical characteristic function, where $\widehat{\gamma}(0)$ is the sample variance. By default `lambda = c(1, 2)`. Therefore, the implemented test statistic converges to a $\chi^2$ distribution with two degrees of freedom.
The user can change these $\Lambda$ values as desired by simply specifying the function's `lambda` argument, as we show in the Example below. + +#### Example 1 +A stationary $AR(2)$ process is drawn using a beta distribution with `shape1 = 9` and `shape2 = 1` parameters, and the test of Epps, `epps.test()`, is performed. At significance level $\alpha = 0.05$, the null hypothesis of normality is correctly rejected. + +```{r, echo = TRUE} +set.seed(298) +x = arima.sim(250,model = list(ar =c(0.5,0.2)), + rand.gen = rbeta,shape1 = 9,shape2 = 1) + +# Asymptotic Epps test +epps.test(x) +``` + +Asymptotic Epps test with random Lambda values as proposed in @nietoreyes2014. + +```{r, echo = TRUE} +set.seed(298) +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2))) +``` + +Approximated sieve bootstrap Epps test using 1000 repetitions of 250 units. + +```{r, echo = TRUE} +set.seed(298) +epps_bootstrap.test(x, seed = 298) +``` + +## Tests of Lobato and Velasco + +@Lobato2004 provides a consistent estimator for the corrected SK test statistic for stationary processes, see @Lomincki1961 and @Gasser1975 for further insight. Note that the SK test is also known as the Jarque-Bera test [@jarque1980], which is already available in several R packages [@tseries, for instance]. The improvement of this proposal over those implementations is a correction in the skewness and kurtosis estimates by the process' auto-covariance function, resulting in a consistent test statistic under the assumption of correlated data. The test in @Lobato2004 is asymptotic, which is computationally efficient, as opposed to a bootstrap based test. @psaradakis2020normality show that the bootstrap modification of the Lobato and Velasco test is a fair competitor against the original asymptotic test, beating other tests for normality of the one-dimensional marginal distribution in terms of power.
Thus, the package incorporates both the asymptotic, `lobato.test()` and its bootstrap version `lobato_bootstrap.test()`. + +The general framework for the test is presented in what follows. In contrast to the test of Epps, this proposal does not require additional parameters for the computation of the test sample statistic. + +Let $X$ be a stationary stochastic process that satisfies + +\begin{equation} + \sum_{t=0}^{\infty}|\gamma(t)| <\infty. (\#eq:aLV) +\end{equation} + +The null hypothesis is that the one-dimensional marginal distribution of $X$ is normally distributed, that is +$$ +H_0: X_t \sim N(\mu,\sigma^2) \text{ for all } t \in \mathbb{Z}. +$$ +Let $k_q(j_1,j_2,\ldots,j_{q-1})$ be the q-th order cumulant of $X_{1},X_{1+j_1},\ldots,X_{1+j_{q-1}}$. $H_0$ is fulfilled if all the marginal cumulants above the second order are zero. In practice, it is tested just for the third and fourth order marginal cumulants. Equivalently, in terms of moments, the marginal distribution is normal by testing whether $\mu_3 = 0$ and $\mu_4 = 3 \mu_2^2$. For non-correlated data, the SK test compares the SK statistic against upper critical values from a $\chi^2(2)$ distribution [@bai2005]. For a Gaussian process $X$ satisfying \@ref(eq:aLV), the following limiting result holds +$$ + \sqrt{n} \binom{\widehat{\mu}_3}{\widehat{\mu}_4 -3\widehat{\mu}^2_2} \to_d N(0_2,\Sigma_F), +$$ +where $0_2 := (0,0)^t \in \mathbb{R}^2$ and $\Sigma_F := \mbox{diag}(6F^{(3)}, \text{ } 24F^{(4)}) \in \mathbb{R}^{2\times 2}$ is a diagonal matrix with $F^{(k)} := \sum_{j = -\infty}^{\infty}\gamma(j)^k$ for $k=3,4$ [@Gasser1975].
+ +The following consistent estimator in terms of the auto-covariance function is proposed in @Lobato2004 +$$ + \widehat{F}^{(k)} := \sum_{t = 1-n}^{n-1}\widehat{\gamma}(t)[\widehat{\gamma}(t) +\widehat{\gamma}(n-|t|)]^{k-1}, +$$ +to build a *generalized SK test* statistic +$$ + G := \dfrac{n \widehat{\mu}_3^2}{6 \widehat{F}^{(3)}} + \dfrac{n(\widehat{\mu}_4 -3\widehat{\mu}_2^2)^2}{24\widehat{F}^{(4)}}. +$$ +Similar to the SK test for non-correlated data, the $G$ statistic is compared against upper critical values from a $\chi^2(2)$ distribution. This is seen in the result below that establishes the asymptotic properties of the test statistics, so that the general test procedure can be constructed. The result requires the following assumptions, denoted by $(B.),$ for the process $X.$ + +(B.) + + + $E[X_t^{16}] < \infty$ for $t \in T.$ + + + $\sum_{j_1 = -\infty}^{\infty}\cdots \sum_{j_{q-1} = -\infty}^{\infty} |k_q(j_1,\ldots,j_{q-1})| < \infty \text{ for } q=2,3,\ldots,16.$ + + + $\sum_{j=1}^{\infty}\left(E \left[\text{ } E[(X_0-\mu)^k|B_j] -\mu_k\right]^2 \right)^{1/2} < \infty \text{ for } k = 3,4,$ where $B_j$ denotes the $\sigma$-field generated by $X_t$, $t \leq -j.$ + + + $E\left[Z_k \right]^2 +2\sum_{j=1}^{\infty}E\left(\left[Z_k \right] \left[ (X_j -\mu)^k -\mu_k \right] \right) > 0$ for $k = 3,4,$ with $Z_k=(X_0 -\mu)^k -\mu_k.$ + +Note that these assumptions imply that the higher-order spectral densities up to order 16 are continuous and bounded. + +#### Theorem 2 [@Lobato2004, Theorem 1] +Let $X$ be a stationary process. If $X$ is Gaussian and satisfies \@ref(eq:aLV) then $G \to_d \chi^2(2)$, and under assumption (B.), the test statistic G diverges whenever $\mu_3 \neq 0$ or $\mu_4 \neq 3\mu_2^2.$ + +#### Example 2 +A stationary $MA(3)$ process is drawn using a gamma distribution with `rate = 3` and `shape = 6` parameters. The `lobato.test()` function applies the test of *Lobato and Velasco* to the simulated data.
At significance level $\alpha = 0.05$, the null hypothesis of normality is correctly rejected. + +```{r, echo = TRUE} +set.seed(298) +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)), + rand.gen = rgamma, rate = 3, shape = 6) +# Asymptotic Lobato & Velasco +lobato.test(x) +``` + +Approximated sieve bootstrap Lobato and Velasco test using 1000 repetitions of 250 units. + +```{r, echo = TRUE} +lobato_bootstrap.test(x, seed = 298) +``` + +## The Random Projections test + +The previous proposals only test for the normality of the one-dimensional marginal distribution of the process, which is inconsistent against alternatives whose one-dimensional marginal is Gaussian. @nietoreyes2014 provides a procedure to fully test normality of a stationary process using a Cramér-Wold type result [@Cuesta2007] that uses random projections to differentiate among distributions. In @nietoreyes2014 existing tests for the normality of the one dimensional marginal are applied to the random projections and the resulting p-values are combined using the false discovery rate for dependent data [@Benjamin2001]. The \CRANpkg{nortsTest} package improves on this test by allowing the use of the less conservative false discovery rate in @Benjamin1995. + +We show the Cramér-Wold type result below. The result works for separable Hilbert spaces, however here, for its later application, we restrict it to $l^2,$ the space of square summable sequences over $\mathbb{N},$ with inner product $\langle \cdot,\cdot \rangle.$ + +#### Theorem 3 [@Cuesta2007, Theorem 3.6] +Let $\eta$ be a dissipative distribution on $l^2$ and $Z$ an $l^2$-valued random element, then $Z$ is Gaussian if and only if +$$ + \eta\{h \in l^2: \langle Z,h \rangle \text{ has a Gaussian distribution}\} > 0. +$$ +A dissipative distribution [@nietoreyes2014, Definition 2.1] is a generalization of the concept of absolutely continuous distribution to the infinite-dimensional space.
A Dirichlet process [@gelman2013] produces random elements with a dissipative distribution in $l^2$. In practice, generate draws of $h \in l^2$ with a stick-breaking process that makes use of beta distributions. + +Let $X = \{X_t\}_{t\in\mathbb{Z}}$ be a stationary process. As $X$ is normally distributed if the process $X^{(t)} := \{X_k\}_{k \leq t}$ is Gaussian for each $t\in\mathbb{Z},$ using the result above, @nietoreyes2014 provides a procedure for testing that $X$ is a Gaussian process by testing whether the process $Y^h = \{Y^h_t\}_{t \in \mathbb{Z}}$ is Gaussian. +\begin{equation} + Y^h_t := \sum_{i=0}^\infty h_i X_{t-i} = \langle X^{ (t) },h \rangle, (\#eq:proj) +\end{equation} +where $\langle X^{(t)},h \rangle$ is a real random variable for each $t \in \mathbb{Z}$ and $h\in l^2$. Thus, $Y^h$ is a stationary process constructed by the projection of $X^{(t)}$ on the space generated by $h.$ Therefore, $X$ is a Gaussian process if and only if the one dimensional marginal distribution of $Y^{h}$ is normally distributed. Additionally, the hypothesis of the tests *Lobato and Velasco* or *Epps*, such as \@ref(eq:a), \@ref(eq:aLV), $(A)$ and $(B)$, imposed on $X$ are inherited by $Y^h$. Then, those tests can be applied to evaluate the normality of the one dimensional marginal distribution of $Y^h$. Further considerations include the specific beta parameters used to construct the distribution from which to draw $h$ and selecting a proper number of combinations to establish the number of projections required to improve the method performance. All of these details are discussed in @nietoreyes2014. + +Next, we summarize the test of random projections in practice: + + 1. Select $k,$ which results in $2k$ independent random projections (*by default* `k = 1`). + + 2. Draw the $2k$ random elements to project the process from a dissipative distribution that uses a particular beta distribution. 
By default, use a $\beta(2,7)$ for the first $k$ projections and a $\beta(100,1)$ for the latter $k$. + + 3. Apply the tests of *Lobato and Velasco* to the even projected processes and *Epps* to the odd projections. + + 4. Combine the obtained $2k$ `p-values` using the false discovery rate. By default, use the @Benjamin2001 procedure. + +The `rp.test()` function implements the above procedure. The user might provide optional parameters such as the number of projections `k`, the parameters of the first beta distribution `pars1` and those of the second `pars2`. The next example illustrates the application of the `rp.test()` to a stationary GARCH(1,1) process drawn using normal random variables. + +#### Example 3 +A stationary `GARCH(1,1)` process is drawn with a standard normal distribution and parameters $\alpha_0 = 0,$ $\alpha_1 = 0.2$ and $\beta_1 = 0.3$ using the \CRANpkg{fGarch} package [@fGarch]. Note that a `GARCH(1,1)` process is stationary if the parameters $\alpha_1$ and $\beta_1$ satisfy the inequality $\alpha_1 + \beta_1 < 1$ [@Bollerslev1986]. + +```{r, echo = TRUE} +set.seed(3468) +library(fGarch) +spec = garchSpec(model = list(alpha = 0.2, beta = 0.3)) +x = ts(garchSim(spec, n = 300)) +rp.test(x) +``` + +At significance level $\alpha = 0.05,$ the applied *random projections* test with `k = 1` as the number of projections shows no evidence to reject the null hypothesis of normality. + +## The Psaradakis and Vavra test + +@vavra2017 adapted a distance test for normality for a one-dimensional marginal distribution of a stationary process. Initially, the test was based on the Anderson (1952) test statistic and used an auto-regressive sieve bootstrap approximation to the null distribution of the sample test statistic.
Later, @psaradakis2020normality considered this test as the ultimate normality test based on the empirical distribution function, and adapted its methodology to a wide range of tests, including Shapiro-Wilk [@SWtest1965], Jarque-Bera [@jarque1980], Cramér-von Mises [@vonMisses1962], Epps, and Lobato-Velasco. Their experiments show that the Lobato-Velasco and Jarque-Bera test's bootstrap version performs best in small samples. + +Although the test is said to be applicable to a wide class of non-stationary processes by transforming them into stationary by means of a fractional difference operator, no theoretical result was apparently provided to sustain this transformation. This work restricts the presentation of the original procedure to stationary processes. + +Let $X$ be a stationary process satisfying +\begin{equation} + X_t = \sum_{i=0}^{\infty}\theta_i \epsilon_{t-i} + \mu_0, \ t \in \mathbb{Z}, (\#eq:aPV) +\end{equation} +where $\mu_0 \in \mathbb{R}$, $\{\theta_i\}_{i=0}^\infty\in l^2$ with $\theta_0 = 1$ and $\{\epsilon_t\}_{t \in \mathbb{Z}}$ is a collection of mean zero i.i.d random variables. The null hypothesis is that the one dimensional marginal distribution of $X$ is normally distributed, +$$ + H_0: F(\mu_0 +\sqrt{\gamma(0)}x)-F_N(x) = 0, \text{ for all } x\in \mathbb{R}, +$$ +where $F$ is the cumulative distribution function of $X_0$, and $F_N$ denotes the standard normal cumulative distribution function. Note that if $\epsilon_0$ is normally distributed, then the null hypothesis is satisfied. Conversely, if the null hypothesis is satisfied, then $\epsilon_0$ is normally distributed and, consequently, $X_0$.
+The considered test for $H_0$ is based on the Anderson-Darling distance statistic +\begin{equation} + A_d = \int_{-\infty}^{\infty}\dfrac{[{F_n}(\widehat{\mu}+\sqrt{\widehat{\gamma}(0)}x)-F_N(x)]^2}{F_N(x)[1-F_N(x)]}dF_N(x), (\#eq:aPV1) +\end{equation} +where ${F_n}(\cdot)$ is the empirical distribution function associated to $F$ based on a simple random sample of size $n$. @vavra2017 proposes an auto-regressive sieve bootstrap procedure to approximate the sampling properties of $A_d$ arguing that making use of classical asymptotic inference for $A_d$ is problematic and involved. This scheme is motivated by the fact that under some assumptions for $X,$ including \@ref(eq:aPV), $\epsilon_t$ admits the representation +\begin{equation} + \epsilon_t = \sum_{i=1}^{\infty}\phi_i(X_{t-i} - \mu_0), \ t \in \mathbb{Z}, (\#eq:ePV) +\end{equation} +for certain type of $\{\phi_i\}_{i=1}^\infty\in l^2$. The main idea behind this approach is to generate a bootstrap sample $\epsilon_t^*$ to approximate $\epsilon_t$ with a finite-order auto-regressive model. This is because the distribution of the processes $\epsilon_t$ and $\epsilon_t^*$ coincide asymptotically if the order of the auto-regressive approximation grows simultaneously with $n$ at an appropriate rate [@Buhlmann1997]. The procedure makes use of the $\epsilon_t^{*'}s$ to obtain the $X_t^{*'}s$ through the bootstrap analog of \@ref(eq:ePV). Then, generate a bootstrap sample of the $A_d$ statistic, $A_d^{*},$ making use of the bootstrap analog of \@ref(eq:aPV). + +The `vavra.test()` function implements @psaradakis2020normality procedure. By default, it generates 1,000 sieve-bootstrap replications of the Anderson-Darling statistic. The user can provide different test procedures, such as the *Shapiro-Wilk, Jarque-Bera, Cramer von Mises, Epps* or *Lobato-Velasco* test, by specifying a text value to the `normality` argument. The presented values are Monte Carlo estimates of the $A_d$ statistic and `p.value`. 
+ +#### Example 4 +A stationary $ARMA$(1,1) process is simulated using a standard normal distribution and the *Psaradakis and Vávra* procedure is performed using Anderson-Darling and Cramer von Mises test statistics. At significance level $\alpha = 0.05$, there is no evidence to reject the null hypothesis of normality. + +```{r, echo = TRUE} +set.seed(298) +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34)) +# Default, Psaradakis and Vavra's procedure +vavra.test(x, seed = 298) +``` + +Approximate Cramer von Mises test for the Psaradakis and Vavra's procedure + +```{r, echo = TRUE} +vavra.test(x, normality = "cvm", seed = 298) +``` + +## The multivariate kurtosis test + +The literature contains some procedures to test the null hypothesis that a multivariate stochastic process is Gaussian. Those include @moulines1992testing, a test based on the characteristic function, and @Steinberg1992, a test based on properties of the entropy of Gaussian processes that does not make use of cumulant computations. According to @el2022normality, these tests may hardly be executable in real time. Consequently, they propose a test based on multivariate kurtosis [@mardia1970measures]. The proposed procedure is for $p=1,2,$ and we elaborate on it in what follows. In Section 6.3 of @el2022normality, they suggest applying random projections for higher dimensions but they do not investigate the procedure any further. + +The p-value of this test is obtained as $2(1-F_N(z))$ where, as above, $F_N$ denotes the standard normal cumulative distribution function. There, + $$ + z:=(\hat{B}_p-E[\hat{B}_p])/\sqrt{E[(\hat{B}_p-E[\hat{B}_p])^2]}, + $$ + where + $$ + \hat{B}_p:=n^{-1}\sum_{t=1}^n(x_t^t \hat{S}^{-1}x_t)^2, + $$ +and +$$ + \hat{S}:=n^{-1}\sum_{t=1}^n x_t x_t^t. +$$ +In @el2022normality, the reader can find the exact computations of $E[\hat{B}_p]$ and $E[(\hat{B}_p-E[\hat{B}_p])^2].$ + +This test is implemented in the `elbouch.test()` function.
By default, the function computes the univariate El Bouch test. If the user provides a secondary data set, the function computes the bivariate counterpart. + +#### Example 5 +Simulate a two-dimensional stationary VAR(2) process using independent AR(1) and AR(2) processes with standard normal distributions and apply the bivariate El Bouch test. At significance level $\alpha = 0.05$, there is no evidence to reject the null hypothesis of normality. + +```{r, echo=TRUE} +set.seed(23890) +x = arima.sim(250,model = list(ar = 0.2)) +y = arima.sim(250,model = list(ar = c(0.4,0,.1))) +elbouch.test(y = y,x = x) +``` + +# Simulations and data analysis + +## Numerical experiments + +Inspired by the simulation studies in @vavra2017 and @nietoreyes2014, we propose here a procedure that involves drawing data from the $AR(1)$ process +\begin{equation} + X_t = \phi X_{t-1} + \epsilon_t, \ t \in\mathbb{Z}, \text{ for } \phi \in \{ 0,\pm 0.25,\pm 0.4\}, (\#eq:eqAR) +\end{equation} +where the $\{\epsilon_t\}_{t\in\mathbb{Z}}$ are i.i.d random variables. For the distribution of the $\epsilon_t$ we consider different scenarios: standard normal ($N$), standard log-normal ($\log N$), Student t with 3 degrees of freedom ($t_3$), chi-squared with 10 degrees of freedom ($\chi^2(10)$) and gamma with $(7, 1)$ shape and scale parameters ($\Gamma(7,1)$). + +```{r tab1-static, eval = knitr::is_latex_output(),warning = FALSE} +load("data/r_sim.Rdata") +phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") + +r1 = results1[,2:14] +colnames(r1) = c("phi", phi, phi) + +kable(r1, "latex", booktabs = TRUE,digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) +``` + +```{r tab1-interactive, eval = knitr::is_html_output(),warning = FALSE} +load("data/r_sim.Rdata") +phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") + +r1 = results1[,2:14] +colnames(r1) = c("phi", phi, phi) + +kable(r1, "html", booktabs = TRUE, digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) +``` + +As in @vavra2017, $m=1,000$ independent draws of the above process are generated for each pair of parameter $\phi$ and distribution. Each draw is taken of length $past+n,$ with $past=500$ and $n \in \{100,250,500,1000 \}$. 
The first 500 data points of each realization are then discarded in order to eliminate start-up effects. The $n$ remaining data points are used to compute the value of the test statistic of interest. In each particular scenario, the rejection rate is obtained by computing the proportion of times that the test is rejected among the $m$ trials. + +```{r tab2-static, eval = knitr::is_latex_output(),warning = FALSE} +r2 = results2[,2:14] +colnames(r2) = c("phi", phi, phi) + +kable(r2, "latex", booktabs = TRUE, digits = 3, caption = "Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) +``` + +```{r tab2-interactive, eval = knitr::is_html_output(),warning = FALSE} +r2 = results2[,2:14] +colnames(r2) = c("phi", phi, phi) + +kable(r2, "html", booktabs = TRUE, digits = 3, caption = "Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. 
$phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) +``` + +Tables `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab1-interactive)', '\\@ref(tab:tab1-static)'))` and `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab2-interactive)', '\\@ref(tab:tab2-static)'))` present the rejection rate estimates. For every process of length $n,$ the columns represent the used $AR(1)$ parameter and the rows the distribution used to draw the process. The obtained results are consistent with those obtained in the publications where the different tests were proposed. As expected, rejection rates are around 0.05 when the data is drawn from a standard normal distribution, as in this case the data is drawn from a Gaussian process. Conversely, high rejection rates are registered for the other distributions. Low rejection rates are observed, however, for the $\chi^2(10)$ distribution when making use of some of the tests. For instance, the *Epps* and *bootstrap Epps* tests, although they consistently tend to 1 when the length of the process, $n,$ increases. Another case is the El Bouch test. However, this one maintains low rates for large values of $|\phi|$ when $n$ increases. Furthermore, for the random projections test, the number of projections used in this study is the default $k = 1,$ which is by far a lower number than the recommended by @nietoreyes2014. 
However, even in these conditions, the obtained results are satisfactory, with the random projection test having even better performance than the tests of @epps1987 or @vavra2017. + +An important aspect in selecting a procedure is its computation time. Thus, for each length of the process, $n,$ there is an additional column, max.phi, in *Tables* `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab1-interactive)', '\\@ref(tab:tab1-static)'))` and `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab2-interactive)', '\\@ref(tab:tab2-static)'))`. Each entry in this column refers to a different distribution and contains the maximum running time in seconds to obtain the rejection rate among the different values of the AR parameter. That is, for a fix distribution, the rejection rates are computed for each of the five possibilities of $\phi$ and the time that it takes recorded. The running time in the table is the largest among the five. Furthermore, in \textit{Table} `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab3-interactive)', '\\@ref(tab:tab3-static)'))` we show the time in seconds that each studied test takes to check whether a given process is Gaussian. In particular, the table contains the average running time over 1,000 trials that takes to generate and check a Gaussian AR(1) process with parameter $\phi = 0.5$. This is done for different sample sizes, $n \in \{1000, 2000, 3000, 4000, 5000\}.$ According to the table, the asymptotic tests (Lobato and Velasco, Epps, random projections and El Bouch) have similar running times. On the contrary, the bootstrap based tests (Psaradakis and Vavra, Bootstrap Epps and Lobato and Velasco) have, as expected, higher running times on average. 
Furthermore, Tables `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab1-interactive)', '\\@ref(tab:tab1-static)'))` and `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(tab:tab2-interactive)', '\\@ref(tab:tab2-static)'))` show similar results in time performance. There, the maximum running time of the bootstrap based tests exceeds in more than ten seconds the time obtained with the asymptotic based tests. It is worth saying that the tables have been obtained with R version 4.3.1 (2023-06-16) and platform aarch64-apple-darwin20 (64-bit),running under macOS Sonoma 14.2.1. + +```{r tab3-static, eval = knitr::is_latex_output(),warning = FALSE} +load("data/runtime.Rdata") + +kable(runtime, "latex", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") +``` + +```{r tab3-interactive, eval = knitr::is_html_output(),warning = FALSE} +load("data/runtime.Rdata") + +kable(runtime,"html", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") +``` + +## Real data application + +As an illustrative example, we analyze the monthly mean carbon dioxide, in parts per million (*ppm*), measured at the Mauna Loa Observatory, in Hawaii, from March 1958 to November 2018. 
The carbon dioxide data measured as the mole fraction in dry air on Mauna Loa constitute the longest record of direct measurements of $CO2$ in the atmosphere. This dataset is available in the \CRANpkg{astsa} package [@astsa] under the name *cardox* data and it is displayed in the left panel of Figure `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(fig:fig1-interactive)', '\\@ref(fig:fig1-static)'))`. The plot's grid is created using the \CRANpkg{cowplot} package [@cowplot]. + +The objective of this subsection is to propose a model to analyze this time series and check the assumptions on the residuals of the model using our implemented `check_residuals()` function. The time series clearly has trend and seasonal components (see left panel of Figure `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(fig:fig1-interactive)', '\\@ref(fig:fig1-static)'))`), therefore, an adequate model that filters both components has to be selected. We make use of an ETS model. For its implementation, we make use the `ets()` function from the \CRANpkg{forecast} package [@Rob2007]. This function fits 32 different ETS models and selects the best model according to information criteria such as *Akaike's information criterion* (AIC) or *Bayesian Information criteria* (BIC) [@BIC2006]. +The results provided by the `ets()` function are: + +```{r fig1-static, fig.cap = "Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. 
Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.", eval = knitr::is_latex_output(), fig.alt="(ref:demo-caption1)", out.width = "75%"} +library(astsa) +g1 = autoplot(cardox, main = "CO2 levels at Mauna Loa", + xlab = "years", ylab = "CO2 (ppm)") +g2 = autoplot(forecast(ets(cardox), h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: CO2 Levels at Mauna Loa") +cowplot::plot_grid(g1,g2,ncol = 2) +``` + +`r if (knitr::is_latex_output()) "(ref:demo-caption1) Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour."` + +```{r fig1-interactive, echo = knitr::is_html_output(), eval = knitr::is_html_output(),fig.cap="CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality."} +library(astsa) + +autoplot(cardox, main = "Carbon Dioxide levels at Mauna Loa", + xlab = "years", ylab = "CO2 (ppm)") +``` + + + +```{r, echo = TRUE} +library(forecast) +library(astsa) +model = ets(cardox) +summary(model) +``` + +The resulting model, proposed by the `ets()` function, for analyzing the *carbon dioxide* data in *Mauna Loa* is an $ETS[M,A,A]$ model. The parameters $\alpha, \beta \text{ and } \gamma$ (see Definition 1) have being estimated using the least squares method. If the assumptions on the model are satisfied, then the errors of the model behave like a Gaussian stationary process. To check it, we make use of the function `check_residuals()`. For more details on the compatibility of this function with the models obtained by other packages see the \CRANpkg{nortsTest} repository. 
In the following, we display the results of using the *Augmented Dickey-Fuller* test (*Subsection 3.1*) to check the stationary assumption and the *random projection* test with `k = 1` projections to check the normality assumption. For the other test options see the function's documentation. + +```{r, echo = TRUE, eval = FALSE} +check_residuals(model,unit_root = "adf",normality = "rp", + plot = TRUE) +``` + +```{r, echo = FALSE, eval = TRUE} +check_residuals(model,unit_root = "adf",normality = "rp", plot = FALSE) +``` + +The obtained results indicate that the null hypothesis of non stationarity is rejected at significance level $\alpha = 0.01.$ Additionally, there is no evidence to reject the null hypothesis of normality at significance level $\alpha = 0.05.$ Consequently, we conclude that the residuals follow a stationary Gaussian process, having that the resulting $ETS[M,A,A]$ model adjusts well to the *carbon dioxide* data in *Mauna Loa*. + +In the above displayed `check_residuals()` function, the `plot` argument is set to `TRUE`. The resulting plots are shown in Figure `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(fig:fig2-interactive)', '\\@ref(fig:fig2-static)'))`. The plot in the *top* panel and the auto-correlation plots in the bottom panels insinuate that the residuals have a stationary behavior. The *top* panel plot shows slight oscillations around zero and the auto-correlations functions in the *bottom* panels have values close to zero in every lag. The histogram and qq-plot in the *middle* panels suggest that the marginal distribution of the residuals is normally distributed. Therefore, Figure `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(fig:fig2-interactive)', '\\@ref(fig:fig2-static)'))` agrees with the reported results, indicating that the assumptions of the model are satisfied. + +```{r fig2-interactive, eval = knitr::is_html_output(), fig.cap = "Check residuals plot for the ETS(M,A,A) model. 
The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity."} +check_plot(model) +``` + +```{r fig2-static, fig.cap = "Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.", eval = knitr::is_latex_output(), fig.alt= "(ref:demo-caption2)", out.width = "100%"} +check_plot(model) +``` + +`r if (knitr::is_latex_output()) "(ref:demo-caption2) Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity."` + +As the assumptions of the model have been checked, it can be used for instance to forecast. The result of applying the following function is displayed in Figure `r knitr::asis_output(ifelse(knitr::is_html_output(), '\\@ref(fig:fig3-dynamic)', '\\@ref(fig:fig1-static)'))`. 
It presents the carbon dioxide data for the last 8 years and a forecast of the next 12 months. It is observable from the plot that the model captures the process trend and periodicity. + +```{r fig3-dynamic, echo = knitr::is_html_output(), eval = knitr::is_html_output(), fig.cap = "Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour."} +autoplot(forecast(model,h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: Carbon Dioxide Levels at Mauna Loa") +``` + +```{r, echo = knitr::is_latex_output(), eval = FALSE, fig.cap = "(ref:demo-caption3)"} +autoplot(forecast(model,h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: Carbon Dioxide Levels at Mauna Loa") +``` + +`r if (knitr::is_latex_output()) "(ref:demo-caption3) Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour."` + +# Conclusions + +For independent data, the \CRANpkg{nortest} package [@nortest2015] provides five different tests for normality, the \CRANpkg{mvnormtest} package [@mvnormtest2012] performs the Shapiro-Wilks test for multivariate data and the \CRANpkg{MissMech} package [@Mortaza2014] provides tests for normality in multivariate incomplete data. To test the normality of dependent data, some authors such as @vavra2017 and @nietoreyes2014 have available undocumented `Matlab` code, which is almost only helpful in re-doing their simulation studies. + +To our knowledge, no consistent implementation or package of tests for normality of stationary processes has been done before. Therefore, the \CRANpkg{nortsTest} is the first package to implement normality tests in stationary processes. This work gives a general overview of a careful selection of tests for normality in the stationary process, which consists of the most available types of tests. 
It additionally provides examples that illustrate each of the test implementations. + +For checking the model's assumptions, the \CRANpkg{forecast} and \CRANpkg{astsa} packages contain functions for visual diagnostic. Following the same idea, \CRANpkg{nortsTest} provides similar diagnostic methods; it also reports the results of testing stationarity and normality, the main assumptions for the residuals in time series analysis. + +# Future work and projects + +A further version of the \CRANpkg{nortsTest} package will incorporate additional tests such as Bispectral [@Hinich1982] and Stein's characterization [@Meddahi2005]. Further future work will include a Bayesian version of a *residuals check* procedure that uses the random projection method. Any future version under development can be installed from `GitHub` using the following code. + +```{r,echo = TRUE, eval = FALSE} +if (!requireNamespace("remotes")) install.packages("remotes") +remotes::install_github("asael697/nortsTest",dependencies = TRUE) +``` + +# Acknowledgment {-} + +This work was supported by grant PID2022-139237NB-I00 funded by “ERDF A way of making Europe” and MCIN/AEI/10.13039/501100011033. diff --git a/_articles/RJ-2024-008/RJ-2024-008.html b/_articles/RJ-2024-008/RJ-2024-008.html new file mode 100644 index 0000000000..91d8d3f2ae --- /dev/null +++ b/_articles/RJ-2024-008/RJ-2024-008.html @@ -0,0 +1,6891 @@ + + + + + + + + + + + + + + + + + + + + + + nortsTest: An R Package for Assessing Normality of Stationary Processes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    nortsTest: An R Package for Assessing Normality of Stationary Processes

    + + + +

    Normality is the central assumption for analyzing dependent data in several time series models, and the literature has widely studied normality tests. However, the implementations of these tests are limited. The nortsTest package is dedicated to filling this void. The package performs the asymptotic and bootstrap versions of the tests of Epps and Lobato and Velasco and the tests of Psaradakis and Vavra, random projections and El Bouch for normality of stationary processes. These tests are for univariate stationary processes, except for El Bouch's, which also allows bivariate stationary processes. In addition, the package offers visual diagnostics for checking stationarity and normality assumptions for the most used time series models in several R packages. This work aims to show the package’s functionality, presenting each test performance with simulated examples and the package utility for model diagnostic in time series analysis.

    +
    + + + +
    +

    1 Introduction

    +

    Normality (a set of observations sampled from a Gaussian process) is an essential assumption in various statistical models. Therefore, developing procedures for testing this assumption is a topic that has gained popularity over several years. Most existing literature and implementation is dedicated to independent and identically distributed random variables (D’Agostino and Stephens 1986); no results show that these tests are consistent when applied to stationary processes. For this context, several tests have been proposed over the years, but as far as we know, no R package or consistent implementation exists.

    +

    The proposed nortsTest package provides seven test implementations to check normality of stationary processes. This work aims to present a review of these tests and introduce the package functionality. Thus, its novelty lies in being the first package and paper dedicated to the implementation of normality tests for stationary processes. The implemented tests are: (i) the asymptotic Epps test, (Epps 1987) and (Nieto-Reyes et al. 2014), based on the characteristic function and (ii) its sieve bootstrap approximation (Psaradakis and Vávra 2020), (iii) the corrected Skewness-Kurtosis (SK) test implemented by Lobato and Velasco (2004) as an asymptotic test and (iv) by Psaradakis and Vávra (2020) with a sieve bootstrap approximation, (v) the random projections test proposed by Nieto-Reyes et al. (2014), which makes use of the tests in (i) and (iii), (vi) the Psadarakis and Vávra test (Psaradakis and Vávra 2017) that uses a bootstrap approximation of the Anderson and Darling (1952) test statistic for stationary linear processes and (vii) a normality test by El Bouch et al. (2022) for multivariate dependent samples. Tests (i) to (vi) are for univariate stationary processes.

    +

    Furthermore, we propose the check_residual() function for checking time-series models’ assumptions. This function returns a report for stationarity, seasonality, normality tests and visual diagnostics. check_residual() supports models from the most used packages for time-series analysis, such as the packages forecast (Hyndman and Khandakar 2008) and aTSA (Qiu 2015) and even functions in the base R (Team 2018); for instance, it supports the HoltWinters (stats R package) function for the Holt and Winters method (Holt 2004). In addition, the proposed nortsTest package has already been applied in the literature, see Nieto-Reyes (2021) and Nieto-Reyes (2022).

    +

    Section 2 provides the theoretical background, including preliminary concepts and results. Section 3 introduces the normality tests for stationary processes, each subsection introducing a test framework and including examples of the tests' functions with simulated data. Section 4 provides numerical experiments with simulated data and a real-world application: Subsection 4.1 reports a simulation study for the implemented normality tests and Subsection 4.2 the package’s functionality for model checking in a real data application. The carbon dioxide data measured in the Mauna Loa Observatory (Stoffer 2020) is analyzed using a state space model from the forecast package, evaluating the model’s assumptions using our proposed check_residuals() function. Section 5 discusses the package functionality and provides our conclusions. Furthermore, we mention our future intended work on the package.

    +

    2 Preliminary concepts

    +

    This section provides some theoretical aspects of stochastic processes that are a necessary theoretical framework for the following sections. Shumway and Stoffer (2010) and Tsay (2010) give more details of the following definitions and results below.

    +

    For the purpose of this work, \(T\) is a set of real values denoted as time, \(T \subseteq \mathbb{R},\) for instance \(T=\mathbb{N}\) or \(T=\mathbb{Z},\) the naturals or integer numbers respectively. We denote by \(X:=\{X_t\}_{t\in T}\) a stochastic process with \(X_t\) a real random variable for each \(t\in T.\) Following this notation, a time series is just a finite collection of ordered observations of \(X\) (Shumway and Stoffer 2010). An important measure for a stochastic process is its mean function \(\mu(t) := E[X_t]\) for each \(t \in T\), where \(E[\cdot]\) denotes the usual expected value of a random variable. A generalization of this measure is the k-th order centered moment function \(\mu_k(t) := E[(X_t -\mu(t))^k]\) for each \(t \in T\) and \(k > 1;\) with the process variance function being the second order centered moment, \(\sigma^2(t) := \mu_2(t)\). Other important measures are the auto-covariance and auto-correlation functions, which measure the linear dependency between two different time points of a given process. For any \(t,s \in T,\) they are, respectively, +\[ +\gamma(t,s) := E[(X_t -\mu(t))(X_s - \mu(s))] \mbox{ and } \rho(t,s) := \dfrac{\gamma(t,s)}{\sqrt{\mu_2(t)}\sqrt{\mu_2(s)}}. +\] +Other widely used measure functions for the analysis of processes are the skewness and kurtosis functions, defined as \(s(t) := \mu_3(t)/[\mu_2(t)]^{3/2}\) and \(k(t) := \mu_4(t)/[\mu_2(t)]^2\) for each \(t\in T,\) respectively.

    +

    A generally used assumption for stochastic processes is stationarity. It has a key role in forecasting procedures of classic time-series modeling (Tsay 2010) or as a principal assumption in de-noising methods for signal theory (Wasserman 2006).

    +
    Definition 1
    +

    A stochastic process \(X\) is said to be strictly stationary if, for every collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T\) and \(h > 0\), the joint distribution of \(\{X_t\}_{t \in \tau}\) is identical to that of \(\{X_{t+h}\}_{t \in \tau}.\)

    +

    The previous definition is strong for applications. A milder version of it, which makes use of the process’ first two moments, is weak stationarity.

    +
    Definition 2
    +

    A stochastic process \(X\) is said to be weakly stationary if its mean function is constant in time, \(\mu(t) = \mu\), its auto-covariance function only depends on the difference between times, \(\gamma(s,t) = \sigma|t-s|\) for a \(\sigma\in \mathbb{R}\), and it has a finite variance function, \(\mu_2(t) = \mu_2 < \infty\).

    +

    For the rest of this work, the term stationary will be used to specify a weakly stationary process. A direct consequence of the stationarity assumption is that the previous measure functions get simplified. Thus, given a stationary stochastic process \(X,\) its mean function, \(k\)-th order centered moment, for \(k>1,\) and auto-covariance function are respectively, +\[ +\mu = E[X_{t_1}]\mbox{, } \mu_k = E[(X_{t_1} -\mu)^k] \mbox{ and } \gamma(h) = E[(X_{t_1+h}-\mu)(X_{t_1}-\mu)], +\] +which are independent of \(t_1\in T.\)

    +

    Given a sample \(x_1, \ldots, x_n,\) \(n\in\mathbb{N},\) of equally spaced observations of \(X,\) their corresponding estimators, sample mean, sample \(k\)-th order centered moment and sample auto-covariance, are respectively +\[ +\widehat{\mu} := n^{-1}\sum_{i=1}^nx_i\mbox{, } \widehat{\mu}_k := n^{-1}\sum_{i=1}^n(x_i - \widehat{\mu})^k \mbox{ and }\widehat{\gamma}(h) := n^{-1}\sum_{i = 1}^{n-h}(x_{i+h} - \widehat{\mu})(x_i - \widehat{\mu}). +\]

    +

    A particular case in which stationarity implies strictly stationarity is a Gaussian process.

    +
    Definition 3
    +

    A stochastic process \(X\) is said to be a Gaussian process if for every finite collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T,\) the joint distribution of \(\{X_t\}_{t \in \tau}\) has a multivariate normal distribution.

    +

    A series of mean zero uncorrelated random variables with finite constant variance is known as white noise. If additionally, it is formed of independent and identically distributed (i.i.d) normal random variables, it is known as Gaussian white noise; which is a particular case of stationary Gaussian process. For the rest of the work, \(X_t \sim N(\mu,\sigma^2)\) denotes that the random variable \(X_t\) is normally distributed with mean \(\mu\) and variance \(\sigma^2\) and \(\chi^2(v)\) denotes the Chi square distribution with \(v\) degrees of freedom.

    +

    Other classes of stochastic processes can be defined using collections of white noise, for instance, the linear process.

    +
    Definition 4
    +

    Let \(X\) be a stochastic process. \(X\) is said to be linear if it can be written as +\[ +X_t = \mu + \sum_{i\in\mathbb{Z}}\phi_i\epsilon_{t-i}, +\] +where \(\{\epsilon_i\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables and \(\{\phi_i\}_{i\in\mathbb{Z}}\) is a set of real values such that \(\sum_{i\in\mathbb{Z}} |\phi_j| < \infty.\)

    +

    An important class of processes is the auto-regressive moving average (\(ARMA\)). Box and Jenkins (1990) introduced it for time series analysis and forecast, becoming very well-known in the 90s and early 21st century.

    +
    Definition 5
    +

    For any non-negative integers \(p,q,\) a stochastic process \(X\) is an \(ARMA(p,q)\) process if it is a stationary process and +\[\begin{equation} + X_t = \sum_{i=0}^p \phi_iX_{t-i} +\sum_{i=0}^q \theta_i\epsilon_{t-i}, \tag{1} +\end{equation}\] +where \(\{\phi_i\}_{i=0}^p\) and \(\{\theta_i\}_{i=0}^q\) are sequences of real values with \(\phi_0= 0,\) \(\phi_p\neq 0,\) \(\theta_0=1\) and \(\theta_q\neq 0\) and \(\{\epsilon_{i}\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables.

    +

    Particular cases of \(ARMA\) processes are those known as auto-regressive (\(AR(p) := ARMA(p,0)\)) and moving average (\(MA(q) := ARMA(0,q)\)) processes. Additionally, a random walk is a non-stationary AR(1) +process satisfying (1) with \(p=1,\) \(\phi_1 = 1\) and \(q=0.\) Several properties of an \(ARMA\) process can be extracted from its structure. For that, the \(AR\) and \(MA\) polynomials are introduced +\[ +AR:\text{ } \phi(z) = 1-\sum_{i=0}^p \phi_i z^i \text{ and } MA:\text{ } \theta(z) = \sum_{i=0}^q \theta_i z^i, +\] +where \(z\) is a complex number and, as before, \(\phi_0 = 0,\) \(\phi_p\neq 0,\) \(\theta_0= 1\) and \(\theta_q\neq 0.\) Conditions for stationarity, order selection and process behavior are properties studied from these two polynomials.

    +

    For modeling volatility in financial data, Bollerslev (1986) proposed the generalized auto-regressive conditional heteroscedastic (GARCH) class of processes as a generalization of the auto-regressive conditional heteroscedastic (ARCH) processes (Engle 1982).

    +
    Definition 6
    +

    For any \(p,q \in \mathbb{N}\), a stochastic process \(X\) is a \(GARCH(p,q)\) process if it satisfies \(X_t = \mu + \sigma_{t}\epsilon_t\) with +\[ +\sigma_t^2 = \alpha_0 +\sum_{i=1}^p\alpha_i \epsilon_{t-i}^2 +\sum_{i=1}^q \beta_{i}\sigma^2_{t-i}. +\] +\(\mu\) is the process mean, \(\sigma_0\) is a positive constant value, \(\{\alpha_i\}_{i=1}^p\) and \(\{\beta_i\}_{i=1}^q\) are non-negative sequences of real values and \(\{\epsilon_{t}\}_{t \in T}\) is a collection of i.i.d. random variables.

    +

    A more general class of processes are the state-space models (\(SSMs\)), which have gained popularity over the years because they do not impose on the process common restrictions such as linearity or stationarity and are flexible in incorporating the process different characteristics (Petris et al. 2007). They are widely used for smoothing (West and Harrison 2006) and forecasting (Hyndman and Khandakar 2008) in time series analysis. The main idea is to model the process dependency with two equations: the state equation, which models how parameters change over time, and the innovation equation, which models the process in terms of the parameters. Some particular SSMs that analyze the level, trend and seasonal components of the process are known as error, trend, and seasonal (ETS) models. There are over 32 different variations of ETS models (Hyndman et al. 2008). One of them is the multiplicative error, additive trend-seasonality \((ETS(M,A,A))\) model.

    +
    Definition 7
    +

    A SSM process \(X\) follows an ETS(M,A,A) model, if the process accepts
    +\[ +X_t = [L_{t-1} +T_{t-1} + S_{t-1}](1 + \epsilon_t) +\] +as innovation equation and +\[\begin{eqnarray*}L_t &= &L_{t-1} +T_{t-1} +\alpha (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + T_t &= &T_{t-1} + \beta (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + S_t &= &S_{t-m} + \gamma (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t, +\end{eqnarray*}\]
    +as state equations. +\(\alpha, \beta,\gamma \in [0,1]\), \(m\in\mathbb{N}\) denotes the period of the series and \(\{\epsilon_t\}\) are i.i.d normal random variables. For each \(t\in\mathbb{Z},\) \(L_t\), \(T_t\) and \(S_t\) represent respectively the level, trend and seasonal components.

    +

    3 Normality tests for stationary processes

    +

    Extensive literature exists on goodness of fit tests for normality under the assumption of independent and identically distributed random variables, including, among others, Pearson’s chi-squared test (Pearson and Henrici 1895), Kolmogorov-Smirnov test (Smirnov 1948), Anderson-Darling test (Anderson and Darling 1952), SK test (Jarque and Bera 1980) and Shapiro-Wilk test, (Shapiro and Wilk 1965) and (Royston 1982). These procedures have been widely used in many studies and applications, see D’Agostino and Stephens (1986) for further details. There are no results, however, showing that the above tests are consistent in the context of stationary processes, in which case the independence assumption is violated. For instance, Gasser (1975) provides a simulation study where Pearson’s chi-squared test has an excessive rejection rate under the null hypothesis for dependent data. For this matter, several tests for stationary processes have been proposed over the years. A selection of which we reference here. Epps (1987) provides a test based on the characteristic function, Hinich (1982) proposes a similar test based on the process’ spectral density function (Berg et al. 2010, for further insight). Gasser (1975) gives a correction of the SK test, with several modifications made in Lobato and Velasco (2004), Bai and Ng (2005) and Psaradakis (2017), which are popular in many financial applications. Bontemps and Meddahi (2005) constructs a test based on Stein’s characterization of a Gaussian distribution. Using the random projection method (Cuesta-Albertos et al. 2007), Nieto-Reyes et al. (2014) build a test that upgrades the performance of Epps (1987) and Lobato and Velasco (2004) procedures. Furthermore, Psaradakis and Vávra (2017) adapts the Anderson and Darling (1952) statistic for stationary linear processes approximating its sample distribution with a sieve bootstrap procedure.

    +

    Despite the existing literature, consistent implementations of goodness of fit test for normality of stationary processes in programming languages such as R or Python are limited. This is not the case for normality of independent data, the nortest package (Gross and Ligges 2015) implements tests such as Lilliefors (Dallal and Wilkinson 1986), Shapiro-Francia (Royston 1993), Pearson’s chi-squared, Cramer von Misses (Anderson 1962) and Anderson-Darling. For a multivariate counterpart, the mvnTest package (Pya et al. 2016) implements the multivariate Shapiro-Wilk, Anderson-Darling, Cramer von Misses, Royston (Royston 1992), Doornik and Hansen (Doornik and Hansen 2008), Henze and Zirkler (Henze and Zirkler 1990) and the multivariate Chi square test (Vassilly Voinov and Voinov 2016). For the case of dependent data, we present here the nortsTest package. Type within R install.packages("nortsTest", dependencies = TRUE) to install its latest released version from CRAN. nortsTest performs the tests proposed in Epps (1987), Lobato and Velasco (2004), Psaradakis and Vávra (2020), Nieto-Reyes et al. (2014), Psaradakis and Vávra (2017) and El Bouch et al. (2022).

    +

    Additionally, the package offers visualization functions for descriptive time series analysis and several diagnostic methods for checking stationarity and normality assumptions for the most used time series models of several R packages. To elaborate on this, Subsection 3.1 introduces the package functionality and software and Subsection 3.2 provides an overview of tests for checking stationary and seasonality. Finally, Subsections 3.3-3.5 present a general framework of each of the implemented normality tests and their functionality by providing simulated data examples.

    +

    3.1 Software

    +

    The package works as an extension of the nortest package (Gross and Ligges 2015), which performs normality tests in random samples but for independent data. The building block functions of the nortsTest package are:

    +
      +
    • epps.test(), function that implements the test of Epps,

    • +
    • epps_bootstrap.test(), function that implements a bootstrap approximation of the test of Epps,

    • +
    • lobato.test(), function that implements the asymptotic test of Lobato and Velasco,

    • +
    • lobato_bootstrap.test(), function that implements a bootstrap approximation of the test of Lobato and Velasco,

    • +
    • rp.test(), function that implements the random projection test of Nieto-Reyes, Cuesta-Albertos and Gamboa,

    • +
    • vavra.test(), function that implements the test of Psaradakis and Vávra, and

    • +
    • elbouch.test(), function that implements the test of El Bouch, Michel and Comon.

    • +
    +

    Each of these functions accepts a numeric (numeric) or ts (time series) class object for storing data, and returns a htest (hypothesis test) class object with the main results for the test. To guarantee the accuracy of the results, each test performs unit root tests for checking stationarity and seasonality (see Subsection 3.2) and displays a warning message if any of them is not satisfied.

    +

    For visual diagnostic, the package offers different plot functions based on the ggplot2 package (Wickham 2009): the autoplot() function plots numeric, ts and mts (multivariate time series) classes while the gghist() and ggnorm() functions are for plotting histogram and qq-plots respectively; and on the forecast package (Hyndman and Khandakar 2008): ggacf() and ggPacf() for the display of the auto-correlation and partial auto-correlations functions respectively.

    +

    Furthermore, inspired in the function checkresiduals() of the forecast package, we provide the check_residuals() function to test the model assumptions using the estimated residuals. The upgrade of our proposal is that, besides providing plots for visual diagnosis (setting the plot option as TRUE), it does check stationarity, seasonality (Subsection 3.2) and normality, presenting a report of the used tests and conclusions for assessing the model’s assumptions. An illustration of these functions is provided in Subsection 4.2, where we show the details of the functions and their utility for assumptions commonly checked in time series modeling.

    +

    3.2 Tests for stationarity

    +

    For checking stationarity, the nortsTest package uses unit root and seasonal unit root tests. These tests work similarly, checking whether a specific process follows a random walk model, which clearly is a non-stationary process.

    +

    Unit root tests

    +

    A linear stochastic process \(X\) that follows a random walk model is non stationary. Its AR polynomial is \(\phi(z) = 1 - z\), whose solution (root) is unique and equal to one. Thus, it is common to test the non stationarity of a linear process by checking whether its AR polynomial has a unit root (a root equal to one).

    +

    The most commonly used tests for unit root testing are Augmented Dickey-Fuller (Said and Dickey 1984), Phillips-Perron (Perron 1988), KPSS (Kwiatkowski et al. 1992) and Ljung-Box (Box and Pierce 1970). In particular, the Ljung-Box test contrasts the null auto-correlation hypothesis of identically distributed Gaussian random variables, which is equivalent to test stationarity. The uroot.test() and check_residuals() functions perform these tests, making use of the tseries package (Trapletti and Hornik 2019).

    +

    Seasonal unit root tests

    +

    Let \(X\) be a stationary process and \(m\) its period. Note that for observed data, \(m\) generally corresponds to the number of observations per unit of time. \(X\) follows a seasonal random walk if it can be written as +\[ +X_t = X_{t-m} + \epsilon_t, +\] +where \(\epsilon_t\) is a collection of i.i.d random variables. In a similar way, the process \(X\) is non-stationary if it follows a seasonal random walk. Or equivalently, \(X\) is non stationary if the seasonal AR(1) polynomial (\(\phi_m(z) = 1 - \phi z^m\)) has a unit root. The seasonal.test() and check_residuals() functions perform the OCSB test (Osborn et al. 1988) from the forecast package and the HEGY (Beaulieu and Miron 1993) and Ch (Canova and Hansen 1995) tests from the uroot package (López-de-Lacalle 2019).

    +

    3.3 Tests of Epps

    +

    The \(\chi^2\) test for normality proposed by Epps (1987) compares the empirical characteristic function of the one-dimensional marginal of the process with the one of a normally distributed random variable evaluated at certain points on the real line. Several authors, including Lobato and Velasco (2004), Psaradakis and Vávra (2017) and El Bouch et al. (2022), point out that the greatest challenge in the Epps’ test is its implementation procedure, which we address with the nortsTest package. Other existing tests based on the empirical characteristic function of the one-dimensional marginal of the process include Hong (1999) and the references therein. This test differs, however, in that it uses spectral analysis and derivatives.

    +

    Furthermore, Meintanis (2016) reviews on testing procedures based on the empirical characteristic function. There, it is commented about the random projection test (Nieto-Reyes et al. 2014, and here below) as a recent development of Epps’ test. In fact, in Nieto-Reyes et al. (2014) the consistency of Epps test is improved by taking at random the elements at which the characteristic function is evaluated. Additionally, El Bouch et al. (2022) proposes a sieve bootstrap modification of the Epps’ test. In addition to the classical asymptotic Epps’ test, we include these last two approaches here, and in the package, see the Example below and the paragraph before it. Let us provide now the foundation behind the Epps’ tests.

    +

    Let \(X\) be a stationary stochastic process that satisfies +\[\begin{equation} + \sum_{t=-\infty}^{\infty}|t|^k|\gamma(t)| <\infty \mbox{ for some } k >0. \tag{2} +\end{equation}\] +The null hypothesis is that the one-dimensional marginal distribution of \(X\) is a Gaussian process. The procedure for constructing the test consists of defining a function \(g\), estimating its inverse spectral matrix function, minimizing the generated quadratic function in terms of the unknown parameters of the random variable and, finally, obtaining the test statistic, which converges in distribution to a \(\chi^2.\)

    +

    Given \(N \in\mathbb{N}\) with \(N \geq 2,\) let +\[ +\Lambda :=\{\lambda:=(\lambda_1, \ldots, \lambda_N) \in \mathbb{R}^N: \lambda_i \leq \lambda_{i+1} \text{ and } \lambda_i > 0, \text{ for } i = 1,2,\ldots, N \}, +\] +and \(g:\mathbb{R}\times \Lambda \rightarrow \mathbb{R}^n\) be a measurable function, where +\[ +g(x,\lambda):= [\cos(\lambda_1x),\sin(\lambda_1x),\ldots,\cos(\lambda_Nx),\sin(\lambda_Nx)]. +\] +Additionally, let \(g_\theta:\Lambda \rightarrow \mathbb{R}^N\) be a function defined by +\[ +g_\theta(\lambda) := \left[\mbox{Re}(\Phi_\theta(\lambda_1)),\mbox{Im}(\Phi_\theta(\lambda_1)),\ldots,\mbox{Re}(\Phi_\theta(\lambda_N)),\mbox{Im}(\Phi_\theta(\lambda_N)) \right]^t, +\] +where the \(\mbox{Re}(\cdot)\) and \(\mbox{Im}(\cdot)\) are the real and imaginary components of a complex number and \(\Phi_\theta\) is the characteristic function of a normal random variable with parameters \(\theta := (\mu,\sigma^2)\in \Theta,\) an open bounded set contained in \(\mathbb{R}\times \mathbb{R}^+\). For any \(\lambda\in\Lambda,\) let us also denote +\[ +\widehat{g}(\lambda) := \dfrac{1}{n}\sum_{t=1}^n [\cos(\lambda_1 x_t),\sin(\lambda_1x_t),\ldots,\cos(\lambda_N x_t),\sin(\lambda_N x_t)]^t. +\] +Let \(f(v;\theta,\lambda)\) be the spectral density matrix of \(\{g(X_t,\lambda)\}_{t \in\mathbb{Z}}\) at a frequency \(v.\) +Then, for \(v = 0\), it can be estimated by +\[ +\widehat{f}(0;\theta,\lambda) := \dfrac{1}{2\pi n}\left(\sum_{t=1}^n \widehat{G}(x_{t,0},\lambda) +2\sum_{i=1}^{\lfloor n^{2/5}\rfloor}(1 -i/\lfloor n^{2/5} \rfloor)\sum_{t=1}^{n-i}\widehat{G}(x_{t,i},\lambda) \right), +\] +where \(\widehat{G}(x_{t,i},\lambda) = (\widehat{g}(\lambda) -g(x_{t},\lambda))(\widehat{g}(\lambda) -g(x_{t+i},\lambda))^t\) and \(\lfloor \cdot \rfloor\) denotes the floor function. 
The test statistic general form under \(H_0\) is +\[ +Q_n(\lambda) := \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +with +\[ +Q_n(\theta,\lambda):=(\widehat{g}(\lambda)-g_\theta(\lambda))^tG_n^+(\lambda)(\widehat{g}(\lambda)-g_\theta(\lambda)), +\] +where \(G^{+}_n\) is the generalized inverse of the spectral density matrix \(2 \pi \widehat{f}(0;\theta,\lambda)\). Let +\[ +\widehat{\theta} := \arg \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +be the argument that minimizes \(Q_n(\theta,\lambda)\) such that \(\widehat{\theta}\) is in a neighborhood of \(\widehat{\theta}_n := (\widehat{\mu},\widehat{\gamma}(0))\). To guarantee its’ existence and uniqueness, the following assumptions are required. We refer to them as assumption \((A.)\).

    +

    \((A.)\) Let \(\theta_0\) be the true value of \(\theta\) under \(H_0\), then for every \(\lambda \in \Lambda\) the following conditions are satisfied.

    +
      +
    • \(f(0;\theta,\lambda)\) is positive definite.

    • +
    • \(\Phi_\theta(\lambda)\) is twice differential with respect to \(\theta\) in a neighborhood of \(\theta_0\).

    • +
    • The matrix \(D(\theta_0,\lambda) = \dfrac{\partial \Phi_\theta(\lambda)}{\partial\theta |_{\theta = \theta_0}} \in \mathbb{R}^{N\times 2}\) has rank 2.

    • +
    • The set \(\Theta_0(\lambda) := \{ \theta \in \Theta: \Phi_\theta(\lambda_i) = \Phi_{\theta_0}(\lambda_i), i=1, \ldots,N\}\) is a finite bounded set in \(\Theta\), and \(\Theta\) is a bounded subset of \(\mathbb{R}\times \mathbb{R}^+\).

    • +
    • \(f(0;\theta,\lambda) = f(0;\theta_0,\lambda)\) and \(D(\theta_0,\lambda) = D(\theta,\lambda)\) for all \(\theta \in \Theta_0(\lambda)\).

    • +
    +

    Under these assumptions, the Epps’s main result is presented as follows.

    +
    Theorem 1 (Epps 1987, Theorem 2.1)
    +

    Let \(X\) be a stationary Gaussian process such that (2) and \((A.)\) are satisfied, then \(nQ_n(\lambda)\to_d \chi^2(2N - 2)\) for every \(\lambda \in \Lambda\).

    +

    The current nortsTest version, uses \(\Lambda := \{\verb|lambda|/\widehat{\gamma}(0)\}\) as the values to evaluate the empirical characteristic function, where \(\widehat{\gamma}(0)\) is the sample variance. By default lambda = c(1, 2). Therefore, the implemented test statistic converges to a \(\chi^2\) distribution with two degrees of freedom. The user can change these \(\Lambda\) values as desired by simply specifying the function’s lambda argument, as we show in the Example below.

    +
    Example 1
    +

    A stationary \(AR(2)\) process is drawn using a beta distribution with shape1 = 9 and shape2 = 1 parameters, and the test of Epps is performed using epps.test(). At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected.

    +
    +
    +
    set.seed(298)
    +x = arima.sim(250,model = list(ar =c(0.5,0.2)),
    +                 rand.gen = rbeta,shape1 = 9,shape2 = 1)
    +
    +# Asymptotic Epps test
    +epps.test(x)
    +
    +
    #> 
    +#>  Epps test
    +#> 
    +#> data:  x
    +#> epps = 22.576, df = 2, p-value = 1.252e-05
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    Asymptotic Epps test with random Lambda values as proposed in Nieto-Reyes et al. (2014).

    +
    +
    +
    set.seed(298)
    +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2)))
    +
    +
    #> 
    +#>  Epps test
    +#> 
    +#> data:  x
    +#> epps = 25.898, df = 2, p-value = 2.379e-06
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    Approximated sieve bootstrap Epps test using 1000 repetitions of 250 units.

    +
    +
    +
    set.seed(298)
    +epps_bootstrap.test(x, seed = 298)
    +
    +
    #> 
    +#>  Sieve-Bootstrap epps test
    +#> 
    +#> data:  y
    +#> bootstrap-epps = 22.576, p-value < 2.2e-16
    +#> alternative hypothesis: y does not follow a Gaussian Process
    +
    +

    3.4 Tests of Lobato and Velasco

    +

    Lobato and Velasco (2004) provides a consistent estimator for the corrected SK test statistic for stationary processes, see Lomnicki (1961) and Gasser (1975) for further insight. Note that the SK test is also known as the Jarque-Bera test (Jarque and Bera 1980), which is already available in several R packages (Trapletti and Hornik 2019, for instance). The improvement of this proposal over those implementations is a correction in the skewness and kurtosis estimates by the process’ auto-covariance function, resulting in a consistent test statistic under the assumption of correlated data. The test in Lobato and Velasco (2004) is asymptotic, which is computationally efficient, as opposed to a bootstrap based test. Psaradakis and Vávra (2020) show that the bootstrap modification of the Lobato and Velasco’s test is a fair competitor against the original asymptotic test, beating other tests for normality of the one-dimensional marginal distribution in terms of power. Thus, the package incorporates both the asymptotic, lobato.test() and its bootstrap version lobato_bootstrap.test().

    +

    The general framework for the test is presented in what follows. On the contrary to the test of Epps, this proposal does not require additional parameters for the computation of the test sample statistic.

    +

    Let \(X\) be a stationary stochastic process that satisfies

    +

    \[\begin{equation} +\sum_{t=0}^{\infty}|\gamma(t)| <\infty. \tag{3} +\end{equation}\]

    +

    The null hypothesis is that the one-dimensional marginal distribution of \(X\) is normally distributed, that is +\[ +H_0: X_t \sim N(\mu,\sigma^2) \text{ for all } t \in \mathbb{Z}. +\] +Let \(k_q(j_1,j_2,\ldots,j_{q-1})\) be the q-th order cumulant of \(X_{1},X_{1+j_1},\ldots,X_{1+j_{q-1}}\). \(H_0\) is fulfilled if all the marginal cumulants above the second order are zero. In practice, it is tested just for the third and fourth order marginal cumulants. Equivalently, in terms of moments, the marginal distribution is normal by testing whether \(\mu_3 = 0\) and \(\mu_4 = 3 \mu_2^2\). For non-correlated data, the SK test compares the SK statistic against upper critical values from a \(\chi^2(2)\) distribution (Bai and Ng 2005). For a Gaussian process \(X\) satisfying (3), it holds the limiting result +\[ +\sqrt{n} \binom{\widehat{\mu}_3}{\widehat{\mu}_4 -3\widehat{\mu}^2_2} \to_d N(0_2,\Sigma_F), +\] +where \(0_2 := (0,0)^t \in \mathbb{R}^2\) and \(\Sigma_F := \mbox{diag}(6F^{(3)}, \text{ } 24F^{(4)}) \in \mathbb{R}^{2\times 2}\) is a diagonal matrix with \(F^{(k)} := \sum_{j = -\infty}^{\infty}\gamma(j)^k\) for \(k=3,4\) (Gasser 1975).

    +

    The following consistent estimator in terms of the auto-covariance function is proposed in Lobato and Velasco (2004) +\[ +\widehat{F}^{(k)} := \sum_{t = 1-n}^{n-1}\widehat{\gamma}(t)[\widehat{\gamma}(t) +\widehat{\gamma}(n-|t|)]^{k-1}, +\] +to build a generalized SK test statistic +\[ +G := \dfrac{n \widehat{\mu}_3^2}{6 \widehat{F}^{(3)}} + \dfrac{n(\widehat{\mu}_4 -3\widehat{\mu}_2^2)^2}{24\widehat{F}^{(4)}}. +\] +Similar to the SK test for non-correlated data, the \(G\) statistic is compared against upper critical values from a \(\chi^2(2)\) distribution. This is seen in the below result that establishes the asymptotic properties of the test statistics, so that the general test procedure can be constructed. The result requires the following assumptions, denoted by \((B.),\) for the process \(X.\)

    +

    (B.)

    +
      +
    • \(E[X_t^{16}] < \infty\) for \(t \in T.\)

    • +
    • \(\sum_{j_1 = -\infty}^{\infty}\cdots \sum_{j_{q-1} = -\infty}^{\infty} |k_q(j_1,\ldots,j_{q-1})| < \infty \text{ for } q=2,3,\ldots,16.\)

    • +
    • \(\sum_{j=1}^{\infty}\left(E \left[\text{ } E[(X_0-\mu)^k|B_j] -\mu_k\right]^2 \right)^{1/2} < \infty \text{ for } k = 3,4,\) where \(B_j\) denotes the \(\sigma\)-field generated by \(X_t\), \(t \leq -j.\)

    • +
    • \(E\left[Z_k \right]^2 +2\sum_{j=1}^{\infty}E\left(\left[Z_k \right] \left[ (X_j -\mu)^k -\mu_k \right] \right) > 0\) for \(k = 3,4,\) with \(Z_k=(X_0 -\mu)^k -\mu_k.\)

    • +
    +

    Note that these assumptions imply that the higher-order spectral densities up to order 16 are continuous and bounded.

    +
    Theorem 2 (Lobato and Velasco 2004, Theorem 1)
    +

    Let \(X\) be a stationary process. If \(X\) is Gaussian and satisfies (3) then \(G \to_d \chi^2(2)\), and under assumption (B.), the test statistic G diverges whenever \(\mu_3 \neq 0\) or \(\mu_4 \neq 3\mu_2^2.\)

    +
    Example 2
    +

    A stationary \(MA(3)\) process is drawn using a gamma distribution with rate = 3 and shape = 6 parameters. The lobato.test() function performs the test of Lobato and Velasco to the simulated data. At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected.

    +
    +
    +
    set.seed(298)
    +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)),
    +                 rand.gen = rgamma, rate = 3, shape = 6)
    +# Asymptotic Lobato & Velasco
    +lobato.test(x)
    +
    +
    #> 
    +#>  Lobato and Velasco's test
    +#> 
    +#> data:  x
    +#> lobato = 65.969, df = 2, p-value = 4.731e-15
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    Approximated sieve bootstrap Lobato and Velasco test using 1000 repetitions of 250 units.

    +
    +
    +
    lobato_bootstrap.test(x, seed = 298)
    +
    +
    #> 
    +#>  Sieve-Bootstrap lobato test
    +#> 
    +#> data:  y
    +#> bootstrap-lobato = 65.969, p-value < 2.2e-16
    +#> alternative hypothesis: y does not follow a Gaussian Process
    +
    +

    3.5 The Random Projections test

    +

    The previous proposals only test for the normality of the one-dimensional marginal distribution of the process, which is inconsistent against alternatives whose one-dimensional marginal is Gaussian. Nieto-Reyes et al. (2014) provides a procedure to fully test normality of a stationary process using a Cramér-Wold type result (Cuesta-Albertos et al. 2007) that uses random projections to differentiate among distributions. In Nieto-Reyes et al. (2014) existing tests for the normality of the one dimensional marginal are applied to the random projections and the resulting p-values combined using the false discovery rate for dependent data (Benjamini and Yekutieli 2001). The nortsTest package improves on this test by allowing to use the less conservative false discovery rate in Benjamini and Hochberg (1995).

    +

    We show the Cramér-Wold type result below. The result works for separable Hilbert spaces, however here, for its later application, we restrict it to \(l^2,\) the space of square summable sequences over \(\mathbb{N},\) with inner product \(\langle \cdot,\cdot \rangle.\)

    +
    Theorem 3 (Cuesta-Albertos et al. 2007, Theorem 3.6)
    +

    Let \(\eta\) be a dissipative distribution on \(l^2\) and \(Z\) a \(l^2\)-valued random element, then \(Z\) is Gaussian if and only if +\[ +\eta\{h \in l^2: \langle Z,h \rangle \text{ has a Gaussian distribution}\} > 0. +\] +A dissipative distribution (Nieto-Reyes et al. 2014, Definition 2.1) is a generalization of the concept of absolutely continuous distribution to the infinite-dimensional space. A Dirichlet process (Gelman et al. 2013) produces random elements with a dissipative distribution in \(l^2\). In practice, generate draws of \(h \in l^2\) with a stick-breaking process that makes use of beta distributions.

    +

    Let \(X = \{X_t\}_{t\in\mathbb{Z}}\) be a stationary process. As \(X\) is normally distributed if the process \(X^{(t)} := \{X_k\}_{k \leq t}\) is Gaussian for each \(t\in\mathbb{Z},\) using the result above, Nieto-Reyes et al. (2014) provides a procedure for testing that \(X\) is a Gaussian process by testing whether the process \(Y^h = \{Y^h_t\}_{t \in \mathbb{Z}}\) is Gaussian. +\[\begin{equation} +Y^h_t := \sum_{i=0}^\infty h_i X_{t-i} = \langle X^{ (t) },h \rangle, \tag{4} +\end{equation}\] +where \(\langle X^{(t)},h \rangle\) is a real random variable for each \(t \in \mathbb{Z}\) and \(h\in l^2\). Thus, \(Y^h\) is a stationary process constructed by the projection of \(X^{(t)}\) on the space generated by \(h.\) Therefore, \(X\) is a Gaussian process if and only if the one dimensional marginal distribution of \(Y^{h}\) is normally distributed. Additionally, the hypothesis of the tests Lobato and Velasco or Epps, such as (2), (3), \((A)\) and \((B)\), imposed on \(X\) are inherited by \(Y^h\). Then, those tests can be applied to evaluate the normality of the one dimensional marginal distribution of \(Y^h\). Further considerations include the specific beta parameters used to construct the distribution from which to draw \(h\) and selecting a proper number of combinations to establish the number of projections required to improve the method performance. All of these details are discussed in Nieto-Reyes et al. (2014).

    +

    Next, we summarize the test of random projections in practice:

    +
      +
    1. Select \(k,\) which results in \(2k\) independent random projections (by default k = 1).

    2. Draw the \(2k\) random elements to project the process from a dissipative distribution that uses a particular beta distribution. By default, use a \(\beta(2,7)\) for the first \(k\) projections and a \(\beta(100,1)\) for the later \(k\).

    3. Apply the tests of Lobato and Velasco to the even projected processes and Epps to the odd projections.

    4. Combine the obtained \(2k\) p-values using the false discovery rate. By default, use Benjamini and Yekutieli (2001) procedure.
    +

    The rp.test() function implements the above procedure. The user might provide optional parameters such as the number of projections k, the parameters of the first beta distribution pars1 and those of the second pars2. The next example illustrates the application of the rp.test() to a stationary GARCH(1,1) process drawn using normal random variables.

    +
    Example 3
    +

    A stationary GARCH(1,1) process is drawn with a standard normal distribution and parameters \(\alpha_0 = 0,\) \(\alpha_1 = 0.2\) and \(\beta_1 = 0.3\) using the fGarch package (Wuertz et al. 2017). Note that a GARCH(1,1) process is stationary if the parameters \(\alpha_1\) and \(\beta_1\) satisfy the inequality \(\alpha_1 + \beta_1 < 1\) (Bollerslev 1986).

    +
    +
    +
    set.seed(3468)
    +library(fGarch)
    +spec = garchSpec(model = list(alpha = 0.2, beta = 0.3))
    +x = ts(garchSim(spec, n = 300))
    +rp.test(x) 
    +
    +
    #> 
    +#>  k random projections test.
    +#> 
    +#> data:  x
    +#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    At significance level \(\alpha = 0.05,\) the applied random projections test with k = 1 as the number of projections shows no evidence to reject the null hypothesis of normality.

    +

    3.6 The Psaradakis and Vavra’s test

    +

    Psaradakis and Vávra (2017) adapted a distance test for normality for a one-dimensional marginal distribution of a stationary process. Initially, the test was based on the Anderson and Darling (1952) test statistic and used an auto-regressive sieve bootstrap approximation to the null distribution of the sample test statistic. Later, Psaradakis and Vávra (2020) considered this test as the ultimate normality test based on the empirical distribution function, and adapted its methodology to a wide range of tests, including Shapiro-Wilk (Shapiro and Wilk 1965), Jarque-Bera (Jarque and Bera 1980), Cramer von Mises (Anderson 1962), Epps, and Lobato-Velasco. Their experiments show that the Lobato-Velasco and Jarque-Bera test’s bootstrap version performs best in small samples.

    +

    Although the test is said to be applicable to a wide class of non-stationary processes by transforming them into stationary by means of a fractional difference operator, no theoretic result was apparently provided to sustain this transformation. This work restricts the presentation of the original procedure to stationary processes.

    +

    Let \(X\) be a stationary process satisfying +\[\begin{equation} +X_t = \sum_{i=0}^{\infty}\theta_i \epsilon_{t-i} + \mu_0, \ t \in \mathbb{Z}, \tag{5} +\end{equation}\] +where \(\mu_0 \in \mathbb{R}\), \(\{\theta_i\}_{i=0}^\infty\in l^2\) with \(\theta_0 = 1\) and \(\{\epsilon_t\}_{i=0}^\infty\) is a collection of mean zero i.i.d random variables. The null hypothesis is that the one dimensional marginal distribution of \(X\) is normally distributed, +\[ +H_0: F(\mu_0 +\sqrt{\gamma(0)}x)-F_N(x) = 0, \text{ for all } x\in \mathbb{R}, +\] +where F is the cumulative distribution function of \(X_0\), and \(F_N\) denotes the standard normal cumulative distribution function. Note that if \(\epsilon_0\) is normally distributed, then the null hypothesis is satisfied. Conversely, if the null hypothesis is satisfied, then \(\epsilon_0\) is normally distributed and, consequently, \(X_0\).
    +The considered test for \(H_0\) is based on the Anderson-Darling distance statistic +\[\begin{equation} +A_d = \int_{-\infty}^{\infty}\dfrac{[{F_n}(\widehat{\mu}+\sqrt{\widehat{\gamma}(0)}x)-F_N(x)]^2}{F_N(x)[1-F_N(x)]}dF_N(x), \tag{6} +\end{equation}\] +where \({F_n}(\cdot)\) is the empirical distribution function associated to \(F\) based on a simple random sample of size \(n\). Psaradakis and Vávra (2017) proposes an auto-regressive sieve bootstrap procedure to approximate the sampling properties of \(A_d\) arguing that making use of classical asymptotic inference for \(A_d\) is problematic and involved. This scheme is motivated by the fact that under some assumptions for \(X,\) including (5), \(\epsilon_t\) admits the representation +\[\begin{equation} +\epsilon_t = \sum_{i=1}^{\infty}\phi_i(X_{t-i} - \mu_0), \ t \in \mathbb{Z}, \tag{7} +\end{equation}\] +for certain type of \(\{\phi_i\}_{i=1}^\infty\in l^2\). The main idea behind this approach is to generate a bootstrap sample \(\epsilon_t^*\) to approximate \(\epsilon_t\) with a finite-order auto-regressive model. This is because the distribution of the processes \(\epsilon_t\) and \(\epsilon_t^*\) coincide asymptotically if the order of the auto-regressive approximation grows simultaneously with \(n\) at an appropriate rate (Bühlmann 1997). The procedure makes use of the \(\epsilon_t^{*'}s\) to obtain the \(X_t^{*'}s\) through the bootstrap analog of (7). Then, generate a bootstrap sample of the \(A_d\) statistic, \(A_d^{*},\) making use of the bootstrap analog of (5).

    +

    The vavra.test() function implements Psaradakis and Vávra (2020) procedure. By default, it generates 1,000 sieve-bootstrap replications of the Anderson-Darling statistic. The user can provide different test procedures, such as the Shapiro-Wilk, Jarque-Bera, Cramer von Mises, Epps or Lobato-Velasco test, by specifying a text value to the normality argument. The presented values are Monte Carlo estimates of the \(A_d\) statistic and p.value.

    +
    Example 4
    +

    A stationary \(ARMA\)(1,1) process is simulated using a standard normal distribution, and the Psaradakis and Vávra procedure is performed using the Anderson-Darling and Cramer von Mises test statistics. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality.

    +
    +
    +
    set.seed(298)
    +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34))
    +# Default, Psaradakis and Vavra's procedure
    +vavra.test(x, seed = 298)
    +
    +
    #> 
    +#>  Psaradakis-Vavra test
    +#> 
    +#> data:  x
    +#> bootstrap-ad = 0.48093, p-value = 0.274
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    Approximate Cramer von Mises test for the Psaradakis and Vavra’s procedure

    +
    +
    +
    vavra.test(x, normality = "cvm", seed = 298)
    +
    +
    #> 
    +#>  Sieve-Bootstrap cvm test
    +#> 
    +#> data:  x
    +#> bootstrap-cvm = 0.056895, p-value = 0.49
    +#> alternative hypothesis: x does not follow a Gaussian Process
    +
    +

    3.7 The multivariate kurtosis test

    +

    The literature contains some procedures to test the null hypothesis that a multivariate stochastic process is Gaussian. Those include Moulines et al. (1992), a test based on the characteristic function, and Steinberg and Zeitouni (1992), a test based on properties of the entropy of Gaussian processes that does not make use of cumulant computations. According to El Bouch et al. (2022), these tests may hardly be executable in real time. Consequently, they propose a test based on multivariate kurtosis (Mardia 1970). The proposed procedure is for \(p=1,2,\) and we elaborate on it in what follows. In Section 6.3 of El Bouch et al. (2022), they suggest to apply random projections for higher dimensions but they do not investigate the procedure any further.

    +

    The p-value of this test is obtained as \(2(1-F_N(z))\) where, as above, \(F_N\) denotes the standard normal cumulative distribution function. There, +\[ + z:=(\hat{B}_p-E[\hat{B}_p])/\sqrt{E[(\hat{B}_p-E[\hat{B}_p])^2]}, +\] +where +\[ + \hat{B}_p:=n^{-1}\sum_{t=1}^n(x_t^t \hat{S}^{-1}x_t)^2, +\] +and +\[ +\hat{S}:=n^{-1}\sum_{t=1}^n x_t x_t^t. +\] +In El Bouch et al. (2022), the reader can find the exact computations of \(E[\hat{B}_p]\) and \(E[(\hat{B}_p-E[\hat{B}_p])^2].\)

    +

    This test is implemented in the elbouch.test() function. By default, the function computes the univariate El Bouch test. If the user provides a secondary data set, the function computes the bivariate counterpart.

    +
    Example 5
    +

    Simulate a two-dimensional stationary VAR(2) process using independent AR(1) and AR(2) processes with standard normal distributions and apply the bivariate El Bouch test. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality.

    +
    +
    +
    set.seed(23890)
    +x = arima.sim(250,model = list(ar = 0.2))
    +y = arima.sim(250,model = list(ar = c(0.4,0,.1)))
    +elbouch.test(y = y,x = x)
    +
    +
    #> 
    +#>  El Bouch, Michel & Comon's test
    +#> 
    +#> data:  w = (y, x)
    +#> Z = 0.92978, p-value = 0.1762
    +#> alternative hypothesis: w = (y, x) does not follow a Gaussian Process
    +
    +

    4 Simulations and data analysis

    +

    4.1 Numerical experiments

    +

    Inspired by the simulation studies in Psaradakis and Vávra (2017) and Nieto-Reyes et al. (2014), we propose here a procedure that involves drawing data from the \(AR(1)\) process +\[\begin{equation} +X_t = \phi X_{t-1} + \epsilon_t, \ t \in\mathbb{Z}, \text{ for } \phi \in \{ 0,\pm 0.25,\pm 0.4\}, \tag{8} +\end{equation}\] +where the \(\{\epsilon_t\}_{t\in\mathbb{Z}}\) are i.i.d random variables. For the distribution of the \(\epsilon_t\) we consider different scenarios: standard normal (\(N\)), standard log-normal (\(\log N\)), Student t with 3 degrees of freedom (\(t_3\)), chi-squared with 10 degrees of freedom (\(\chi^2(10)\)) and gamma with \((7, 1)\) shape and scale parameters (\(\Gamma(7,1)\)).

    +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Table 1: Part 1. Rejection rate estimates over \(m=1,000\) trials of the seven studied goodness of fit tests for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of \(\phi\) and \(n\) displayed in the columns and different distributions for \(\epsilon_t\) in the rows. \(\phi \in \{0, \pm 0.25, \pm 0.4\}\) and \(n \in \{100, 250\}\). For each test and distribution, max.phi represents the maximum running time in seconds to compute the rejection rate among the different values of the AR parameter. +
    + +
    +n = 100 +
    +
    +
    +n = 250 +
    +
    +phi + +-0.4 + +-0.25 + +0.0 + +0.25 + +0.4 + +max.phi + +-0.4 + +-0.25 + +0.0 + +0.25 + +0.4 + +max.phi +
    +Lobato and Velasco +
    +N + +0.041 + +0.044 + +0.047 + +0.032 + +0.035 + +0.769 + +0.059 + +0.037 + +0.054 + +0.040 + +0.037 + +0.646 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.610 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.653 +
    +t3 + +0.797 + +0.853 + +0.902 + +0.875 + +0.829 + +0.627 + +0.990 + +0.994 + +0.998 + +0.999 + +0.983 + +0.674 +
    +chisq10 + +0.494 + +0.698 + +0.770 + +0.707 + +0.610 + +0.620 + +0.930 + +0.995 + +0.998 + +0.997 + +0.977 + +0.657 +
    +Gamma(7,1) + +0.995 + +1.000 + +0.999 + +0.996 + +0.988 + +0.634 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.665 +
    +Epps +
    +N + +0.056 + +0.051 + +0.062 + +0.060 + +0.063 + +0.695 + +0.048 + +0.058 + +0.053 + +0.066 + +0.063 + +0.736 +
    +logN + +0.908 + +0.917 + +0.972 + +0.985 + +0.984 + +0.729 + +1.000 + +1.000 + +1.000 + +0.999 + +1.000 + +0.777 +
    +t3 + +0.243 + +0.291 + +0.370 + +0.317 + +0.248 + +0.722 + +0.776 + +0.872 + +0.908 + +0.881 + +0.780 + +0.769 +
    +chisq10 + +0.267 + +0.440 + +0.548 + +0.469 + +0.360 + +0.699 + +0.611 + +0.850 + +0.930 + +0.866 + +0.721 + +0.739 +
    +Gamma(7,1) + +0.866 + +0.961 + +0.996 + +0.993 + +0.965 + +0.722 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.782 +
    +Random Projections +
    +N + +0.051 + +0.042 + +0.045 + +0.039 + +0.050 + +1.301 + +0.045 + +0.033 + +0.046 + +0.038 + +0.050 + +1.905 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.330 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.906 +
    +t3 + +0.790 + +0.863 + +0.879 + +0.823 + +0.727 + +1.320 + +0.982 + +0.994 + +0.995 + +0.991 + +0.975 + +1.949 +
    +chisq10 + +0.589 + +0.730 + +0.757 + +0.640 + +0.542 + +1.295 + +0.957 + +0.994 + +0.994 + +0.969 + +0.888 + +1.926 +
    +Gamma(7,1) + +0.998 + +1.000 + +1.000 + +0.998 + +0.989 + +1.308 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.963 +
    +Psaradakis and Vavra +
    +N + +0.052 + +0.048 + +0.051 + +0.058 + +0.050 + +17.905 + +0.061 + +0.046 + +0.038 + +0.051 + +0.045 + +22.115 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +17.149 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +21.841 +
    +t3 + +0.700 + +0.799 + +0.851 + +0.780 + +0.695 + +17.503 + +0.960 + +0.979 + +0.991 + +0.977 + +0.960 + +22.183 +
    +chisq10 + +0.498 + +0.673 + +0.804 + +0.689 + +0.550 + +18.029 + +0.902 + +0.983 + +0.997 + +0.988 + +0.933 + +22.197 +
    +Gamma(7,1) + +0.989 + +1.000 + +1.000 + +1.000 + +0.998 + +18.467 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +22.292 +
    +Bootstrap Lobato +
    +N + +0.057 + +0.052 + +0.047 + +0.059 + +0.052 + +37.141 + +0.035 + +0.049 + +0.048 + +0.058 + +0.049 + +40.532 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +32.509 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +40.793 +
    +t3 + +0.797 + +0.867 + +0.899 + +0.869 + +0.809 + +32.755 + +0.989 + +0.994 + +0.996 + +0.996 + +0.989 + +41.158 +
    +chisq10 + +0.567 + +0.729 + +0.801 + +0.745 + +0.649 + +32.242 + +0.942 + +0.990 + +1.000 + +0.994 + +0.963 + +40.950 +
    +Gamma(7,1) + +0.999 + +1.000 + +1.000 + +0.998 + +0.991 + +31.763 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +41.277 +
    +Bootstrap Epps +
    +N + +0.047 + +0.053 + +0.048 + +0.052 + +0.044 + +57.749 + +0.058 + +0.052 + +0.053 + +0.048 + +0.043 + +65.367 +
    +logN + +0.846 + +0.877 + +0.963 + +0.974 + +0.959 + +56.756 + +1.000 + +1.000 + +1.000 + +1.000 + +0.999 + +65.968 +
    +t3 + +0.183 + +0.238 + +0.313 + +0.230 + +0.196 + +57.350 + +0.752 + +0.863 + +0.913 + +0.841 + +0.754 + +65.699 +
    +chisq10 + +0.252 + +0.364 + +0.527 + +0.450 + +0.358 + +56.627 + +0.596 + +0.813 + +0.913 + +0.854 + +0.685 + +65.369 +
    +Gamma(7,1) + +0.816 + +0.948 + +0.993 + +0.979 + +0.931 + +56.986 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +65.315 +
    +El Bouch +
    +N + +0.040 + +0.047 + +0.044 + +0.033 + +0.050 + +0.798 + +0.040 + +0.054 + +0.052 + +0.061 + +0.059 + +1.020 +
    +logN + +0.990 + +0.998 + +0.998 + +0.995 + +0.980 + +0.805 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.025 +
    +t3 + +0.833 + +0.883 + +0.928 + +0.886 + +0.846 + +0.824 + +0.996 + +0.999 + +0.998 + +0.998 + +0.991 + +1.044 +
    +chisq10 + +0.041 + +0.152 + +0.281 + +0.155 + +0.046 + +0.812 + +0.062 + +0.386 + +0.597 + +0.388 + +0.065 + +1.031 +
    +Gamma(7,1) + +0.833 + +0.905 + +0.929 + +0.898 + +0.818 + +0.818 + +0.993 + +0.998 + +0.999 + +0.995 + +0.989 + +1.042 +
    +
    +

    As in Psaradakis and Vávra (2017), \(m=1,000\) independent draws of the above process are generated for each pair of parameter \(\phi\) and distribution. Each draw is taken of length \(past+n,\) with \(past=500\) and \(n \in \{100,250,500,1000 \}\). The first 500 data points of each realization are then discarded in order to eliminate start-up effects. The \(n\) remaining data points are used to compute the value of the test statistic of interest. In each particular scenario, the rejection rate is obtained by computing the proportion of times that the test is rejected among the \(m\) trials.

    +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Table 2: Part 2. Rejection rate estimates over \(m=1,000\) trials of the seven studied goodness of fit tests for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of \(\phi\) and \(n\) displayed in the columns and different distributions for \(\epsilon_t\) in the rows. \(\phi \in \{0, \pm 0.25, \pm 0.4\}\) and \(n \in \{500, 1000\}\). For each test and distribution, max.phi represents the maximum running time in seconds to compute the rejection rate among the different values of the AR parameter. +
    + +
    +n = 500 +
    +
    +
    +n = 1,000 +
    +
    +phi + +-0.4 + +-0.25 + +0.0 + +0.25 + +0.4 + +max.phi + +-0.4 + +-0.25 + +0.0 + +0.25 + +0.4 + +max.phi +
    +Lobato and Velasco +
    +N + +0.041 + +0.035 + +0.052 + +0.035 + +0.049 + +0.729 + +0.048 + +0.050 + +0.040 + +0.062 + +0.040 + +1.065 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.743 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.076 +
    +t3 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.844 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.116 +
    +chisq10 + +0.999 + +1.000 + +1.000 + +1.000 + +1.000 + +0.824 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.082 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.825 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.105 +
    +Epps +
    +N + +0.048 + +0.046 + +0.056 + +0.065 + +0.050 + +0.905 + +0.034 + +0.038 + +0.046 + +0.033 + +0.059 + +1.182 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.931 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.294 +
    +t3 + +0.991 + +0.994 + +0.996 + +0.997 + +0.985 + +0.936 + +1.000 + +0.998 + +1.000 + +1.000 + +0.999 + +1.235 +
    +chisq10 + +0.924 + +0.991 + +0.999 + +0.991 + +0.969 + +0.917 + +0.997 + +1.000 + +1.000 + +1.000 + +1.000 + +1.202 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +0.873 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.239 +
    +Random Projections +
    +N + +0.044 + +0.043 + +0.040 + +0.040 + +0.048 + +2.723 + +0.021 + +0.027 + +0.043 + +0.043 + +0.047 + +4.544 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.759 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +4.588 +
    +t3 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.755 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +4.531 +
    +chisq10 + +1.000 + +1.000 + +1.000 + +1.000 + +0.998 + +2.782 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +4.520 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.843 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +4.527 +
    +Psaradakis and Vavra +
    +N + +0.048 + +0.050 + +0.045 + +0.053 + +0.039 + +26.957 + +0.055 + +0.045 + +0.047 + +0.043 + +0.033 + +37.993 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +27.209 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +37.282 +
    +t3 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +26.599 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +37.642 +
    +chisq10 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +27.418 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +37.731 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +27.659 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +38.232 +
    +Bootstrap Lobato +
    +N + +0.055 + +0.048 + +0.053 + +0.037 + +0.035 + +53.110 + +0.050 + +0.046 + +0.067 + +0.049 + +0.047 + +72.528 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +52.632 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +71.845 +
    +t3 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +52.763 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +71.454 +
    +chisq10 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +52.455 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +73.413 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +53.204 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +72.253 +
    +Bootstrap Epps +
    +N + +0.051 + +0.043 + +0.033 + +0.043 + +0.051 + +78.920 + +0.055 + +0.054 + +0.056 + +0.044 + +0.064 + +101.883 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +78.194 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +101.753 +
    +t3 + +0.979 + +0.995 + +0.998 + +0.996 + +0.985 + +79.735 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +100.766 +
    +chisq10 + +0.911 + +0.986 + +0.996 + +0.995 + +0.945 + +80.841 + +0.997 + +1.000 + +1.000 + +1.000 + +0.998 + +101.250 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +78.688 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +101.360 +
    +El Bouch +
    +N + +0.065 + +0.053 + +0.047 + +0.061 + +0.059 + +1.419 + +0.055 + +0.064 + +0.051 + +0.048 + +0.045 + +2.467 +
    +logN + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.435 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.500 +
    +t3 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.453 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.492 +
    +chisq10 + +0.100 + +0.609 + +0.871 + +0.609 + +0.076 + +1.439 + +0.176 + +0.858 + +0.984 + +0.865 + +0.173 + +2.470 +
    +Gamma(7,1) + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +1.444 + +1.000 + +1.000 + +1.000 + +1.000 + +1.000 + +2.483 +
    +
    +

    Tables 1 and 2 present the rejection rate estimates. For every process of length \(n,\) the columns represent the used \(AR(1)\) parameter and the rows the distribution used to draw the process. The obtained results are consistent with those obtained in the publications where the different tests were proposed. As expected, rejection rates are around 0.05 when the data is drawn from a standard normal distribution, as in this case the data is drawn from a Gaussian process. Conversely, high rejection rates are registered for the other distributions. Low rejection rates are observed, however, for the \(\chi^2(10)\) distribution when making use of some of the tests. For instance, the Epps and bootstrap Epps tests, although they consistently tend to 1 when the length of the process, \(n,\) increases. Another case is the El Bouch test. However, this one maintains low rates for large values of \(|\phi|\) when \(n\) increases. Furthermore, for the random projections test, the number of projections used in this study is the default \(k = 1,\) which is by far a lower number than the recommended by Nieto-Reyes et al. (2014). However, even in these conditions, the obtained results are satisfactory, with the random projection test having even better performance than the tests of Epps (1987) or Psaradakis and Vávra (2017).

    +

    An important aspect in selecting a procedure is its computation time. Thus, for each length of the process, \(n,\) there is an additional column, max.phi, in Tables 1 and 2. Each entry in this column refers to a different distribution and contains the maximum running time in seconds to obtain the rejection rate among the different values of the AR parameter. That is, for a fixed distribution, the rejection rates are computed for each of the five possibilities of \(\phi\) and the time that it takes is recorded. The running time in the table is the largest among the five. Furthermore, in Table 3 we show the time in seconds that each studied test takes to check whether a given process is Gaussian. In particular, the table contains the average running time over 1,000 trials that it takes to generate and check a Gaussian AR(1) process with parameter \(\phi = 0.5\). This is done for different sample sizes, \(n \in \{1000, 2000, 3000, 4000, 5000\}.\) According to the table, the asymptotic tests (Lobato and Velasco, Epps, random projections and El Bouch) have similar running times. On the contrary, the bootstrap based tests (Psaradakis and Vavra, Bootstrap Epps and Lobato and Velasco) have, as expected, higher running times on average. Furthermore, Tables 1 and 2 show similar results in time performance. There, the maximum running time of the bootstrap based tests exceeds by more than ten seconds the time obtained with the asymptotic based tests. It is worth saying that the tables have been obtained with R version 4.3.1 (2023-06-16) and platform aarch64-apple-darwin20 (64-bit), running under macOS Sonoma 14.2.1.

    +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +Table 3: Average running time in seconds, over 1000 iterations, to test the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, \(n=1000\) (second column), \(n=2000\) (third column), \(n=3000\) (fourth column), \(n=4000\) (fifth column) and \(n=5000\) (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter \(\phi = 0.5\). +
    +tests + +n = 1000 + +n = 2000 + +n = 3000 + +n = 4000 + +n = 5000 +
    +Lobato and Velasco + +0.0010 + +0.0014 + +0.0020 + +0.0026 + +0.0035 +
    +Epps + +0.0010 + +0.0015 + +0.0021 + +0.0027 + +0.0035 +
    +Random Projections + +0.0026 + +0.0045 + +0.0063 + +0.0082 + +0.0105 +
    +El Bouch + +0.0023 + +0.0046 + +0.0074 + +0.0109 + +0.0152 +
    +Psaradakis and Vavra + +0.0286 + +0.0429 + +0.0565 + +0.0012 + +0.0014 +
    +Bootstrap Lobato + +0.0542 + +0.0014 + +0.0019 + +0.0025 + +0.0032 +
    +Bootstrap Epps + +0.0013 + +0.0018 + +0.0023 + +0.0029 + +0.0037 +
    +
    +

    4.2 Real data application

    +

    As an illustrative example, we analyze the monthly mean carbon dioxide, in parts per million (ppm), measured at the Mauna Loa Observatory, in Hawaii, from March 1958 to November 2018. The carbon dioxide data measured as the mole fraction in dry air on Mauna Loa constitute the longest record of direct measurements of \(CO2\) in the atmosphere. This dataset is available in the astsa package (Stoffer 2020) under the name cardox data and it is displayed in the left panel of Figure 1. The plot’s grid is created using the cowplot package (Wilke 2020).

    +

    The objective of this subsection is to propose a model to analyze this time series and check the assumptions on the residuals of the model using our implemented check_residuals() function. The time series clearly has trend and seasonal components (see left panel of Figure 1); therefore, an adequate model that filters both components has to be selected. We make use of an ETS model. For its implementation, we make use of the ets() function from the forecast package (Hyndman and Khandakar 2008). This function fits 32 different ETS models and selects the best model according to information criteria such as Akaike’s information criterion (AIC) or the Bayesian information criterion (BIC) (Chen and Chen 2008). +The results provided by the ets() function are:

    +
    + +
    +
    +
    +
    library(astsa)
    +
    +autoplot(cardox, main = "Carbon Dioxide levels at Mauna Loa", 
    +         xlab = "years", ylab = "CO2 (ppm)")
    +
    +
    +CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. +

    +Figure 1: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. +

    +
    +
    +
    +
    +
    library(forecast)
    +library(astsa)
    +model = ets(cardox)
    +summary(model)
    +
    +
    #> ETS(M,A,A) 
    +#> 
    +#> Call:
    +#> ets(y = cardox)
    +#> 
    +#>   Smoothing parameters:
    +#>     alpha = 0.5451 
    +#>     beta  = 0.0073 
    +#>     gamma = 0.1076 
    +#> 
    +#>   Initial states:
    +#>     l = 314.4546 
    +#>     b = 0.0801 
    +#>     s = 0.6986 0.0648 -0.8273 -1.8999 -3.0527 -2.7629
    +#>            -1.2769 0.7015 2.1824 2.6754 2.3317 1.165
    +#> 
    +#>   sigma:  9e-04
    +#> 
    +#>      AIC     AICc      BIC 
    +#> 3429.637 3430.439 3508.867 
    +#> 
    +#> Training set error measures:
    +#>                    ME      RMSE       MAE         MPE       MAPE
    +#> Training set 0.018748 0.3158258 0.2476335 0.005051657 0.06933903
    +#>                  MASE       ACF1
    +#> Training set 0.152935 0.09308391
    +
    +

    The resulting model, proposed by the ets() function, for analyzing the carbon dioxide data in Mauna Loa is an \(ETS[M,A,A]\) model. The parameters \(\alpha, \beta \text{ and } \gamma\) (see Definition 1) have been estimated using the least squares method. If the assumptions on the model are satisfied, then the errors of the model behave like a Gaussian stationary process. To check it, we make use of the function check_residuals(). For more details on the compatibility of this function with the models obtained by other packages see the nortsTest repository. In the following, we display the results of using the Augmented Dickey-Fuller test (Subsection 3.1) to check the stationarity assumption and the random projection test with k = 1 projections to check the normality assumption. For the other test options see the function’s documentation.

    +
    +
    +
    check_residuals(model,unit_root = "adf",normality = "rp",
    +                   plot = TRUE)
    +
    +
    +
    +
    #> 
    +#>  *************************************************** 
    +#> 
    +#>  Unit root test for stationarity: 
    +#> 
    +#>  Augmented Dickey-Fuller Test
    +#> 
    +#> data:  y
    +#> Dickey-Fuller = -9.8935, Lag order = 9, p-value = 0.01
    +#> alternative hypothesis: stationary
    +#> 
    +#> 
    +#>  Conclusion: y is stationary
    +#>  *************************************************** 
    +#> 
    +#>  Goodness of fit test for Gaussian Distribution: 
    +#> 
    +#>  k random projections test.
    +#> 
    +#> data:  y
    +#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1
    +#> alternative hypothesis: y does not follow a Gaussian Process
    +#> 
    +#> 
    +#>  Conclusion: y follows a Gaussian Process
    +#>  
    +#>  ***************************************************
    +
    +

    The obtained results indicate that the null hypothesis of non stationarity is rejected at significance level \(\alpha = 0.01.\) Additionally, there is no evidence to reject the null hypothesis of normality at significance level \(\alpha = 0.05.\) Consequently, we conclude that the residuals follow a stationary Gaussian process, having that the resulting \(ETS[M,A,A]\) model adjusts well to the carbon dioxide data in Mauna Loa.

    +

    In the above displayed check_residuals() function, the plot argument is set to TRUE. The resulting plots are shown in Figure 2. The plot in the top panel and the auto-correlation plots in the bottom panels insinuate that the residuals have a stationary behavior. The top panel plot shows slight oscillations around zero and the auto-correlations functions in the bottom panels have values close to zero in every lag. The histogram and qq-plot in the middle panels suggest that the marginal distribution of the residuals is normally distributed. Therefore, Figure 2 agrees with the reported results, indicating that the assumptions of the model are satisfied.

    +
    +
    +Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity. +

    +Figure 2: Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity. +

    +
    +
    +
    + +
    +

    As the assumptions of the model have been checked, it can be used for instance to forecast. The result of applying the following function is displayed in Figure 3. It presents the carbon dioxide data for the last 8 years and a forecast of the next 12 months. It is observable from the plot that the model captures the process trend and periodicity.

    +
    +
    +
    autoplot(forecast(model,h = 12),include = 100,
    +         xlab = "years",ylab = "CO2 (ppm)",
    +         main = "Forecast: Carbon Dioxide Levels at Mauna Loa")
    +
    +
    +Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour. +

    +Figure 3: Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model’s predictions capture the time-series behaviour. +

    +
    +
    +
    + +
    +

    5 Conclusions

    +

    For independent data, the nortest package (Gross and Ligges 2015) provides five different tests for normality, the mvnormtest package (Jarek 2012) performs the Shapiro-Wilks test for multivariate data and the MissMech package (Jamshidian et al. 2014) provides tests for normality in multivariate incomplete data. To test the normality of dependent data, some authors such as Psaradakis and Vávra (2017) and Nieto-Reyes et al. (2014) have available undocumented Matlab code, which is almost only helpful in re-doing their simulation studies.

    +

    To our knowledge, no consistent implementation or package of tests for normality of stationary processes has been done before. Therefore, nortsTest is the first package to implement normality tests for stationary processes. This work gives a general overview of a careful selection of tests for normality of stationary processes, covering the main types of tests available. It additionally provides examples that illustrate each of the test implementations.

    +

    For checking the model’s assumptions, the forecast and astsa packages contain functions for visual diagnostic. Following the same idea, nortsTest provides similar diagnostic methods; it also reports the results of testing stationarity and normality, the main assumptions for the residuals in time series analysis.

    +

    6 Future work and projects

    +

    A further version of the nortsTest package will incorporate additional tests such as Bispectral (Hinich 1982) and Stein’s characterization (Bontemps and Meddahi 2005). Further future work will include a Bayesian version of a residuals check procedure that uses the random projection method. Any future version under development can be installed from GitHub using the following code.

    +
    +
    +
    if (!requireNamespace("remotes")) install.packages("remotes")
    +remotes::install_github("asael697/nortsTest",dependencies = TRUE)
    +
    +
    +

    Acknowledgment

    +

    This work was supported by grant PID2022-139237NB-I00 funded by “ERDF A way of making Europe” and MCIN/AEI/10.13039/501100011033.

    +
    +

    6.1 CRAN packages used

    +

    nortsTest, forecast, aTSA, nortest, mvnTest, ggplot2, tseries, uroot, fGarch, astsa, cowplot, mvnormtest, MissMech

    +

    6.2 CRAN Task Views implied by cited packages

    +

    ChemPhys, Econometrics, Environmetrics, Finance, MissingData, Phylogenetics, Spatial, TeachingStatistics, TimeSeries

    +
    +
    +T. W. Anderson. On the distribution of the two-sample Cramer-von Mises criterion. The Annals of Mathematical Statistics, 33(3): 1148–1159, 1962. URL https://doi.org/10.1214/aoms/1177704477. +
    +
    +T. W. Anderson and D. A. Darling. Asymptotic theory of certain goodness of fit criteria based on stochastic processes. Annals of Mathematical Statistics, 23(2): 193–212, 1952. DOI 10.1214/aoms/1177729437. +
    +
    +J. Bai and S. Ng. Tests for skewness, kurtosis, and normality for time series data. Journal of Business & Economic Statistics, 23(1): 49–60, 2005. DOI 10.1198/073500104000000271. +
    +
    +J. Beaulieu and J. A. Miron. Seasonal unit roots in aggregate U.S. data. Journal of Econometrics, 55(1): 305–328, 1993. DOI 10.1016/0304-4076(93)90018-Z. +
    +
    +Y. Benjamini and Y. Hochberg. Controlling the false discovery rate: A practical and powerful approach to multiple testing. Journal of the Royal Statistical Society. Series B (Methodological), 57(1): 289–300, 1995. URL http://www.jstor.org/stable/2346101. +
    +
    +Y. Benjamini and D. Yekutieli. The control of the false discovery rate in multiple testing under dependency. The Annals of Statistics, 29(4): 1165–1188, 2001. URL http://www.jstor.org/stable/2674075. +
    +
    +A. Berg, E. Paparoditis and D. N. Politis. A bootstrap test for time series linearity. Journal of Statistical Planning and Inference, 140(12): 3841–3857, 2010. DOI 10.1016/j.jspi.2010.04.047. +
    +
    +T. Bollerslev. Generalized autoregressive conditional heteroskedasticity. Journal of Econometrics, 31(3): 307–327, 1986. DOI 10.1016/0304-4076(86)90063-1. +
    +
    +C. Bontemps and N. Meddahi. Testing normality: A GMM approach. Journal of Econometrics, 124(1): 149–186, 2005. DOI 10.1016/j.jeconom.2004.02.014. +
    +
    +G. E. P. Box and G. Jenkins. Time series analysis, forecasting and control. USA: Holden-Day, Inc., 1990. URL https://www.wiley.com/en-us/Time+Series+Analysis. +
    +
    +G. E. P. Box and D. A. Pierce. Distribution of residual autocorrelations in autoregressive-integrated moving average time series models. Journal of the American Statistical Association, 65(332): 1509–1526, 1970. DOI 10.1080/01621459.1970.10481180. +
    +
    +P. Bühlmann. Sieve bootstrap for time series. Bernoulli, 3(2): 123–148, 1997. URL http://www.jstor.org/stable/3318584. +
    +
    +F. Canova and B. E. Hansen. Are seasonal patterns constant over time? A test for seasonal stability. Journal of Business & Economic Statistics, 13(3): 237–252, 1995. DOI 10.1080/07350015.1995.10524598. +
    +
    +J. Chen and Z. Chen. Extended bayesian information criteria for model selection with large model spaces. Biometrika, 95(3): 759–771, 2008. DOI 10.1093/biomet/asn034. +
    +
    +J. A. Cuesta-Albertos, E. del Barrio, R. Fraiman and C. Matrán. The random projection method in goodness of fit for functional data. Computational Statistics & Data Analysis, 51(10): 4814–4831, 2007. DOI 10.1016/j.csda.2006.09.007. +
    +
    +R. B. D’Agostino and M. A. Stephens. Goodness-of-fit techniques. Quality and Reliability Engineering International, 3(1): 71–71, 1986. DOI 10.1002/qre.4680030121. +
    +
    +G. E. Dallal and L. Wilkinson. An analytic approximation to the distribution of lilliefors’s test statistic for normality. The American Statistician, 40(4): 294–296, 1986. URL https://www.tandfonline.com/doi/abs/10.1080/00031305.1986.10475419. +
    +
    +J. A. Doornik and H. Hansen. An omnibus test for univariate and multivariate normality. Oxford Bulletin of Economics and Statistics, 70(s1): 927–939, 2008. URL https://ideas.repec.org/a/bla/obuest/v70y2008is1p927-939.html. +
    +
    +S. El Bouch, O. Michel and P. Comon. A normality test for multivariate dependent samples. Signal Processing, 201: 108705, 2022. DOI 10.1016/j.sigpro.2022.108705. +
    +
    +R. F. Engle. Autoregressive conditional heteroscedasticity with estimates of the variance of united kingdom inflation. Econometrica, 50(4): 987–1007, 1982. URL http://www.jstor.org/stable/1912773. +
    +
    +T. W. Epps. Testing that a stationary time series is Gaussian. The Annals of Statistics, 15(4): 1683–1698, 1987. DOI 10.1214/aos/1176350618. +
    +
    +T. Gasser. Goodness-of-fit tests for correlated data. Biometrika, 62(3): 563–570, 1975. URL http://www.jstor.org/stable/2335511. +
    +
    +A. Gelman, J. B. Carlin, H. S. Stern, D. B. Dunson, A. Vehtari and D. B. Rubin. Bayesian data analysis, third edition. Taylor & Francis, 2013. URL https://books.google.nl/books?id=ZXL6AQAAQBAJ. +
    +
    +J. Gross and U. Ligges. ‘Nortest‘: Tests for normality. 2015. URL https://CRAN.R-project.org/package=nortest. ‘R‘ package version 1.0-4. +
    +
    +N. Henze and B. Zirkler. A class of invariant consistent tests for multivariate normality. Communications in Statistics - Theory and Methods, 19(10): 3595–3617, 1990. URL https://doi.org/10.1080/03610929008830400. +
    +
    +M. J. Hinich. Testing for Gaussianity and linearity of a stationary time series. Journal of Time Series Analysis, 3(3): 169–176, 1982. DOI 10.1111/j.1467-9892.1982.tb00339.x. +
    +
    +C. C. Holt. Forecasting seasonals and trends by exponentially weighted moving averages. International Journal of Forecasting, 20(1): 5–10, 2004. DOI 10.1016/j.ijforecast.2003.09.015. +
    +
    +Y. Hong. Hypothesis testing in time series via the empirical characteristic function: A generalized spectral density approach. Journal of the American Statistical Association, 94(448): 1201–1220, 1999. DOI 10.2307/2669935. +
    +
    +R. J. Hyndman, A. B. Koehler, J. K. Ord and R. D. Snyder. Forecasting with exponential smoothing: The state space approach. Springer, 2008. DOI 10.1111/j.1751-5823.2009.00085_17. +
    +
    +R. Hyndman and Y. Khandakar. Automatic time series forecasting: The ‘forecast‘ package for ‘R‘. Journal of Statistical Software, Articles, 27(3): 1–22, 2008. DOI 10.18637/jss.v027.i03. +
    +
    +M. Jamshidian, S. Jalal and C. Jansen. ‘MissMech‘: An ‘R‘ package for testing homoscedasticity, multivariate normality, and missing completely at random (MCAR). Journal of Statistical Software, 56(6): 1–31, 2014. URL http://www.jstatsoft.org/v56/i06/. +
    +
    +S. Jarek. ‘Mvnormtest‘: Normality test for multivariate variables. 2012. URL https://CRAN.R-project.org/package=mvnormtest. ‘R‘ package version 0.1-9. +
    +
    +C. M. Jarque and A. K. Bera. Efficient tests for normality, homoscedasticity and serial independence of regression residuals. Economics Letters, 6(3): 255–259, 1980. DOI 10.1016/0165-1765(80)90024-5. +
    +
    +D. Kwiatkowski, P. C. B. Phillips, P. Schmidt and Y. Shin. Testing the null hypothesis of stationarity against the alternative of a unit root: How sure are we that economic time series have a unit root? Journal of Econometrics, 54(1): 159–178, 1992. DOI 10.1016/0304-4076(92)90104-Y. +
    +
    +I. Lobato and C. Velasco. A simple test of normality for time series. Econometric Theory, 20: 671–689, 2004. DOI 10.1017/S0266466604204030. +
    +
    +Z. Lomnicki. Tests for departure from normality in the case of linear stochastic processes. Metrika: International Journal for Theoretical and Applied Statistics, 4(1): 37–62, 1961. URL https://EconPapers.repec.org/RePEc:spr:metrik:v:4:y:1961:i:1:p:37-62. +
    +
    +J. López-de-Lacalle. ‘Uroot‘: Unit root tests for seasonal time series. 2019. URL https://CRAN.R-project.org/package=uroot. ‘R‘ package version 2.1-0. +
    +
    +K. V. Mardia. Measures of multivariate skewness and kurtosis with applications. Biometrika, 57(3): 519–530, 1970. URL http://www.jstor.org/stable/2334770. +
    +
    +S. G. Meintanis. A review of testing procedures based on the empirical characteristic function. South African Statistical Journal, 50(1): 1–14, 2016. DOI 10.10520/EJC186846. +
    +
    +E. Moulines, K. Choukri and M. Sharbit. Testing that a multivariate stationary time-series is Gaussian. In [1992] IEEE sixth SP workshop on statistical signal and array processing, pages. 185–188 1992. IEEE. DOI 10.1109/SSAP.1992.246818. +
    +
    +A. Nieto-Reyes. On the non-Gaussianity of sea surface elevations. Journal of Marine Science and Engineering, 10(9): 2022. URL https://www.mdpi.com/2077-1312/10/9/1303. +
    +
    +A. Nieto-Reyes. On the non-Gaussianity of the height of sea waves. Journal of Marine Science and Engineering, 9(12): 2021. URL https://www.mdpi.com/2077-1312/9/12/1446. +
    +
    +A. Nieto-Reyes, J. A. Cuesta-Albertos and F. Gamboa. A random-projection based test of Gaussianity for stationary processes. Computational Statistics & Data Analysis, 75: 124–141, 2014. DOI 10.1016/j.csda.2014.01.013. +
    +
    +D. R. Osborn, A. P. L. Chui, J. P. Smith and C. R. Birchenhall. Seasonality and the order of integration for consumption. Oxford Bulletin of Economics and Statistics, 50(4): 361–377, 1988. DOI 10.1111/j.1468-0084.1988.mp50004002.x. +
    +
    +K. Pearson and O. M. F. E. Henrici. X. Contributions to the mathematical theory of evolution.-II Skew variation in homogeneous material. Philosophical Transactions of the Royal Society of London. (A.), 186: 343–414, 1895. DOI 10.1098/rsta.1895.0010. +
    +
    +P. Perron. Trends and random walks in macroeconomic time series: Further evidence from a new approach. Journal of Economic Dynamics and Control, 12(2): 297–332, 1988. DOI 10.1016/0165-1889(88)90043-7. +
    +
    +G. Petris, S. Petrone and P. Campagnoli. Dynamic linear models with ‘R‘. 78: 157–157, 2007. DOI 10.1111/j.1751-5823.2010.00109_26.x. +
    +
    +Z. Psaradakis. Normality tests for dependent data. WP 12/2017. Research Department, National Bank of Slovakia. 2017. URL https://ideas.repec.org/p/svk/wpaper/1053.html. +
    +
    +Z. Psaradakis and M. Vávra. A distance test of normality for a wide class of stationary processes. Econometrics and Statistics, 2: 50–60, 2017. DOI 10.1016/j.ecosta.2016.11.005. +
    +
    +Z. Psaradakis and M. Vávra. Normality tests for dependent data: Large-sample and bootstrap approaches. Communications in statistics-simulation and computation, 49(2): 283–304, 2020. DOI 10.1080/03610918.2018.1485941. +
    +
    +N. Pya, V. Voinov, R. Makarov and Y. Voinov. ‘mvnTest‘: Goodness of fit tests for multivariate normality. 2016. URL https://CRAN.R-project.org/package=mvnTest. ‘R‘ package version 1.1-0. +
    +
    +D. Qiu. ‘aTSA‘: Alternative time series analysis. 2015. URL https://CRAN.R-project.org/package=aTSA. ‘R‘ package version 3.1.2. +
    +
    +J. P. Royston. An extension of Shapiro and Wilk’s W test for normality to large samples. Journal of the Royal Statistical Society. Series C (Applied Statistics), 31(2): 115–124, 1982. URL http://www.jstor.org/stable/2347973. +
    +
    +J. P. Royston. Approximating the shapiro-wilk W-test for non-normality. Journal of Statistics and Computing, 2(3): 117–119, 1992. URL https://doi.org/10.1007/BF01891203. +
    +
    +P. Royston. A pocket-calculator algorithm for the Shapiro-Francia test for non-normality: An application to medicine. Statistics in Medicine, 12(2): 181–184, 1993. URL https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.4780120209. +
    +
    +S. E. Said and D. A. Dickey. Testing for unit roots in autoregressive-moving average models of unknown order. Biometrika, 71(3): 599–607, 1984. DOI 10.1093/biomet/71.3.599. +
    +
    +S. S. Shapiro and M. B. Wilk. An analysis of variance test for normality (complete samples). Biometrika, 52(3-4): 591–611, 1965. DOI 10.1093/biomet/52.3-4.591. +
    +
    +R. H. Shumway and D. S. Stoffer. Time series analysis and its applications: With ‘R‘ examples. Springer New York, 2010. URL https://books.google.es/books?id=dbS5IQ8P5gYC. +
    +
    +N. Smirnov. Table for estimating the goodness of fit of empirical distributions. Annals of Mathematical Statistics, 19(2): 279–281, 1948. DOI 10.1214/aoms/1177730256. +
    +
    +Y. Steinberg and O. Zeitouni. On tests for normality. IEEE Transactions on Information Theory, 38(6): 1779–1787, 1992. DOI 10.1109/18.165450. +
    +
    +D. Stoffer. ‘Astsa‘: Applied statistical time series analysis. 2020. URL https://CRAN.R-project.org/package=astsa. ‘R‘ package version 1.10. +
    +
    +R Core Team. ‘R‘: A language and environment for statistical computing. Vienna, Austria: ‘R‘ Foundation for Statistical Computing, 2018. URL https://www.R-project.org/. +
    +
    +A. Trapletti and K. Hornik. ‘Tseries‘: Time series analysis and computational finance. 2019. URL https://CRAN.R-project.org/package=tseries. ‘R‘ package version 0.10-47. +
    +
    +R. Tsay. Analysis of financial time series. Second Chicago: Wiley-Interscience, 2010. DOI 10.1002/0471264105. +
    +
    +V. Voinov, N. Pya, R. Makarov and Y. Voinov. New invariant and consistent chi-squared type goodness-of-fit tests for multivariate normality and a related comparative simulation study. Communications in Statistics - Theory and Methods, 45(11): 3249–3263, 2016. URL https://doi.org/10.1080/03610926.2014.901370. +
    +
    +L. Wasserman. All of nonparametric statistics. New York: Springer, 2006. DOI 10.1007/0-387-30623-4. +
    +
    +M. West and J. Harrison. Bayesian forecasting and dynamic models. Springer New York, 2006. URL https://books.google.nl/books?id=0mPgBwAAQBAJ. +
    +
    +H. Wickham. ‘ggplot2‘: Elegant graphics for data analysis. Springer-Verlag New York, 2009. URL http://ggplot2.org. +
    +
    +C. O. Wilke. ‘Cowplot‘: Streamlined plot theme and plot annotations for ‘ggplot2‘. 2020. URL https://CRAN.R-project.org/package=cowplot. ‘R‘ package version 1.1.1. +
    +
    +D. Wuertz, T. Setz, Y. Chalabi, C. Boudt, P. Chausse and M. Miklovac. ‘fGarch‘: Rmetrics - autoregressive conditional heteroskedastic modelling. 2017. URL https://CRAN.R-project.org/package=fGarch. ‘R‘ package version 3042.83. +
    +
    + + +
    + +
    +
    + + + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Matamoros, et al., "nortsTest: An R Package for Assessing Normality of Stationary Processes", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-008,
    +  author = {Matamoros, Asael Alonzo and Nieto-Reyes, Alicia and Agostinelli, Claudio},
    +  title = {nortsTest: An R Package for Assessing Normality of Stationary Processes},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-008},
    +  doi = {10.32614/RJ-2024-008},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {135-156}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-008/RJ-2024-008.pdf b/_articles/RJ-2024-008/RJ-2024-008.pdf new file mode 100644 index 0000000000..d911802371 Binary files /dev/null and b/_articles/RJ-2024-008/RJ-2024-008.pdf differ diff --git a/_articles/RJ-2024-008/RJ-2024-008.tex b/_articles/RJ-2024-008/RJ-2024-008.tex new file mode 100644 index 0000000000..615a898b1c --- /dev/null +++ b/_articles/RJ-2024-008/RJ-2024-008.tex @@ -0,0 +1,844 @@ +% !TeX root = RJwrapper.tex +\title{nortsTest: An R Package for Assessing Normality of Stationary Processes} + + +\author{by Asael Alonzo Matamoros, Alicia Nieto-Reyes, and Claudio Agostinelli} + +\maketitle + +\abstract{% +Normality is the central assumption for analyzing dependent data in several time series models, and the literature has widely studied normality tests. However, the implementations of these tests are limited. The nortsTest package is dedicated to fill this void. The package performs the asymptotic and bootstrap versions of the tests of Epps and Lobato and Velasco and the tests of Psaradakis and Vavra, random projections and El Bouch for normality of stationary processes. These tests are for univariate stationary processes but for El Bouch that also allows bivariate stationary processes. In addition, the package offers visual diagnostics for checking stationarity and normality assumptions for the most used time series models in several R packages. This work aims to show the package's functionality, presenting each test performance with simulated examples and the package utility for model diagnostic in time series analysis. +} + +\section{Introduction}\label{introduction} + +Normality (\emph{a set of observations sampled from a Gaussian process}) is an essential assumption in various statistical models. Therefore, developing procedures for testing this assumption is a topic that has gained popularity over several years. 
Most existing literature and implementation is dedicated to independent and identically distributed random variables \citep{Dagostino1987}; no results show that these tests are consistent when applied to stationary processes. For this context, several tests have been proposed over the years, but as far as we know, no \texttt{R} package or consistent implementation exists. + +The proposed \CRANpkg{nortsTest} package provides seven test implementations to check normality of stationary processes. This work aims to present a review of these tests and introduce the package functionality. Thus, its novelty lies in being the first package and paper dedicated to the implementation of normality tests for stationary processes. The implemented tests are: (i) the asymptotic \emph{Epps} test, \citep{epps1987} and \citep{nietoreyes2014}, based on the characteristic function and (ii) its sieve bootstrap approximation \citep{psaradakis2020normality}, (iii) the corrected \emph{Skewness-Kurtosis} (SK) test implemented by \citet{Lobato2004} as an asymptotic test and (iv) by \citet{psaradakis2020normality} with a sieve bootstrap approximation, (v) the \emph{random projections test} proposed by \citet{nietoreyes2014}, which makes use of the tests in (i) and (iii), (vi) the \emph{Psaradakis and Vávra test} \citep{vavra2017} that uses a bootstrap approximation of the \citet{anderson1952} test statistic for stationary linear processes and (vii) a normality test by \citet{el2022normality} for multivariate dependent samples. Tests (i) to (vi) are for univariate stationary processes. + +Furthermore, we propose the \texttt{check\_residual()} function for checking time-series models' assumptions. This function returns a report for stationarity, seasonality, normality tests and visual diagnostics.
\texttt{check\_residual()} supports models from the most used packages for time-series analysis, such as the packages \CRANpkg{forecast} \citep{Rob2007} and \CRANpkg{aTSA} \citep{aTSA} and even functions in the base \texttt{R} \citep{R}; for instance, it supports the \texttt{HoltWinters} (stats \texttt{R} package) function for the Holt and Winters method \citep{Holt2004}. In addition, the proposed \CRANpkg{nortsTest} package has already been applied in the literature, see \citet{Nieto-Reyes:2022-1} and \citet{Nieto-Reyes:2022-2}. + +Section 2 provides the theoretical background, including preliminary concepts and results. Section 3 introduces the normality tests for stationary processes, each subsection introducing a test framework and including examples of the tests functions with simulated data. Section 4 provides numerical experiments with simulated data and a real-world application: Subsection 4.1 reports a simulation study for the implemented normality tests and Subsection 4.2 the package's functionality for model checking in a real data application. The \emph{carbon dioxide} data measured in the Mauna Loa Observatory \citep{astsa} is analyzed using a state space model from the \CRANpkg{forecast} package, evaluating the model's assumptions using our proposed \texttt{check\_residuals()} function. Section 5 discusses the package functionality and provides our conclusions. Furthermore, we mention our future intended work on the package. + +\section{Preliminary concepts}\label{preliminary-concepts} + +This section provides some theoretical aspects of stochastic processes that are a necessary theoretical framework for the following sections. \citet{shumway2010} and \citet{Ts2010} give more details of the following definitions and results below. + +For the purpose of this work, \(T\) is a set of real values denoted as time, \(T \subseteq \mathbb{R},\) for instance \(T=\mathbb{N}\) or \(T=\mathbb{Z},\) the naturals or integer numbers respectively.
We denote by \(X:=\{X_t\}_{t\in T}\) a \textit{stochastic process} with \(X_t\) a real random variable for each \(t\in T.\) Following this notation, a \textit{time-series} is just a finite collection of ordered observations of \(X\) \citep{shumway2010}. An important measure for a stochastic process is its mean function \(\mu(t) := E[X_t]\) for each \(t \in T\), where \(E[\cdot]\) denotes the usual expected value of a random variable. A generalization of this measure is the k-th order centered moment function \(\mu_k(t) := E[(X_t -\mu(t))^k]\) for each \(t \in T\) and \(k > 1;\) with the process variance function being the second order centered moment, \(\sigma^2(t) := \mu_2(t)\). Other important measures are the auto-covariance and auto-correlation functions, which measure the linear dependency between two different time points of a given process. For any \(t,s \in T,\) they are, respectively, +\[ +\gamma(t,s) := E[(X_t -\mu(t))(X_s - \mu(s))] \mbox{ and } \rho(t,s) := \dfrac{\gamma(t,s)}{\sqrt{\mu_2(t)}\sqrt{\mu_2(s)}}. +\] +Other widely used measure functions for the analysis of processes are the skewness and kurtosis functions, defined as \(s(t) := \mu_3(t)/[\mu_2(t)]^{3/2}\) and \(k(t) := \mu_4(t)/[\mu_2(t)]^2\) for each \(t\in T,\) respectively. + +A generally used assumption for stochastic processes is stationarity. It has a key role in forecasting procedures of classic time-series modeling \citep{Ts2010} or as a principal assumption in de-noising methods for signal theory \citep{W2006}. + +\paragraph{Definition 1}\label{definition-1} + +A stochastic process \(X\) is said to be \emph{strictly stationary} if, for every collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T\) and \(h > 0\), the joint distribution of \(\{X_t\}_{t \in \tau}\) is identical to that of \(\{X_{t+h}\}_{t \in \tau}.\) + +The previous definition is strong for applications. A milder version of it, which makes use of the process' first two moments, is weak stationarity. 
+ +\paragraph{Definition 2}\label{definition-2} + +A stochastic process \(X\) is said to be \emph{weakly stationary} if its mean function is constant in time, \(\mu(t) = \mu\), its auto-covariance function only depends on the difference between times, \(\gamma(s,t) = \sigma|t-s|\) for a \(\sigma\in \mathbb{R}\), and it has a finite variance function, \(\mu_2(t) = \mu_2 < \infty\). + +For the rest of this work, the term \emph{stationary} will be used to specify a weakly stationary process. A direct consequence of the stationarity assumption is that the previous measure functions get simplified. Thus, given a stationary stochastic process \(X,\) its mean function, \(k\)-th order centered moment, for \(k>1,\) and auto-covariance function are respectively, +\[ + \mu = E[X_{t_1}]\mbox{, } \mu_k = E[(X_{t_1} -\mu)^k] \mbox{ and } \gamma(h) = E[(X_{t_1+h}-\mu)(X_{t_1}-\mu)], +\] +which are independent of \(t_1\in T.\) + +Given a sample \(x_1, \ldots, x_n,\) \(n\in\mathbb{N},\) of equally spaced observations of \(X,\) their corresponding estimators, sample mean, sample \(k\)-th order centered moment and sample auto-covariance, are respectively +\[ + \widehat{\mu} := n^{-1}\sum_{i=1}^nx_i\mbox{, } \widehat{\mu}_k := n^{-1}\sum_{i=1}^n(x_i - \widehat{\mu})^k \mbox{ and }\widehat{\gamma}(h) := n^{-1}\sum_{i = 1}^{n-h}(x_{i+h} - \widehat{\mu})(x_i - \widehat{\mu}). +\] + +A particular case in which stationarity implies strictly stationarity is a Gaussian process. + +\paragraph{Definition 3}\label{definition-3} + +A stochastic process \(X\) is said to be a \emph{Gaussian process} if for every finite collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T,\) the joint distribution of \(\{X_t\}_{t \in \tau}\) has a multivariate normal distribution. + +A series of mean zero uncorrelated random variables with finite constant variance is known as \emph{white noise}. 
If additionally, it is formed of independent and identically distributed (i.i.d) normal random variables, it is known as \emph{Gaussian white noise}; which is a particular case of stationary Gaussian process. For the rest of the work, \(X_t \sim N(\mu,\sigma^2)\) denotes that the random variable \(X_t\) is normally distributed with mean \(\mu\) and variance \(\sigma^2\) and \(\chi^2(v)\) denotes the Chi square distribution with \(v\) degrees of freedom. + +Other classes of stochastic processes can be defined using collections of white noise, for instance, the linear process. + +\paragraph{Definition 4}\label{definition-4} + +Let \(X\) be a stochastic process. \(X\) is said to be \emph{linear} if it can be written as +\[ +X_t = \mu + \sum_{i\in\mathbb{Z}}\phi_i\epsilon_{t-i}, +\] +where \(\{\epsilon_i\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables and \(\{\phi_i\}_{i\in\mathbb{Z}}\) is a set of real values such that \(\sum_{i\in\mathbb{Z}} |\phi_j| < \infty.\) + +An important class of processes is the \emph{auto-regressive moving average} (\(ARMA\)). \citet{Box1990} introduced it for time series analysis and forecast, becoming very well-known in the 90s and early 21st century. + +\paragraph{Definition 5}\label{definition-5} + +For any non-negative integers \(p,q,\) a stochastic process \(X\) is an \(ARMA(p,q)\) process if it is a stationary process and +\begin{equation} + X_t = \sum_{i=0}^p \phi_iX_{t-i} +\sum_{i=0}^q \theta_i\epsilon_{t-i}, \label{eq:ARMA} +\end{equation} +where \(\{\phi_i\}_{i=0}^p\) and \(\{\theta_i\}_{i=0}^q\) are sequences of real values with \(\phi_0= 0,\) \(\phi_p\neq 0,\) \(\theta_0=1\) and \(\theta_q\neq 0\) and \(\{\epsilon_{i}\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables. + +Particular cases of \(ARMA\) processes are those known as auto-regressive (\(AR(p) := ARMA(p,0)\)) and mean average (\(MA(q) := ARMA(0,q)\)) processes. 
Additionally, a \emph{random walk} is a non stationary AR(1) +process satisfying \eqref{eq:ARMA} with \(p=1,\) \(\phi_1 = 1\) and \(q=0.\) Several properties of an \(ARMA\) process can be extracted from its structure. For that, the \(AR\) and \(MA\) polynomials are introduced +\[ + AR:\text{ } \phi(z) = 1-\sum_{i=0}^p \phi_i z^i \text{ and } MA:\text{ } \theta(z) = \sum_{i=0}^q \theta_i z^i, +\] +where \(z\) is a complex number and, as before, \(\phi_0 = 0,\) \(\phi_p\neq 0,\) \(\theta_0= 1\) and \(\theta_q\neq 0.\) Conditions for stationarity, order selection and, process behavior are properties studied from these two polynomials. + +For modeling volatility in financial data, \citet{Bollerslev1986} proposed the \emph{generalized auto-regressive conditional heteroscedastic} (GARCH) class of processes as a generalization of the \emph{auto-regressive conditional heteroscedastic} (ARCH) processes \citep{engle1982}. + +\paragraph{Definition 6}\label{definition-6} + +For any \(p,q \in \mathbb{N}\), a stochastic process \(X\) is a \(GARCH(p,q)\) process if it satisfies \(X_t = \mu + \sigma_{t}\epsilon_t\) with +\[ +\sigma_t^2 = \alpha_0 +\sum_{i=1}^p\alpha_i \epsilon_{t-i}^2 +\sum_{i=1}^q \beta_{i}\sigma^2_{t-i}. +\] +\(\mu\) is the process mean, \(\sigma_0\) is a positive constant value, \(\{\alpha_i\}_{i=1}^p\) and \(\{\beta_i\}_{i=1}^q\) are non-negative sequences of real values and \(\{\epsilon_{t}\}_{t \in T}\) is a collection of i.i.d. random variables. + +A more general class of processes are the \emph{state-space models} (\(SSMs\)), which have gained popularity over the years because they do not impose on the process common restrictions such as linearity or stationarity and are flexible in incorporating the process different characteristics \citep{OBrien2010}. They are widely used for smoothing \citep{west2006} and forecasting \citep{Rob2007} in time series analysis. 
The main idea is to model the process dependency with two equations: the \emph{state equation}, which models how parameters change over time, and the \emph{innovation equation}, which models the process in terms of the parameters. Some particular SSMs that analyze the level, trend and seasonal components of the process are known as \emph{error, trend, and seasonal} (ETS) models. There are over 32 different variations of ETS models \citep{Hyndman2008}. One of them is the \emph{multiplicative error, additive trend-seasonality} \((ETS(M,A,A))\) model. + +\paragraph{Definition 7}\label{definition-7} + +A SSM process \(X\) follows an ETS(M,A,A) model, if the process accepts\\ +\[ +X_t = [L_{t-1} +T_{t-1} + S_{t-1}](1 + \epsilon_t) +\] +as innovation equation and +\begin{eqnarray*}L_t &= &L_{t-1} +T_{t-1} +\alpha (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + T_t &= &T_{t-1} + \beta (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + S_t &= &S_{t-m} + \gamma (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t, +\end{eqnarray*}\\ +as state equations. +\(\alpha, \beta,\gamma \in [0,1]\), \(m\in\mathbb{N}\) denotes the period of the series and \(\{\epsilon_t\}\) are i.i.d normal random variables. For each \(t\in\mathbb{Z},\) \(L_t\), \(T_t\) and \(S_t\) represent respectively the level, trend and seasonal components. + +\section{Normality tests for stationary processes}\label{normality-tests-for-stationary-processes} + +Extensive literature exists on goodness of fit tests for normality under the assumption of independent and identically distributed random variables, including, among others, Pearson's chi-squared test \citep{Pearson1895}, Kolmogorov-Smirnov test \citep{Smirnov1948}, Anderson-Darling test \citep{anderson1952}, SK test \citep{jarque1980} and Shapiro-Wilk test, \citep{SWtest1965} and \citep{Royston1982}. These procedures have been widely used in many studies and applications, see \citet{Dagostino1987} for further details. 
There are no results, however, showing that the above tests are consistent in the context of stationary processes, in which case the independence assumption is violated. For instance, \citet{Gasser1975} provides a simulation study where Pearson's chi-squared test has an excessive rejection rate under the null hypothesis for dependent data. For this matter, several tests for stationary processes have been proposed over the years. A selection of which we reference here. \citet{epps1987} provides a test based on the characteristic function, \citet{Hinich1982} proposes a similar test based on the process' spectral density function \citep[for further insight]{Berg2010}. \citet{Gasser1975} gives a correction of the SK test, with several modifications made in \citet{Lobato2004}, \citet{bai2005} and \citet{MarianZach2017}, which are popular in many financial applications. \citet{Meddahi2005} constructs a test based on Stein's characterization of a Gaussian distribution. Using the random projection method \citep{Cuesta2007}, \citet{nietoreyes2014} build a test that upgrades the performance of \citet{epps1987} and \citet{Lobato2004} procedures. Furthermore, \citet{vavra2017} adapts the \citet{anderson1952} statistic for stationary linear processes approximating its sample distribution with a sieve bootstrap procedure. + +Despite the existing literature, consistent implementations of goodness of fit test for normality of stationary processes in programming languages such as \texttt{R} or \texttt{Python} are limited. This is not the case for normality of independent data, the \CRANpkg{nortest} package \citep{nortest2015} implements tests such as Lilliefors \citep{Wilkinson1986}, Shapiro-Francia \citep{Royston1993}, Pearson's chi-squared, Cramer von Misses \citep{vonMisses1962} and Anderson-Darling. 
For a multivariate counterpart, the \CRANpkg{mvnTest} package \citep{mvntest} implements the multivariate Shapiro-Wilk, Anderson-Darling, Cramer von Misses, Royston \citep{Royston1992}, Doornik and Hansen \citep{DH2008}, Henze and Zirkler \citep{HZ1990} and the multivariate Chi square test \citep{S2_2016}. For the case of dependent data, we present here the \CRANpkg{nortsTest} package. Type within \texttt{R} \texttt{install.packages("nortsTest",\ dependencies\ =\ TRUE)} to install its latest released version from \texttt{CRAN}. \CRANpkg{nortsTest} performs the tests proposed in \citet{epps1987}, \citet{Lobato2004}, \citet{psaradakis2020normality}, \citet{nietoreyes2014}, \citet{vavra2017} and \citet{el2022normality}. + +Additionally, the package offers visualization functions for descriptive time series analysis and several diagnostic methods for checking stationarity and normality assumptions for the most used time series models of several \texttt{R} packages. To elaborate on this, Subsection 3.1 introduces the package functionality and software and Subsection 3.2 provides an overview of tests for checking stationary and seasonality. Finally, Subsections 3.3-3.5 present a general framework of each of the implemented normality tests and their functionality by providing simulated data examples. + +\subsection{Software}\label{software} + +The package works as an extension of the \CRANpkg{nortest} package \citep{nortest2015}, which performs normality tests in random samples but for independent data. 
The building block functions of the \CRANpkg{nortsTest} package are: + +\begin{itemize} +\item + \texttt{epps.test()}, function that implements the test of Epps, +\item + \texttt{epps\_bootstrap.test()}, function that implements a bootstrap approximation of the test of Epps, +\item + \texttt{lobato.test()}, function that implements the asymptotic test of Lobato and Velasco, +\item + \texttt{lobato\_bootstrap.test()}, function that implements a bootstrap approximation of the test of Lobato and Velasco, +\item + \texttt{rp.test()}, function that implements the random projection test of Nieto-Reyes, Cuesta-Albertos and Gamboa, +\item + \texttt{vavra.test()}, function that implements the test of Psaradakis and Vavra, and +\item + \texttt{elbouch.test()}, function that implements the test of El Bouch, Michel and Comon. +\end{itemize} + +Each of these functions accepts a \texttt{numeric} (\emph{numeric}) or \texttt{ts} (\emph{time series}) class object for storing data, and returns an \texttt{htest} (\emph{hypothesis test}) class object with the main results for the test. To guarantee the accuracy of the results, each test performs unit root tests for checking stationarity and seasonality (see Subsection 3.2) and displays a warning message if any of them is not satisfied. + +For visual diagnostics, the package offers different plot functions based on the \CRANpkg{ggplot2} package \citep{ggplot2}: the \texttt{autoplot()} function plots \texttt{numeric}, \texttt{ts} and \texttt{mts} (\emph{multivariate time series}) classes while the \texttt{gghist()} and \texttt{ggnorm()} functions are for plotting histogram and qq-plots respectively; and on the \CRANpkg{forecast} package \citep{Rob2007}: \texttt{ggacf()} and \texttt{ggPacf()} for the display of the auto-correlation and partial auto-correlations functions respectively. 
+ +Furthermore, inspired by the function \texttt{checkresiduals()} of the \CRANpkg{forecast} package, we provide the \texttt{check\_residuals()} function to test the model assumptions using the estimated residuals. The upgrade of our proposal is that, besides providing plots for visual diagnosis (setting the \texttt{plot} option as \texttt{TRUE}), it does check stationarity, seasonality (\emph{Subsection 3.2}) and normality, presenting a report of the used tests and conclusions for assessing the model's assumptions. An illustration of these functions is provided in Subsection 4.2, where we show the details of the functions and their utility for assumptions commonly checked in time series modeling. + +\subsection{Tests for stationarity}\label{tests-for-stationarity} + +For checking stationarity, the \CRANpkg{nortsTest} package uses \textit{unit root} and \textit{seasonal unit root} tests. These tests work similarly, checking whether a specific process follows a random walk model, which clearly is a non-stationary process. + +\subsubsection{Unit root tests}\label{unit-root-tests} + +A linear stochastic process \(X\) that follows a random walk model is non-stationary. Its AR polynomial is \(\phi(z) = 1 - z\), whose solution (root) is unique and equal to one. Thus, it is common to test the non-stationarity of a linear process by checking whether its AR polynomial has a unit root (a root equal to one). + +The most commonly used tests for unit root testing are \emph{Augmented Dickey-Fuller} \citep{dickey1984}, \emph{Phillips-Perron} \citep{Perron1988}, \emph{KPSS} \citep{KppsI1992} and \textit{Ljung-Box} \citep{Box}. In particular, the \emph{Ljung-Box} test contrasts the null auto-correlation hypothesis of identically distributed Gaussian random variables, which is equivalent to testing stationarity. The \texttt{uroot.test()} and \texttt{check\_residuals()} functions perform these tests, making use of the \CRANpkg{tseries} package \citep{tseries}. 
+ +\subsubsection{Seasonal unit root tests}\label{seasonal-unit-root-tests} + +Let \(X\) be a stationary process and \(m\) its period. Note that for observed data, \(m\) generally corresponds to the number of observations per unit of time. \(X\) follows a seasonal random walk if it can be written as +\[ + X_t = X_{t-m} + \epsilon_t, +\] +where \(\epsilon_t\) is a collection of i.i.d random variables. In a similar way, the process \(X\) is non-stationary if it follows a seasonal random walk. Or equivalently, \(X\) is non stationary if the seasonal AR(1) polynomial (\(\phi_m(z) = 1 - \phi z^m\)) has a unit root. The \texttt{seasonal.test()} and \texttt{check\_residuals()} functions perform the \emph{OCSB test} \citep{ocsb1988} from the \CRANpkg{forecast} package and the \emph{HEGY} \citep{Hegy1993} and \emph{Ch} \citep{ch1995} tests from the \CRANpkg{uroot} package \citep{uroot}. + +\subsection{Tests of Epps}\label{tests-of-epps} + +The \(\chi^2\) test for normality proposed by \citet{epps1987} compares the empirical characteristic function of the one-dimensional marginal of the process with the one of a normally distributed random variable evaluated at certain points on the real line. Several authors, including \citet{Lobato2004}, \citet{vavra2017} and \citet{el2022normality}, point out that the greatest challenge in the Epps' test is its implementation procedure, which we address with the \CRANpkg{nortsTest} package. Other existing tests based on the empirical characteristic function of the one-dimensional marginal of the process include \citet{hong1999hypothesis} and the references therein. This test differs, however, in that it uses spectral analysis and derivatives. + +Furthermore, \citet{meintanis2016review} reviews on testing procedures based on the empirical characteristic function. There, it is commented about the random projection test \citep[and here below]{nietoreyes2014} as a recent development of Epps' test. 
In fact, in \citet{nietoreyes2014} the consistency of the Epps test is improved by taking at random the elements at which the characteristic function is evaluated. Additionally, \citet{el2022normality} proposes a sieve bootstrap modification of the Epps' test. In addition to the classical asymptotic Epps' test, we include these last two approaches here and in the package; see the Example below and the paragraph before it. Let us provide now the foundation behind the Epps' tests. + +Let \(X\) be a stationary stochastic process that satisfies +\begin{equation} + \sum_{t=-\infty}^{\infty}|t|^k|\gamma(t)| <\infty \mbox{ for some } k >0. \label{eq:a} +\end{equation} +The null hypothesis is that the one-dimensional marginal distribution of \(X\) is Gaussian. The procedure for constructing the test consists of defining a function \(g\), estimating its inverse spectral matrix function, minimizing the generated quadratic function in terms of the unknown parameters of the random variable and, finally, obtaining the test statistic, which converges in distribution to a \(\chi^2.\) + +Given \(N \in\mathbb{N}\) with \(N \geq 2,\) let +\[ +\Lambda :=\{\lambda:=(\lambda_1, \ldots, \lambda_N) \in \mathbb{R}^N: \lambda_i \leq \lambda_{i+1} \text{ and } \lambda_i > 0, \text{ for } i = 1,2,\ldots, N \}, +\] +and \(g:\mathbb{R}\times \Lambda \rightarrow \mathbb{R}^{2N}\) be a measurable function, where +\[ + g(x,\lambda):= [\cos(\lambda_1x),\sin(\lambda_1x),\ldots,\cos(\lambda_Nx),\sin(\lambda_Nx)]. 
+\] +Additionally, let \(g_\theta:\Lambda \rightarrow \mathbb{R}^N\) be a function defined by +\[ + g_\theta(\lambda) := \left[\mbox{Re}(\Phi_\theta(\lambda_1)),\mbox{Im}(\Phi_\theta(\lambda_1)),\ldots,\mbox{Re}(\Phi_\theta(\lambda_N)),\mbox{Im}(\Phi_\theta(\lambda_N)) \right]^t, +\] +where the \(\mbox{Re}(\cdot)\) and \(\mbox{Im}(\cdot)\) are the real and imaginary components of a complex number and \(\Phi_\theta\) is the characteristic function of a normal random variable with parameters \(\theta := (\mu,\sigma^2)\in \Theta,\) an open bounded set contained in \(\mathbb{R}\times \mathbb{R}^+\). For any \(\lambda\in\Lambda,\) let us also denote +\[ + \widehat{g}(\lambda) := \dfrac{1}{n}\sum_{t=1}^n [\cos(\lambda_1 x_t),\sin(\lambda_1x_t),\ldots,\cos(\lambda_N x_t),\sin(\lambda_N x_t)]^t. +\] +Let \(f(v;\theta,\lambda)\) be the spectral density matrix of \(\{g(X_t,\lambda)\}_{t \in\mathbb{Z}}\) at a frequency \(v.\) +Then, for \(v = 0\), it can be estimated by +\[ + \widehat{f}(0;\theta,\lambda) := \dfrac{1}{2\pi n}\left(\sum_{t=1}^n \widehat{G}(x_{t,0},\lambda) +2\sum_{i=1}^{\lfloor n^{2/5}\rfloor}(1 -i/\lfloor n^{2/5} \rfloor)\sum_{t=1}^{n-i}\widehat{G}(x_{t,i},\lambda) \right), +\] +where \(\widehat{G}(x_{t,i},\lambda) = (\widehat{g}(\lambda) -g(x_{t},\lambda))(\widehat{g}(\lambda) -g(x_{t+i},\lambda))^t\) and \(\lfloor \cdot \rfloor\) denotes the floor function. The test statistic general form under \(H_0\) is +\[ + Q_n(\lambda) := \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +with +\[ + Q_n(\theta,\lambda):=(\widehat{g}(\lambda)-g_\theta(\lambda))^tG_n^+(\lambda)(\widehat{g}(\lambda)-g_\theta(\lambda)), +\] +where \(G^{+}_n\) is the generalized inverse of the spectral density matrix \(2 \pi \widehat{f}(0;\theta,\lambda)\). 
Let +\[ + \widehat{\theta} := \arg \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +be the argument that minimizes \(Q_n(\theta,\lambda)\) such that \(\widehat{\theta}\) is in a neighborhood of \(\widehat{\theta}_n := (\widehat{\mu},\widehat{\gamma}(0))\). To guarantee its existence and uniqueness, the following assumptions are required. We refer to them as assumption \((A.)\). + +\((A.)\) Let \(\theta_0\) be the true value of \(\theta\) under \(H_0\), then for every \(\lambda \in \Lambda\) the following conditions are satisfied. + +\begin{itemize} +\item + \(f(0;\theta,\lambda)\) is positive definite. +\item + \(\Phi_\theta(\lambda)\) is twice differentiable with respect to \(\theta\) in a neighborhood of \(\theta_0\). +\item + The matrix \(D(\theta_0,\lambda) = \dfrac{\partial \Phi_\theta(\lambda)}{\partial\theta |_{\theta = \theta_0}} \in \mathbb{R}^{N\times 2}\) has rank 2. +\item + The set \(\Theta_0(\lambda) := \{ \theta \in \Theta: \Phi_\theta(\lambda_i) = \Phi_{\theta_0}(\lambda_i), i=1, \ldots,N\}\) is a finite bounded set in \(\Theta\). And \(\Theta\) is a bounded subset of \(\mathbb{R}\times \mathbb{R}^+\). +\item + \(f(0;\theta,\lambda) = f(0;\theta_0,\lambda)\) and \(D(\theta_0,\lambda) = D(\theta,\lambda)\) for all \(\theta \in \Theta_0(\lambda)\). +\end{itemize} + +Under these assumptions, Epps's main result is presented as follows. + +\paragraph{\texorpdfstring{Theorem 1 \citep[Theorem 2.1]{epps1987}}{Theorem 1 {[}@epps1987, Theorem 2.1{]}}}\label{theorem-1-epps1987-theorem-2.1} + +Let \(X\) be a stationary Gaussian process such that \eqref{eq:a} and \((A.)\) are satisfied, then \(nQ_n(\lambda)\to_d \chi^2(2N - 2)\) for every \(\lambda \in \Lambda\). + +The current \CRANpkg{nortsTest} version uses \(\Lambda := \{\verb|lambda|/\widehat{\gamma}(0)\}\) as the values to evaluate the empirical characteristic function, where \(\widehat{\gamma}(0)\) is the sample variance. By default \texttt{lambda\ =\ c(1,\ 2)}. 
Therefore, the implemented test statistic converges to a \(\chi^2\) distribution with two degrees of freedom. The user can change these \(\Lambda\) values as desired by simply specifying the function's \texttt{lambda} argument, as we show in the Example below. + +\paragraph{Example 1}\label{example-1} + +A stationary \(AR(2)\) process is drawn using a beta distribution with \texttt{shape1\ =\ 9} and \texttt{shape2\ =\ 1} parameters, and performed the implementation of the test of Epps, \texttt{epps.test()}. At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected. + +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ar =c(0.5,0.2)), + rand.gen = rbeta,shape1 = 9,shape2 = 1) + +# Asymptotic Epps test +epps.test(x) +#> +#> Epps test +#> +#> data: x +#> epps = 22.576, df = 2, p-value = 1.252e-05 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Asymptotic Epps test with random Lambda values as proposed in \citet{nietoreyes2014}. + +\begin{verbatim} +set.seed(298) +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2))) +#> +#> Epps test +#> +#> data: x +#> epps = 25.898, df = 2, p-value = 2.379e-06 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximated sieve bootstrap Epps test using 1000 repetitions of 250 units. + +\begin{verbatim} +set.seed(298) +epps_bootstrap.test(x, seed = 298) +#> +#> Sieve-Bootstrap epps test +#> +#> data: y +#> bootstrap-epps = 22.576, p-value < 2.2e-16 +#> alternative hypothesis: y does not follow a Gaussian Process +\end{verbatim} + +\subsection{Tests of Lobato and Velasco}\label{tests-of-lobato-and-velasco} + +\citet{Lobato2004} provides a consistent estimator for the corrected SK test statistic for stationary processes, see \citet{Lomincki1961} and \citet{Gasser1975} for further insight. 
Note that the SK test is also known as the Jarque-Bera test \citep{jarque1980}, which is already available in several R packages \citep[for instance]{tseries}. The improvement of this proposal over those implementations is a correction in the skewness and kurtosis estimates by the process' auto-covariance function, resulting in a consistent test statistic under the assumption of correlated data. The test in \citet{Lobato2004} is asymptotic, which is computationally efficient, as opposed to a bootstrap based test. \citet{psaradakis2020normality} show that the bootstrap modification of Lobato and Velasco's test is a fair competitor against the original asymptotic test, beating other tests for normality of the one-dimensional marginal distribution in terms of power. Thus, the package incorporates both the asymptotic version, \texttt{lobato.test()}, and its bootstrap version \texttt{lobato\_bootstrap.test()}. + +The general framework for the test is presented in what follows. Contrary to the test of Epps, this proposal does not require additional parameters for the computation of the test sample statistic. + +Let \(X\) be a stationary stochastic process that satisfies + +\begin{equation} + \sum_{t=0}^{\infty}|\gamma(t)| <\infty. \label{eq:aLV} +\end{equation} + +The null hypothesis is that the one-dimensional marginal distribution of \(X\) is normally distributed, that is +\[ +H_0: X_t \sim N(\mu,\sigma^2) \text{ for all } t \in \mathbb{Z}. +\] +Let \(k_q(j_1,j_2,\ldots,j_{q-1})\) be the q-th order cumulant of \(X_{1},X_{1+j_1},\ldots,X_{1+j_{q-1}}\). \(H_0\) is fulfilled if all the marginal cumulants above the second order are zero. In practice, it is tested just for the third and fourth order marginal cumulants. Equivalently, in terms of moments, the marginal distribution is normal by testing whether \(\mu_3 = 0\) and \(\mu_4 = 3 \mu_2^2\). 
For non-correlated data, the SK test compares the SK statistic against upper critical values from a \(\chi^2(2)\) distribution \citep{bai2005}. For a Gaussian process \(X\) satisfying \eqref{eq:aLV}, it holds the limiting result +\[ + \sqrt{n} \binom{\widehat{\mu}_3}{\widehat{\mu}_4 -3\widehat{\mu}^2_2} \to_d N(0_2,\Sigma_F), +\] +where \(0_2 := (0,0)^t \in \mathbb{R}^2\) and \(\Sigma_F := \mbox{diag}(6F^{(3)}, \text{ } 24F^{(4)}) \in \mathbb{R}^{2\times 2}\) is a diagonal matrix with \(F^{(k)} := \sum_{j = -\infty}^{\infty}\gamma(j)^k\) for \(k=3,4\) \citep{Gasser1975}. + +The following consistent estimator in terms of the auto-covariance function is proposed in \citet{Lobato2004} +\[ + \widehat{F}^{(k)} := \sum_{t = 1-n}^{n-1}\widehat{\gamma}(t)[\widehat{\gamma}(t) +\widehat{\gamma}(n-|t|)]^{k-1}, +\] +to build a \emph{generalized SK test} statistic +\[ + G := \dfrac{n \widehat{\mu}_3^2}{6 \widehat{F}^{(3)}} + \dfrac{n(\widehat{\mu}_4 -3\widehat{\mu}_2^2)^2}{24\widehat{F}^{(4)}}. +\] +Similar to the SK test for non-correlated data, the \(G\) statistic is compared against upper critical values from a \(\chi^2(2)\) distribution. This is seen in the below result that establishes the asymptotic properties of the test statistics, so that the general test procedure can be constructed. The result requires the following assumptions, denoted by \((B.),\) for the process \(X.\) + +(B.) 
+ +\begin{itemize} +\item + \(E[X_t^{16}] < \infty\) for \(t \in T.\) +\item + \(\sum_{j_1 = -\infty}^{\infty}\cdots \sum_{j_{q-1} = -\infty}^{\infty} |k_q(j_1,\ldots,j_{q-1})| < \infty \text{ for } q=2,3,\ldots,16.\) +\item + \(\sum_{j=1}^{\infty}\left(E \left[\text{ } E[(X_0-\mu)^k|B_j] -\mu_k\right]^2 \right)^{1/2} < \infty \text{ for } k = 3,4,\) where \(B_j\) denotes the \(\sigma\)-field generated by \(X_t\), \(t \leq -j.\) +\item + \(E\left[Z_k \right]^2 +2\sum_{j=1}^{\infty}E\left(\left[Z_k \right] \left[ (X_j -\mu)^k -\mu_k \right] \right) > 0\) for \(k = 3,4,\) with \(Z_k=(X_0 -\mu)^k -\mu_k.\) +\end{itemize} + +Note that these assumptions imply that the higher-order spectral densities up to order 16 are continuous and bounded. + +\paragraph{\texorpdfstring{Theorem 2 \citep[Theorem 1]{Lobato2004}}{Theorem 2 {[}@Lobato2004, Theorem 1{]}}}\label{theorem-2-lobato2004-theorem-1} + +Let \(X\) be a stationary process. If \(X\) is Gaussian and satisfies \eqref{eq:aLV} then \(G \to_d \chi^2(2)\), and under assumption (B.), the test statistic G diverges whenever \(\mu_3 \neq 0\) or \(\mu_4 \neq 3\mu_2^2.\) + +\paragraph{Example 2}\label{example-2} + +A stationary \(MA(3)\) process is drawn using a gamma distribution with \texttt{rate\ =\ 3} and \texttt{shape\ =\ 6} parameters. The \texttt{lobato.test()} function performs the test of \emph{Lobato and Velasco} to the simulated data. At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected. + +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)), + rand.gen = rgamma, rate = 3, shape = 6) +# Asymptotic Lobato & Velasco +lobato.test(x) +#> +#> Lobato and Velasco's test +#> +#> data: x +#> lobato = 65.969, df = 2, p-value = 4.731e-15 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximated sieve bootstrap Lobato and Velasco test using 1000 repetitions of 250 units. 
+ +\begin{verbatim} +lobato_bootstrap.test(x, seed = 298) +#> +#> Sieve-Bootstrap lobato test +#> +#> data: y +#> bootstrap-lobato = 65.969, p-value < 2.2e-16 +#> alternative hypothesis: y does not follow a Gaussian Process +\end{verbatim} + +\subsection{The Random Projections test}\label{the-random-projections-test} + +The previous proposals only test for the normality of the one-dimensional marginal distribution of the process, which is inconsistent against alternatives whose one-dimensional marginal is Gaussian. \citet{nietoreyes2014} provides a procedure to fully test normality of a stationary process using a Cramér-Wold type result \citep{Cuesta2007} that uses random projections to differentiate among distributions. In \citet{nietoreyes2014} existing tests for the normality of the one dimensional marginal are applied to the random projections and the resulting p-values are combined using the false discovery rate for dependent data \citep{Benjamin2001}. The \CRANpkg{nortsTest} package improves on this test by allowing the use of the less conservative false discovery rate in \citet{Benjamin1995}. + +We show the Cramér-Wold type result below. The result works for separable Hilbert spaces, however here, for its later application, we restrict it to \(l^2,\) the space of square summable sequences over \(\mathbb{N},\) with inner product \(\langle \cdot,\cdot \rangle.\) + +\paragraph{\texorpdfstring{Theorem 3 \citep[Theorem 3.6]{Cuesta2007}}{Theorem 3 {[}@Cuesta2007, Theorem 3.6{]}}}\label{theorem-3-cuesta2007-theorem-3.6} + +Let \(\eta\) be a dissipative distribution on \(l^2\) and \(Z\) an \(l^2\)-valued random element, then \(Z\) is Gaussian if and only if +\[ + \eta\{h \in l^2: \langle Z,h \rangle \text{ has a Gaussian distribution}\} > 0. +\] +A dissipative distribution \citep[Definition 2.1]{nietoreyes2014} is a generalization of the concept of absolutely continuous distribution to the infinite-dimensional space. 
A Dirichlet process \citep{gelman2013} produces random elements with a dissipative distribution in \(l^2\). In practice, generate draws of \(h \in l^2\) with a stick-breaking process that makes use of beta distributions. + +Let \(X = \{X_t\}_{t\in\mathbb{Z}}\) be a stationary process. As \(X\) is normally distributed if the process \(X^{(t)} := \{X_k\}_{k \leq t}\) is Gaussian for each \(t\in\mathbb{Z},\) using the result above, \citet{nietoreyes2014} provides a procedure for testing that \(X\) is a Gaussian process by testing whether the process \(Y^h = \{Y^h_t\}_{t \in \mathbb{Z}}\) is Gaussian. +\begin{equation} + Y^h_t := \sum_{i=0}^\infty h_i X_{t-i} = \langle X^{ (t) },h \rangle, \label{eq:proj} +\end{equation} +where \(\langle X^{(t)},h \rangle\) is a real random variable for each \(t \in \mathbb{Z}\) and \(h\in l^2\). Thus, \(Y^h\) is a stationary process constructed by the projection of \(X^{(t)}\) on the space generated by \(h.\) Therefore, \(X\) is a Gaussian process if and only if the one dimensional marginal distribution of \(Y^{h}\) is normally distributed. Additionally, the hypothesis of the tests \emph{Lobato and Velasco} or \emph{Epps}, such as \eqref{eq:a}, \eqref{eq:aLV}, \((A)\) and \((B)\), imposed on \(X\) are inherited by \(Y^h\). Then, those tests can be applied to evaluate the normality of the one dimensional marginal distribution of \(Y^h\). Further considerations include the specific beta parameters used to construct the distribution from which to draw \(h\) and selecting a proper number of combinations to establish the number of projections required to improve the method performance. All of these details are discussed in \citet{nietoreyes2014}. + +Next, we summarize the test of random projections in practice: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\item + Select \(k,\) which results in \(2k\) independent random projections (\emph{by default} \texttt{k\ =\ 1}). 
+ +\item + Draw the \(2k\) random elements to project the process from a dissipative distribution that uses a particular beta distribution. By default, use a \(\beta(2,7)\) for the first \(k\) projections and a \(\beta(100,1)\) for the later \(k\). +\item + Apply the tests of \emph{Lobato and Velasco} to the even projected processes and \emph{Epps} to the odd projections. +\item + Combine the obtained \(2k\) \texttt{p-values} using the false discovery rate. By default, use the \citet{Benjamin2001} procedure. +\end{enumerate} + +The \texttt{rp.test()} function implements the above procedure. The user might provide optional parameters such as the number of projections \texttt{k}, the parameters of the first beta distribution \texttt{pars1} and those of the second \texttt{pars2}. The next example illustrates the application of the \texttt{rp.test()} to a stationary GARCH(1,1) process drawn using normal random variables. + +\paragraph{Example 3}\label{example-3} + +A stationary \texttt{GARCH(1,1)} process is drawn with a standard normal distribution and parameters \(\alpha_0 = 0,\) \(\alpha_1 = 0.2\) and \(\beta_1 = 0.3\) using the \CRANpkg{fGarch} package \citep{fGarch}. Note that a \texttt{GARCH(1,1)} process is stationary if the parameters \(\alpha_1\) and \(\beta_1\) satisfy the inequality \(\alpha_1 + \beta_1 < 1\) \citep{Bollerslev1986}. + +\begin{verbatim} +set.seed(3468) +library(fGarch) +spec = garchSpec(model = list(alpha = 0.2, beta = 0.3)) +x = ts(garchSim(spec, n = 300)) +rp.test(x) +#> +#> k random projections test. +#> +#> data: x +#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +At significance level \(\alpha = 0.05,\) the applied \emph{random projections} test with \texttt{k\ =\ 1} as the number of projections shows no evidence to reject the null hypothesis of normality. 
+ +\subsection{The Psaradakis and Vavra's test}\label{the-psaradakis-and-vavras-test} + +\citet{vavra2017} adapted a distance test for normality for a one-dimensional marginal distribution of a stationary process. Initially, the test was based on the Anderson (1952) test statistic and used an auto-regressive sieve bootstrap approximation to the null distribution of the sample test statistic. Later, \citet{psaradakis2020normality} considered this test as the ultimate normality test based on the empirical distribution function, and adapted its methodology to a wide range of tests, including Shapiro-Wilk \citep{SWtest1965}, Jarque-Bera \citep{jarque1980}, Cramer von Mises \citep{vonMisses1962}, Epps, and Lobato-Velasco. Their experiments show that the Lobato-Velasco and Jarque-Bera test's bootstrap version performs best in small samples. + +Although the test is said to be applicable to a wide class of non-stationary processes by transforming them into stationary by means of a fractional difference operator, no theoretic result was apparently provided to sustain this transformation. This work restricts the presentation of the original procedure to stationary processes. + +Let \(X\) be a stationary process satisfying +\begin{equation} + X_t = \sum_{i=0}^{\infty}\theta_i \epsilon_{t-i} + \mu_0, \ t \in \mathbb{Z}, \label{eq:aPV} +\end{equation} +where \(\mu_0 \in \mathbb{R}\), \(\{\theta_i\}_{i=0}^\infty\in l^2\) with \(\theta_0 = 1\) and \(\{\epsilon_t\}_{i=0}^\infty\) is a collection of mean zero i.i.d random variables. The null hypothesis is that the one dimensional marginal distribution of \(X\) is normally distributed, +\[ + H_0: F(\mu_0 +\sqrt{\gamma(0)}x)-F_N(x) = 0, \text{ for all } x\in \mathbb{R}, +\] +where F is the cumulative distribution function of \(X_0\), and \(F_N\) denotes the standard normal cumulative distribution function. Note that if \(\epsilon_0\) is normally distributed, then the null hypothesis is satisfied. 
Conversely, if the null hypothesis is satisfied, then \(\epsilon_0\) is normally distributed and, consequently, \(X_0\).\\ +The considered test for \(H_0\) is based on the Anderson-Darling distance statistic +\begin{equation} + A_d = \int_{-\infty}^{\infty}\dfrac{[{F_n}(\widehat{\mu}+\sqrt{\widehat{\gamma}(0)}x)-F_N(x)]^2}{F_N(x)[1-F_N(x)]}dF_N(x), \label{eq:aPV1} +\end{equation} +where \({F_n}(\cdot)\) is the empirical distribution function associated to \(F\) based on a simple random sample of size \(n\). \citet{vavra2017} proposes an auto-regressive sieve bootstrap procedure to approximate the sampling properties of \(A_d\) arguing that making use of classical asymptotic inference for \(A_d\) is problematic and involved. This scheme is motivated by the fact that under some assumptions for \(X,\) including \eqref{eq:aPV}, \(\epsilon_t\) admits the representation +\begin{equation} + \epsilon_t = \sum_{i=1}^{\infty}\phi_i(X_{t-i} - \mu_0), \ t \in \mathbb{Z}, \label{eq:ePV} +\end{equation} +for certain type of \(\{\phi_i\}_{i=1}^\infty\in l^2\). The main idea behind this approach is to generate a bootstrap sample \(\epsilon_t^*\) to approximate \(\epsilon_t\) with a finite-order auto-regressive model. This is because the distribution of the processes \(\epsilon_t\) and \(\epsilon_t^*\) coincide asymptotically if the order of the auto-regressive approximation grows simultaneously with \(n\) at an appropriate rate \citep{Buhlmann1997}. The procedure makes use of the \(\epsilon_t^{*'}s\) to obtain the \(X_t^{*'}s\) through the bootstrap analog of \eqref{eq:ePV}. Then, generate a bootstrap sample of the \(A_d\) statistic, \(A_d^{*},\) making use of the bootstrap analog of \eqref{eq:aPV}. + +The \texttt{vavra.test()} function implements \citet{psaradakis2020normality} procedure. By default, it generates 1,000 sieve-bootstrap replications of the Anderson-Darling statistic. 
The user can provide different test procedures, such as the \emph{Shapiro-Wilk, Jarque-Bera, Cramer von Mises, Epps} or \emph{Lobato-Velasco} test, by specifying a text value to the \texttt{normality} argument. The presented values are Monte Carlo estimates of the \(A_d\) statistic and \texttt{p.value}. + +\paragraph{Example 4}\label{example-4} + +A stationary \(ARMA\)(1,1) process is simulated using a standard normal distribution and performs \emph{Psaradakis and Vávra} procedure using Anderson-Darling and Cramer von Mises test statistics. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality. + +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34)) +# Default, Psaradakis and Vavra's procedure +vavra.test(x, seed = 298) +#> +#> Psaradakis-Vavra test +#> +#> data: x +#> bootstrap-ad = 0.48093, p-value = 0.274 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximate Cramer von Mises test for the Psaradakis and Vavra's procedure + +\begin{verbatim} +vavra.test(x, normality = "cvm", seed = 298) +#> +#> Sieve-Bootstrap cvm test +#> +#> data: x +#> bootstrap-cvm = 0.056895, p-value = 0.49 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +\subsection{The multivariate kurtosis test}\label{the-multivariate-kurtosis-test} + +The literature contains some procedures to test the null hypothesis that a multivariate stochastic process is Gaussian. Those include \citet{moulines1992testing}, a test based on the characteristic function, and \citet{Steinberg1992}, a test based on properties of the entropy of Gaussian processes that does not make use of cumulant computations. According to \citet{el2022normality}, these tests may hardly be executable in real time. Consequently, they propose a test based on multivariate kurtosis \citep{mardia1970measures}. 
The proposed procedure is for \(p=1,2,\) and we elaborate on it in what follows. In Section 6.3 of \citet{el2022normality}, they suggest applying random projections for higher dimensions but they do not investigate the procedure any further. + +The p-value of this test is obtained as \(2(1-F_N(z))\) where, as above, \(F_N\) denotes the standard normal cumulative distribution function. There, +\[ + z:=(\hat{B}_p-E[\hat{B}_p])/\sqrt{E[(\hat{B}_p-E[\hat{B}_p])^2]}, + \] +where +\[ + \hat{B}_p:=n^{-1}\sum_{t=1}^n(x_t^t \hat{S}^{-1}x_t)^2, + \] +and +\[ + \hat{S}:=n^{-1}\sum_{t=1}^n x_t x_t^t. +\] +In \citet{el2022normality}, the reader can find the exact computations of \(E[\hat{B}_p]\) and \(E[(\hat{B}_p-E[\hat{B}_p])^2].\) + +This test is implemented in the \texttt{elbouch.test()} function. By default, the function computes the univariate El Bouch test. If the user provides a secondary data set, the function computes the bivariate counterpart. + +\paragraph{Example 5}\label{example-5} + +Simulate a two-dimensional stationary VAR(2) process using independent AR(1) and AR(2) processes with standard normal distributions and apply the bivariate El Bouch test. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality. 
+ +\begin{verbatim} +set.seed(23890) +x = arima.sim(250,model = list(ar = 0.2)) +y = arima.sim(250,model = list(ar = c(0.4,0,.1))) +elbouch.test(y = y,x = x) +#> +#> El Bouch, Michel & Comon's test +#> +#> data: w = (y, x) +#> Z = 0.92978, p-value = 0.1762 +#> alternative hypothesis: w = (y, x) does not follow a Gaussian Process +\end{verbatim} + +\section{Simulations and data analysis}\label{simulations-and-data-analysis} + +\subsection{Numerical experiments}\label{numerical-experiments} + +Inspired by the simulation studies in \citet{vavra2017} and \citet{nietoreyes2014}, we propose here a procedure that involves drawing data from the \(AR(1)\) process +\begin{equation} + X_t = \phi X_{t-1} + \epsilon_t, \ t \in\mathbb{Z}, \text{ for } \phi \in \{ 0,\pm 0.25,\pm 0.4\}, \label{eq:eqAR} +\end{equation} +where the \(\{\epsilon_t\}_{t\in\mathbb{Z}}\) are i.i.d. random variables. For the distribution of the \(\epsilon_t\) we consider different scenarios: standard normal (\(N\)), standard log-normal (\(\log N\)), Student t with 3 degrees of freedom (\(t_3\)), chi-squared with 10 degrees of freedom (\(\chi^2(10)\)) and gamma with \((7, 1)\) shape and scale parameters (\(\Gamma(7,1)\)). + +\begin{table}[!h] +\centering +\caption{\label{tab:tab1-static}Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit tests for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $\phi$ and $n$ displayed in the columns and different distributions for $\epsilon_t$ in the rows. $\phi \in \{0, \pm 0.25, \pm 0.4\}$, $n \in \{100, 250\}$. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.} +\centering +\resizebox{\ifdim\width>\linewidth\linewidth\else\width\fi}{!}{ +\begin{tabular}[t]{lrrrrrrrrrrrr} +\toprule +\multicolumn{1}{c}{ } & \multicolumn{6}{c}{n = 100} & \multicolumn{6}{c}{n = 250} \\ +\cmidrule(l{3pt}r{3pt}){2-7} \cmidrule(l{3pt}r{3pt}){8-13} +phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi\\ +\midrule +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Lobato and Velasco}}\\ +\hspace{1em}N & 0.041 & 0.044 & 0.047 & 0.032 & 0.035 & 0.769 & 0.059 & 0.037 & 0.054 & 0.040 & 0.037 & 0.646\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.610 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.653\\ +\hspace{1em}t3 & 0.797 & 0.853 & 0.902 & 0.875 & 0.829 & 0.627 & 0.990 & 0.994 & 0.998 & 0.999 & 0.983 & 0.674\\ +\hspace{1em}chisq10 & 0.494 & 0.698 & 0.770 & 0.707 & 0.610 & 0.620 & 0.930 & 0.995 & 0.998 & 0.997 & 0.977 & 0.657\\ +\hspace{1em}Gamma(7,1) & 0.995 & 1.000 & 0.999 & 0.996 & 0.988 & 0.634 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.665\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Epps}}\\ +\hspace{1em}N & 0.056 & 0.051 & 0.062 & 0.060 & 0.063 & 0.695 & 0.048 & 0.058 & 0.053 & 0.066 & 0.063 & 0.736\\ +\hspace{1em}logN & 0.908 & 0.917 & 0.972 & 0.985 & 0.984 & 0.729 & 1.000 & 1.000 & 1.000 & 0.999 & 1.000 & 0.777\\ +\hspace{1em}t3 & 0.243 & 0.291 & 0.370 & 0.317 & 0.248 & 0.722 & 0.776 & 0.872 & 0.908 & 0.881 & 0.780 & 0.769\\ +\hspace{1em}chisq10 & 0.267 & 0.440 & 0.548 & 0.469 & 0.360 & 0.699 & 0.611 & 0.850 & 0.930 & 0.866 & 0.721 & 0.739\\ +\hspace{1em}Gamma(7,1) & 0.866 & 0.961 & 0.996 & 0.993 & 0.965 & 0.722 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.782\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Random Projections}}\\ +\hspace{1em}N & 0.051 & 0.042 & 0.045 & 0.039 & 0.050 & 1.301 & 0.045 & 0.033 & 0.046 & 0.038 & 
0.050 & 1.905\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.330 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.906\\ +\hspace{1em}t3 & 0.790 & 0.863 & 0.879 & 0.823 & 0.727 & 1.320 & 0.982 & 0.994 & 0.995 & 0.991 & 0.975 & 1.949\\ +\hspace{1em}chisq10 & 0.589 & 0.730 & 0.757 & 0.640 & 0.542 & 1.295 & 0.957 & 0.994 & 0.994 & 0.969 & 0.888 & 1.926\\ +\hspace{1em}Gamma(7,1) & 0.998 & 1.000 & 1.000 & 0.998 & 0.989 & 1.308 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.963\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Psaradakis and Vavra}}\\ +\hspace{1em}N & 0.052 & 0.048 & 0.051 & 0.058 & 0.050 & 17.905 & 0.061 & 0.046 & 0.038 & 0.051 & 0.045 & 22.115\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 17.149 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 21.841\\ +\hspace{1em}t3 & 0.700 & 0.799 & 0.851 & 0.780 & 0.695 & 17.503 & 0.960 & 0.979 & 0.991 & 0.977 & 0.960 & 22.183\\ +\hspace{1em}chisq10 & 0.498 & 0.673 & 0.804 & 0.689 & 0.550 & 18.029 & 0.902 & 0.983 & 0.997 & 0.988 & 0.933 & 22.197\\ +\hspace{1em}Gamma(7,1) & 0.989 & 1.000 & 1.000 & 1.000 & 0.998 & 18.467 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 22.292\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Lobato}}\\ +\hspace{1em}N & 0.057 & 0.052 & 0.047 & 0.059 & 0.052 & 37.141 & 0.035 & 0.049 & 0.048 & 0.058 & 0.049 & 40.532\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 32.509 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 40.793\\ +\hspace{1em}t3 & 0.797 & 0.867 & 0.899 & 0.869 & 0.809 & 32.755 & 0.989 & 0.994 & 0.996 & 0.996 & 0.989 & 41.158\\ +\hspace{1em}chisq10 & 0.567 & 0.729 & 0.801 & 0.745 & 0.649 & 32.242 & 0.942 & 0.990 & 1.000 & 0.994 & 0.963 & 40.950\\ +\hspace{1em}Gamma(7,1) & 0.999 & 1.000 & 1.000 & 0.998 & 0.991 & 31.763 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 41.277\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Epps}}\\ +\hspace{1em}N & 0.047 & 0.053 & 0.048 & 0.052 & 0.044 & 57.749 & 0.058 & 0.052 & 0.053 & 0.048 & 0.043 & 
65.367\\ +\hspace{1em}logN & 0.846 & 0.877 & 0.963 & 0.974 & 0.959 & 56.756 & 1.000 & 1.000 & 1.000 & 1.000 & 0.999 & 65.968\\ +\hspace{1em}t3 & 0.183 & 0.238 & 0.313 & 0.230 & 0.196 & 57.350 & 0.752 & 0.863 & 0.913 & 0.841 & 0.754 & 65.699\\ +\hspace{1em}chisq10 & 0.252 & 0.364 & 0.527 & 0.450 & 0.358 & 56.627 & 0.596 & 0.813 & 0.913 & 0.854 & 0.685 & 65.369\\ +\hspace{1em}Gamma(7,1) & 0.816 & 0.948 & 0.993 & 0.979 & 0.931 & 56.986 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 65.315\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{El Bouch}}\\ +\hspace{1em}N & 0.040 & 0.047 & 0.044 & 0.033 & 0.050 & 0.798 & 0.040 & 0.054 & 0.052 & 0.061 & 0.059 & 1.020\\ +\hspace{1em}logN & 0.990 & 0.998 & 0.998 & 0.995 & 0.980 & 0.805 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.025\\ +\hspace{1em}t3 & 0.833 & 0.883 & 0.928 & 0.886 & 0.846 & 0.824 & 0.996 & 0.999 & 0.998 & 0.998 & 0.991 & 1.044\\ +\hspace{1em}chisq10 & 0.041 & 0.152 & 0.281 & 0.155 & 0.046 & 0.812 & 0.062 & 0.386 & 0.597 & 0.388 & 0.065 & 1.031\\ +\hspace{1em}Gamma(7,1) & 0.833 & 0.905 & 0.929 & 0.898 & 0.818 & 0.818 & 0.993 & 0.998 & 0.999 & 0.995 & 0.989 & 1.042\\ +\bottomrule +\end{tabular}} +\end{table} + +As in \citet{vavra2017}, \(m=1,000\) independent draws of the above process are generated for each pair of parameter \(\phi\) and distribution. Each draw is taken of length \(past+n,\) with \(past=500\) and \(n \in \{100,250,500,1000 \}\). The first 500 data points of each realization are then discarded in order to eliminate start-up effects. The \(n\) remaining data points are used to compute the value of the test statistic of interest. In each particular scenario, the rejection rate is obtained by computing the proportion of times that the test is rejected among the \(m\) trials. + +\begin{table}[!h] +\centering +\caption{\label{tab:tab2-static}Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. 
The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.} +\centering +\resizebox{\ifdim\width>\linewidth\linewidth\else\width\fi}{!}{ +\begin{tabular}[t]{lrrrrrrrrrrrr} +\toprule +\multicolumn{1}{c}{ } & \multicolumn{6}{c}{n = 500} & \multicolumn{6}{c}{n = 1,000} \\ +\cmidrule(l{3pt}r{3pt}){2-7} \cmidrule(l{3pt}r{3pt}){8-13} +phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi\\ +\midrule +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Lobato and Velasco}}\\ +\hspace{1em}N & 0.041 & 0.035 & 0.052 & 0.035 & 0.049 & 0.729 & 0.048 & 0.050 & 0.040 & 0.062 & 0.040 & 1.065\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.743 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.076\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.844 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.116\\ +\hspace{1em}chisq10 & 0.999 & 1.000 & 1.000 & 1.000 & 1.000 & 0.824 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.082\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.825 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.105\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Epps}}\\ +\hspace{1em}N & 0.048 & 0.046 & 0.056 & 0.065 & 0.050 & 0.905 & 0.034 & 0.038 & 0.046 & 0.033 & 0.059 & 1.182\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.931 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.294\\ +\hspace{1em}t3 & 0.991 & 0.994 & 0.996 & 0.997 & 0.985 & 0.936 & 1.000 & 0.998 & 1.000 & 1.000 & 0.999 & 1.235\\ +\hspace{1em}chisq10 & 0.924 & 0.991 & 0.999 & 0.991 & 0.969 & 0.917 & 0.997 & 1.000 & 1.000 & 1.000 & 1.000 & 1.202\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 
0.873 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.239\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Random Projections}}\\ +\hspace{1em}N & 0.044 & 0.043 & 0.040 & 0.040 & 0.048 & 2.723 & 0.021 & 0.027 & 0.043 & 0.043 & 0.047 & 4.544\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.759 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.588\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.755 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.531\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 0.998 & 2.782 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.520\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.843 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.527\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Psaradakis and Vavra}}\\ +\hspace{1em}N & 0.048 & 0.050 & 0.045 & 0.053 & 0.039 & 26.957 & 0.055 & 0.045 & 0.047 & 0.043 & 0.033 & 37.993\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.209 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.282\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 26.599 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.642\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.418 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.731\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.659 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 38.232\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Lobato}}\\ +\hspace{1em}N & 0.055 & 0.048 & 0.053 & 0.037 & 0.035 & 53.110 & 0.050 & 0.046 & 0.067 & 0.049 & 0.047 & 72.528\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.632 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 71.845\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.763 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 71.454\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.455 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 73.413\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 53.204 
& 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 72.253\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Epps}}\\ +\hspace{1em}N & 0.051 & 0.043 & 0.033 & 0.043 & 0.051 & 78.920 & 0.055 & 0.054 & 0.056 & 0.044 & 0.064 & 101.883\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 78.194 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 101.753\\ +\hspace{1em}t3 & 0.979 & 0.995 & 0.998 & 0.996 & 0.985 & 79.735 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 100.766\\ +\hspace{1em}chisq10 & 0.911 & 0.986 & 0.996 & 0.995 & 0.945 & 80.841 & 0.997 & 1.000 & 1.000 & 1.000 & 0.998 & 101.250\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 78.688 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 101.360\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{El Bouch}}\\ +\hspace{1em}N & 0.065 & 0.053 & 0.047 & 0.061 & 0.059 & 1.419 & 0.055 & 0.064 & 0.051 & 0.048 & 0.045 & 2.467\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.435 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.500\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.453 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.492\\ +\hspace{1em}chisq10 & 0.100 & 0.609 & 0.871 & 0.609 & 0.076 & 1.439 & 0.176 & 0.858 & 0.984 & 0.865 & 0.173 & 2.470\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.444 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.483\\ +\bottomrule +\end{tabular}} +\end{table} + +Tables \ref{tab:tab1-static} and \ref{tab:tab2-static} present the rejection rate estimates. For every process of length \(n,\) the columns represent the used \(AR(1)\) parameter and the rows the distribution used to draw the process. The obtained results are consistent with those obtained in the publications where the different tests were proposed. As expected, rejection rates are around 0.05 when the data is drawn from a standard normal distribution, as in this case the data is drawn from a Gaussian process. 
Conversely, high rejection rates are registered for the other distributions. Low rejection rates are observed, however, for the \(\chi^2(10)\) distribution when making use of some of the tests. This happens, for instance, with the \emph{Epps} and \emph{bootstrap Epps} tests, although their rejection rates consistently tend to 1 when the length of the process, \(n,\) increases. Another case is the El Bouch test. However, this one maintains low rates for large values of \(|\phi|\) when \(n\) increases. Furthermore, for the random projections test, the number of projections used in this study is the default \(k = 1,\) which is by far a lower number than that recommended by \citet{nietoreyes2014}. However, even in these conditions, the obtained results are satisfactory, with the random projection test having even better performance than the tests of \citet{epps1987} or \citet{vavra2017}. + +An important aspect in selecting a procedure is its computation time. Thus, for each length of the process, \(n,\) there is an additional column, max.phi, in \emph{Tables} \ref{tab:tab1-static} and \ref{tab:tab2-static}. Each entry in this column refers to a different distribution and contains the maximum running time in seconds to obtain the rejection rate among the different values of the AR parameter. That is, for a fixed distribution, the rejection rates are computed for each of the five possibilities of \(\phi\) and the time that it takes is recorded. The running time in the table is the largest among the five. Furthermore, in \textit{Table} \ref{tab:tab3-static} we show the time in seconds that each studied test takes to check whether a given process is Gaussian. In particular, the table contains the average running time over 1,000 trials that it takes to generate and check a Gaussian AR(1) process with parameter \(\phi = 0.5\). 
This is done for different sample sizes, \(n \in \{1000, 2000, 3000, 4000, 5000\}.\) According to the table, the asymptotic tests (Lobato and Velasco, Epps, random projections and El Bouch) have similar running times. On the contrary, the bootstrap based tests (Psaradakis and Vavra, Bootstrap Epps and Lobato and Velasco) have, as expected, higher running times on average. Furthermore, Tables \ref{tab:tab1-static} and \ref{tab:tab2-static} show similar results in time performance. There, the maximum running time of the bootstrap based tests exceeds by more than ten seconds the time obtained with the asymptotic based tests. It is worth saying that the tables have been obtained with R version 4.3.1 (2023-06-16) and platform aarch64-apple-darwin20 (64-bit), running under macOS Sonoma 14.2.1. + +\begin{table} + +\caption{\label{tab:tab3-static}Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). 
Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$} +\centering +\begin{tabular}[t]{lrrrrr} +\toprule +tests & n = 1000 & n = 2000 & n = 3000 & n = 4000 & n = 5000\\ +\midrule +Lobato and Velasco & 0.0010 & 0.0014 & 0.0020 & 0.0026 & 0.0035\\ +Epps & 0.0010 & 0.0015 & 0.0021 & 0.0027 & 0.0035\\ +Random Projections & 0.0026 & 0.0045 & 0.0063 & 0.0082 & 0.0105\\ +El Bouch & 0.0023 & 0.0046 & 0.0074 & 0.0109 & 0.0152\\ +Psaradakis and Vavra & 0.0286 & 0.0429 & 0.0565 & 0.0012 & 0.0014\\ +\addlinespace +Bootstrap Lobato & 0.0542 & 0.0014 & 0.0019 & 0.0025 & 0.0032\\ +Bootstrap Epps & 0.0013 & 0.0018 & 0.0023 & 0.0029 & 0.0037\\ +\bottomrule +\end{tabular} +\end{table} + +\subsection{Real data application}\label{real-data-application} + +As an illustrative example, we analyze the monthly mean carbon dioxide, in parts per million (\emph{ppm}), measured at the Mauna Loa Observatory, in Hawaii, from March 1958 to November 2018. The carbon dioxide data measured as the mole fraction in dry air on Mauna Loa constitute the longest record of direct measurements of \(CO2\) in the atmosphere. This dataset is available in the \CRANpkg{astsa} package \citep{astsa} under the name \emph{cardox} data and it is displayed in the left panel of Figure \ref{fig:fig1-static}. The plot's grid is created using the \CRANpkg{cowplot} package \citep{cowplot}. + +The objective of this subsection is to propose a model to analyze this time series and check the assumptions on the residuals of the model using our implemented \texttt{check\_residuals()} function. The time series clearly has trend and seasonal components (see left panel of Figure \ref{fig:fig1-static}), therefore, an adequate model that filters both components has to be selected. We make use of an ETS model. For its implementation, we make use the \texttt{ets()} function from the \CRANpkg{forecast} package \citep{Rob2007}. 
This function fits 32 different ETS models and selects the best model according to information criteria such as \emph{Akaike's information criterion} (AIC) or \emph{Bayesian Information criteria} (BIC) \citep{BIC2006}. +The results provided by the \texttt{ets()} function are: + +\begin{figure} + +{\centering \includegraphics[width=0.75\linewidth,alt={Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.}]{figures/fig1-static-1} + +} + +\caption{Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.}\label{fig:fig1-static} +\end{figure} + + + +\begin{verbatim} +library(forecast) +library(astsa) +model = ets(cardox) +summary(model) +#> ETS(M,A,A) +#> +#> Call: +#> ets(y = cardox) +#> +#> Smoothing parameters: +#> alpha = 0.5451 +#> beta = 0.0073 +#> gamma = 0.1076 +#> +#> Initial states: +#> l = 314.4546 +#> b = 0.0801 +#> s = 0.6986 0.0648 -0.8273 -1.8999 -3.0527 -2.7629 +#> -1.2769 0.7015 2.1824 2.6754 2.3317 1.165 +#> +#> sigma: 9e-04 +#> +#> AIC AICc BIC +#> 3429.637 3430.439 3508.867 +#> +#> Training set error measures: +#> ME RMSE MAE MPE MAPE MASE +#> Training set 0.018748 0.3158258 0.2476335 0.005051657 0.06933903 0.152935 +#> ACF1 +#> Training set 0.09308391 +\end{verbatim} + +The resulting model, proposed by the \texttt{ets()} function, for analyzing the \emph{carbon dioxide} data in \emph{Mauna Loa} is an \(ETS[M,A,A]\) model. The parameters \(\alpha, \beta \text{ and } \gamma\) (see Definition 1) have being estimated using the least squares method. 
If the assumptions on the model are satisfied, then the errors of the model behave like a Gaussian stationary process. To check it, we make use of the function \texttt{check\_residuals()}. For more details on the compatibility of this function with the models obtained by other packages see the \CRANpkg{nortsTest} repository. In the following, we display the results of using the \emph{Augmented Dickey-Fuller} test (\emph{Subsection 3.1}) to check the stationary assumption and the \emph{random projection} test with \texttt{k\ =\ 1} projections to check the normality assumption. For the other test options see the function's documentation. + +\begin{verbatim} +check_residuals(model,unit_root = "adf",normality = "rp", + plot = TRUE) +\end{verbatim} + +\begin{verbatim} +#> +#> *************************************************** +#> +#> Unit root test for stationarity: +#> +#> Augmented Dickey-Fuller Test +#> +#> data: y +#> Dickey-Fuller = -9.8935, Lag order = 9, p-value = 0.01 +#> alternative hypothesis: stationary +#> +#> +#> Conclusion: y is stationary +#> *************************************************** +#> +#> Goodness of fit test for Gaussian Distribution: +#> +#> k random projections test. +#> +#> data: y +#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1 +#> alternative hypothesis: y does not follow a Gaussian Process +#> +#> +#> Conclusion: y follows a Gaussian Process +#> +#> *************************************************** +\end{verbatim} + +The obtained results indicate that the null hypothesis of non stationarity is rejected at significance level \(\alpha = 0.01.\) Additionally, there is no evidence to reject the null hypothesis of normality at significance level \(\alpha = 0.05.\) Consequently, we conclude that the residuals follow a stationary Gaussian process, having that the resulting \(ETS[M,A,A]\) model adjusts well to the \emph{carbon dioxide} data in \emph{Mauna Loa}. 
+ +In the above displayed \texttt{check\_residuals()} function, the \texttt{plot} argument is set to \texttt{TRUE}. The resulting plots are shown in Figure \ref{fig:fig2-static}. The plot in the \emph{top} panel and the auto-correlation plots in the bottom panels insinuate that the residuals have a stationary behavior. The \emph{top} panel plot shows slight oscillations around zero and the auto-correlations functions in the \emph{bottom} panels have values close to zero in every lag. The histogram and qq-plot in the \emph{middle} panels suggest that the marginal distribution of the residuals is normally distributed. Therefore, Figure \ref{fig:fig2-static} agrees with the reported results, indicating that the assumptions of the model are satisfied. + +\begin{figure} + +{\centering \includegraphics[width=1\linewidth,alt={Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.}]{figures/fig2-static-1} + +} + +\caption{Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.}\label{fig:fig2-static} +\end{figure} + + + +As the assumptions of the model have been checked, it can be used for instance to forecast. 
The result of applying the following function is displayed in Figure \ref{fig:fig1-static}. It presents the carbon dioxide data for the last 8 years and a forecast of the next 12 months. It is observable from the plot that the model captures the process trend and periodicity. + +\begin{verbatim} +autoplot(forecast(model,h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: Carbon Dioxide Levels at Mauna Loa") +\end{verbatim} + + + +\section{Conclusions}\label{conclusions} + +For independent data, the \CRANpkg{nortest} package \citep{nortest2015} provides five different tests for normality, the \CRANpkg{mvnormtest} package \citep{mvnormtest2012} performs the Shapiro-Wilks test for multivariate data and the \CRANpkg{MissMech} package \citep{Mortaza2014} provides tests for normality in multivariate incomplete data. To test the normality of dependent data, some authors such as \citet{vavra2017} and \citet{nietoreyes2014} have available undocumented \texttt{Matlab} code, which is almost only helpful in re-doing their simulation studies. + +To our knowledge, no consistent implementation or package of tests for normality of stationary processes has been done before. Therefore, the \CRANpkg{nortsTest} is the first package to implement normality tests in stationary processes. This work gives a general overview of a careful selection of tests for normality in the stationary process, which consists of the most available types of tests. It additionally provides examples that illustrate each of the test implementations. + +For checking the model's assumptions, the \CRANpkg{forecast} and \CRANpkg{astsa} packages contain functions for visual diagnostic. Following the same idea, \CRANpkg{nortsTest} provides similar diagnostic methods; it also reports the results of testing stationarity and normality, the main assumptions for the residuals in time series analysis. 
+ +\section{Future work and projects}\label{future-work-and-projects} + +A further version of the \CRANpkg{nortsTest} package will incorporate additional tests such as Bispectral \citep{Hinich1982} and Stein's characterization \citep{Meddahi2005}. Further future work will include a Bayesian version of a \emph{residuals check} procedure that uses the random projection method. Any future version under development can be installed from \texttt{GitHub} using the following code. + +\begin{verbatim} +if (!requireNamespace("remotes")) install.packages("remotes") +remotes::install_github("asael697/nortsTest",dependencies = TRUE) +\end{verbatim} + +\section*{Acknowledgment}\label{acknowledgment} +\addcontentsline{toc}{section}{Acknowledgment} + +This work was supported by grant PID2022-139237NB-I00 funded by ``ERDF A way of making Europe'' and MCIN/AEI/10.13039/501100011033. + +\bibliography{RJreferences.bib} + +\address{% +Asael Alonzo Matamoros\\ +Aalto University\\% +Department of Computer Science\\ Espoo, Finland\\ +% +\url{https://asael697.github.io}\\% +% +\href{mailto:izhar.alonzomatamoros@aalto.fi}{\nolinkurl{izhar.alonzomatamoros@aalto.fi}}% +} + +\address{% +Alicia Nieto-Reyes\\ +Universidad de Cantabria\\% +Departamento de Matemáticas, Estadística y Computación\\ Avd. 
de los Castros s/n.~39005 Santander, Spain\\ +% +\url{https://orcid.org/0000-0002-0268-3322}\\% +% +\href{mailto:alicia.nieto@unican.es}{\nolinkurl{alicia.nieto@unican.es}}% +} + +\address{% +Claudio Agostinelli\\ +University of Trento\\% +Department of Mathematics\\ Via Sommarive, 14 - 38123 Povo\\ +% +\url{https://orcid.org/0000-0001-6702-4312}\\% +% +\href{mailto:claudio.agostinelli@unitn.it}{\nolinkurl{claudio.agostinelli@unitn.it}}% +} diff --git a/_articles/RJ-2024-008/RJournal.sty b/_articles/RJ-2024-008/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-008/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. + +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. 
\RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. + +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. 
See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-008/RJreferences.bib b/_articles/RJ-2024-008/RJreferences.bib new file mode 100644 index 0000000000..79fa2d1253 --- /dev/null +++ 
b/_articles/RJ-2024-008/RJreferences.bib @@ -0,0 +1,1115 @@ +@article{mardia1970measures, + title={Measures of multivariate skewness and kurtosis with applications}, + author={Mardia, Kanti V}, + journal={Biometrika}, + volume={57}, + number={3}, + pages={519--530}, + year={1970}, + url = {http://www.jstor.org/stable/2334770}, + publisher={Oxford University Press} +} + +@inproceedings{moulines1992testing, + title={Testing that a multivariate stationary time-series is {G}aussian}, + author={Moulines, E and Choukri, K and Sharbit, M}, + booktitle={[1992] IEEE Sixth SP Workshop on Statistical Signal and Array Processing}, + pages={185--188}, + year={1992}, + organization={IEEE}, + doi={10.1109/SSAP.1992.246818} +} + +@article{Steinberg1992, + author={Steinberg, Y. and Zeitouni, O.}, + journal={IEEE Transactions on Information Theory}, + title={On tests for normality}, + year={1992}, + volume={38}, + number={6}, + pages={1779-1787}, + doi={10.1109/18.165450} +} + +@article{el2022normality, + title={A normality test for multivariate dependent samples}, + author={El Bouch, Sara and Michel, Olivier and Comon, Pierre}, + journal={Signal Processing}, + volume={201}, + pages={108705}, + year={2022}, + doi = {10.1016/j.sigpro.2022.108705}, + publisher={Elsevier} +} +@article{meintanis2016review, + title={A review of testing procedures based on the empirical characteristic function}, + author={Meintanis, Simos G}, + journal={South African Statistical Journal}, + volume={50}, + number={1}, + pages={1--14}, + year={2016}, + doi = {10.10520/EJC186846}, + publisher={South African Statistical Association (SASA)} +} + +@article{hong1999hypothesis, + title={Hypothesis testing in time series via the empirical characteristic function: a generalized spectral density approach}, + author={Hong, Yongmiao}, + journal={Journal of the American Statistical Association}, + volume={94}, + number={448}, + pages={1201--1220}, + year={1999}, + doi={10.2307/2669935}, + publisher={Taylor \& Francis} 
+} + +@article{psaradakis2020normality, + title={Normality tests for dependent data: large-sample and bootstrap approaches}, + author={Psaradakis, Zacharias and V{\'a}vra, Mari{\'a}n}, + journal={Communications in statistics-simulation and computation}, + volume={49}, + number={2}, + pages={283--304}, + year={2020}, + doi = {10.1080/03610918.2018.1485941}, + publisher={Taylor \& Francis} +} +@article{nietoreyes2014, + title = {A random-projection based test of {G}aussianity for stationary processes}, + journal = {Computational Statistics \& Data Analysis}, + volume = {75}, + pages = {124 - 141}, + year = {2014}, + issn = {0167-9473}, + doi = {10.1016/j.csda.2014.01.013}, + author = {Alicia Nieto-Reyes and Juan Antonio Cuesta-Albertos and Fabrice Gamboa} +} +@article{Lobato2004, + author = {Lobato, Ignacio and Velasco, Carlos}, + year = {2004}, + month = {08}, + pages = {671-689}, + title = {A simple test of normality for time series}, + volume = {20}, + journal = {Econometric Theory}, + doi = {10.1017/S0266466604204030} +} +@article{vavra2017, + title = {A distance test of normality for a wide class of stationary processes}, + journal = {Econometrics and Statistics}, + volume = {2}, + pages = {50 - 60}, + year = {2017}, + issn = {2452-3062}, + doi = {10.1016/j.ecosta.2016.11.005}, + author = {Zacharias Psaradakis and Marián Vávra} +} +@article{epps1987, + author = {Epps, T. 
W.}, + doi = {10.1214/aos/1176350618}, + journal = {The Annals of Statistics}, + month = {12}, + number = {4}, + pages = {1683--1698}, + publisher = {The Institute of Mathematical Statistics}, + title = {Testing that a stationary time series is {G}aussian}, + volume = {15}, + year = {1987} +} +@article{Hinich1982, + author = {Hinich, Melvin J.}, + title = {Testing for {G}aussianity and linearity of a stationary time series}, + journal = {Journal of Time Series Analysis}, + volume = {3}, + number = {3}, + pages = {169-176}, + keywords = {Bispectrum, skewness, time series, spectrum}, + doi = {10.1111/j.1467-9892.1982.tb00339}, + year = {1982} +} +@article{Berg2010, + title = {A bootstrap test for time series linearity}, + journal = {Journal of Statistical Planning and Inference}, + author = {Arthur Berg and Efstathios Paparoditis and Dimitris N. Politis}, + volume = {140}, + number = {12}, + pages = {3841 - 3857}, + year = {2010}, + issn = {0378-3758}, + doi = {10.1016/j.jspi.2010.04.047} +} +@article{Cuesta2007, + title = {The random projection method in goodness of fit for functional data}, + journal = {Computational Statistics \& Data Analysis}, + volume = {51}, + number = {10}, + pages = {4814 - 4831}, + year = {2007}, + issn = {0167-9473}, + doi = {10.1016/j.csda.2006.09.007}, + author = {J.A. Cuesta-Albertos and E. del Barrio and R. Fraiman and C. Matrán} +} +@article{Meddahi2005, + title = {Testing normality: a GMM approach}, + journal = {Journal of Econometrics}, + volume = {124}, + number = {1}, + pages = {149 - 186}, + year = {2005}, + issn = {0304-4076}, + doi = {10.1016/j.jeconom.2004.02.014}, + author = {Christian Bontemps and Nour Meddahi} +} +@article{Epps1983, + ISSN = {00063444}, + URL = {http://www.jstor.org/stable/2336512}, + author = {T. W. Epps and Lawrence B. 
Pulley}, + journal = {Biometrika}, + number = {3}, + pages = {723--726}, + publisher = {[Oxford University Press, Biometrika Trust]}, + title = {A test for normality based on the empirical characteristic function}, + volume = {70}, + year = {1983} +} +@article{Henze1990, + author={Henze, N.}, + title={An approximation to the limit distribution of the Epps-Pulley test statistic for normality}, + journal={Metrika}, + year={1990}, + month={Dec}, + day={01}, + volume={37}, + number={1}, + pages={7--18}, + issn={1435-926X}, + doi={10.1007/BF02613501}, +} +@article{Lomincki1961, + title = {Tests for departure from normality in the case of linear stochastic processes}, + author = {Lomnicki, Z.}, + year = {1961}, + journal = {Metrika: International Journal for Theoretical and Applied Statistics}, + volume = {4}, + number = {1}, + pages = {37-62}, + url = {https://EconPapers.repec.org/RePEc:spr:metrik:v:4:y:1961:i:1:p:37-62} +} +@article{Gasser1975, + ISSN = {00063444}, + URL = {http://www.jstor.org/stable/2335511}, + author = {Theo Gasser}, + journal = {Biometrika}, + number = {3}, + pages = {563--570}, + publisher = {[Oxford University Press, Biometrika Trust]}, + title = {Goodness-of-fit tests for correlated data}, + volume = {62}, + year = {1975} +} +@article{Pearson1895, + author = {Pearson, Karl and Henrici, Olaus Magnus Friedrich Erdmann}, + title = {X. {C}ontributions to the mathematical theory of evolution.-{II} {S}kew variation in homogeneous material}, + journal = {Philosophical Transactions of the Royal Society of London. 
(A.)}, + volume = {186}, + number = {}, + pages = {343-414}, + year = {1895}, + doi = {10.1098/rsta.1895.0010} +} +@article{Smirnov1948, + author = {Smirnov, N.}, + doi = {10.1214/aoms/1177730256}, + journal = {Annals of Mathematical Statistics}, + month = {06}, + number = {2}, + pages = {279--281}, + publisher = {The Institute of Mathematical Statistics}, + title = {Table for estimating the goodness of fit of empirical distributions}, + volume = {19}, + year = {1948} +} +@article{Test1977, + ISSN = {00063444}, + author = {E. S. Pearson and R. B. D'Agostino and K. O. Bowman}, + journal = {Biometrika}, + number = {2}, + pages = {231--246}, + publisher = {Oxford University Press, Biometrika Trust}, + title = {Tests for departure from normality: comparison of powers}, + volume = {64}, + doi={10.2307/2335689}, + year = {1977} +} +@article{jarque1980, + title = {Efficient tests for normality, homoscedasticity and serial independence of regression residuals}, + journal = {Economics Letters}, + volume = {6}, + number = {3}, + pages = {255 - 259}, + year = {1980}, + issn = {0165-1765}, + doi = {10.1016/0165-1765(80)90024-5}, + author = {Carlos M. Jarque and Anil K. Bera} +} +@article{anderson1952, + author = {Anderson, T. W. and Darling, D. A.}, + doi = {10.1214/aoms/1177729437}, + journal = {Annals of Mathematical Statistics}, + month = {06}, + number = {2}, + pages = {193--212}, + year = 1952, + publisher = {The Institute of Mathematical Statistics}, + title = {Asymptotic theory of certain goodness of fit criteria based on stochastic processes}, + volume = {23} +} +@article{vonMisses1962, + author = {T. W. 
Anderson}, + title = {{On the distribution of the two-sample Cramer-von Mises criterion}}, + volume = {33}, + journal = {The Annals of Mathematical Statistics}, + number = {3}, + publisher = {Institute of Mathematical Statistics}, + pages = {1148 -- 1159}, + year = {1962}, + doi = {10.1214/aoms/1177704477}, + URL = {https://doi.org/10.1214/aoms/1177704477} +} +@article{SWtest1965, + author = {Shapiro, S. S. and Wilk, M. B.}, + title = {An analysis of variance test for normality (complete samples)}, + journal = {Biometrika}, + volume = {52}, + number = {3-4}, + pages = {591-611}, + year = {1965}, + month = {12}, + issn = {0006-3444}, + doi = {10.1093/biomet/52.3-4.591}, +} +@article{Royston1982, + ISSN = {00359254, 14679876}, + URL = {http://www.jstor.org/stable/2347973}, + author = {J. P. Royston}, + journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)}, + number = {2}, + pages = {115--124}, + publisher = {Wiley, Royal Statistical Society}, + title = {An extension of {S}hapiro and {W}ilk's {W} test for normality to large samples}, + volume = {31}, + year = {1982} +} +@article{Royston1992, + URL = {https://doi.org/10.1007/BF01891203}, + author = {J. P. 
Royston}, + journal = {Journal of Statistics and Computing}, + number = {3}, + pages = {117--119}, + publisher = {Springer Link}, + title = {Approximating the Shapiro-Wilk {W}-test for non-normality}, + volume = {2}, + year = {1992} +} +@article{George2003, + author = {George Marsaglia and Wai Wan Tsang and Jingbo Wang}, + title = {Evaluating Kolmogorov's distribution}, + journal = {Journal of Statistical Software, Articles}, + volume = {8}, + number = {18}, + year = {2003}, + issn = {1548-7660}, + pages = {1--4}, + doi = {10.18637/jss.v008.i18} +} +@article{bai2005, + author = {Jushan Bai and Serena Ng}, + title = {Tests for skewness, kurtosis, and normality for time series data}, + journal = {Journal of Business \& Economic Statistics}, + volume = {23}, + number = {1}, + pages = {49-60}, + year = {2005}, + publisher = {Taylor & Francis}, + doi = {10.1198/073500104000000271} +} +@TechReport{MarianZach2017, + author={Zacharias Psaradakis}, + title={Normality tests for dependent data}, + year={2017}, + institution={Research Department, National Bank of Slovakia}, + type={Working and Discussion Papers}, + url={https://ideas.repec.org/p/svk/wpaper/1053.html}, + number={WP 12/2017} +} +@article{engle1982, + ISSN = {00129682, 14680262}, + URL = {http://www.jstor.org/stable/1912773}, + author = {Robert F. Engle}, + journal = {Econometrica}, + number = {4}, + pages = {987--1007}, + publisher = {Wiley, Econometric Society}, + title = {Autoregressive conditional heteroscedasticity with estimates of the variance of United Kingdom inflation}, + volume = {50}, + year = {1982} +} +@article{Bollerslev1986, + title = {Generalized autoregressive conditional heteroskedasticity}, + journal = {Journal of Econometrics}, + volume = {31}, + number = {3}, + pages = {307 - 327}, + year = {1986}, + issn = {0304-4076}, + doi = {10.1016/0304-4076(86)90063-1}, + author = {Tim Bollerslev} +} +@article{arfima, + author = {HOSKING, J. R. 
M.}, + title = {Fractional differencing in autoregressive processes.}, + journal = {Biometrika}, + volume = {68}, + number = {1}, + pages = {165-176}, + year = {1981}, + month = {04}, + issn = {0006-3444}, + doi = {10.1093/biomet/68.1.165} +} +@Book{sarima, + title = {Periodic time series models}, + author = {Franses, Philip Hans and Paap, Richard}, + publisher = {Oxford University Press}, + year = {2004}, + url = {http://hdl.handle.net/1765/2036} +} +@ARTICLE{harima, + title = {Forecasting with dynamic regression models}, + author = {Kennedy, Peter}, + year = {1992}, + journal = {International Journal of Forecasting}, + volume = {8}, + number = {4}, + pages = {647-648}, + url = {https://EconPapers.repec.org/RePEc:eee:intfor:v:8:y:1992:i:4:p:647-648} +} +@article{mcleod1983, + author = {McLeod, A. I. and Li, W. K.}, + title = {Diagnostic checking ARMA time series models using squared-residual autocorrelations}, + journal = {Journal of Time Series Analysis}, + volume = {4}, + number = {4}, + pages = {269-273}, + doi = {10.1111/j.1467-9892.1983.tb00373.x}, + year = {1983} +} +@article{BIC2006, + author = {Chen, Jiahua and Chen, Zehua}, + title = {Extended Bayesian information criteria for model selection with large model spaces}, + journal = {Biometrika}, + volume = {95}, + number = {3}, + pages = {759-771}, + year = {2008}, + issn = {0006-3444}, + doi = {10.1093/biomet/asn034} +} +@misc{OBrien2010, + address = {Berlin}, + author = {Petris, Giovanni and Petrone, Sonia and Campagnoli, Patrizia}, + doi = {10.1111/j.1751-5823.2010.00109_26.x}, + isbn = {9780387772387 0387772383}, + issn = {03067734}, + number = {1}, + pages = {157--157}, + publisher = {Springer}, + title = {Dynamic linear models With {`R`}}, + volume = {78}, + year = {2007} +} +@inproceedings{Boris2012, + author = {Rogozhnikov, Andrey and Lemeshko, Boris}, + year = {2012}, + month = {10}, + title = {A review of tests for exponentiality}, + journal = {2012 11th International Conference on Actual Problems of Electronic
Instrument Engineering, APEIE 2012 - Proceedings}, + doi = {10.1109/APEIE.2012.6629166} +} +@misc{VanZyl2016, + title={The performance of univariate goodness-of-fit tests for normality based on the empirical characteristic function in large samples}, + author={J. Martin van Zyl}, + year={2016}, + eprint={1605.06293}, + archivePrefix={arXiv}, + primaryClass={stat.CO} +} +@article{Perron1988, + title = {Trends and random walks in macroeconomic time series: Further evidence from a new approach}, + journal = {Journal of Economic Dynamics and Control}, + volume = {12}, + number = {2}, + pages = {297 - 332}, + year = {1988}, + issn = {0165-1889}, + doi = {10.1016/0165-1889(88)90043-7}, + author = {Pierre Perron} +} +@article{Box, + author = { G.E.P. Box and David A. Pierce}, + title = {Distribution of residual autocorrelations in autoregressive-integrated moving average time series models}, + journal = {Journal of the American Statistical Association}, + volume = {65}, + number = {332}, + pages = {1509-1526}, + year = {1970}, + publisher = {Taylor & Francis}, + doi = {10.1080/01621459.1970.10481180} +} +@article{dickey1984, + author = {Said, Said E. and Dickey, David A.}, + title = {Testing for unit roots in autoregressive-moving average models of unknown order}, + journal = {Biometrika}, + volume = {71}, + number = {3}, + pages = {599-607}, + year = {1984}, + month = {12}, + issn = {0006-3444}, + doi = {10.1093/biomet/71.3.599} +} +@article{KppsI1992, + title = {Testing the null hypothesis of stationarity against the alternative of a unit root: How sure are we that economic time series have a unit root?}, + journal = {Journal of Econometrics}, + volume = {54}, + number = {1}, + pages = {159 - 178}, + year = {1992}, + issn = {0304-4076}, + doi = {10.1016/0304-4076(92)90104-Y}, + author = {Denis Kwiatkowski and Peter C.B. Phillips and Peter Schmidt and Yongcheol Shin} +} +@article{ocsb1988, + author = {Osborn, Denise R. and Chui, A. P. L. and Smith, Jeremy P.
and Birchenhall, C. R.}, + title = {Seasonality and the order of integration for consumption}, + journal = {Oxford Bulletin of Economics and Statistics}, + volume = {50}, + number = {4}, + pages = {361-377}, + doi = {10.1111/j.1468-0084.1988.mp50004002.x}, + year = {1988} +} +@article{Hegy1993, + title = {Seasonal unit roots in aggregate {U.S.} data}, + journal = {Journal of Econometrics}, + volume = {55}, + number = {1}, + pages = {305 - 328}, + year = {1993}, + issn = {0304-4076}, + doi = {10.1016/0304-4076(93)90018-Z}, + author = {Joseph Beaulieu and Jeffrey A. Miron} +} +@article{ch1995, + author = {Fabio Canova and Bruce E. Hansen}, + title = {Are seasonal patterns constant over time? A test for seasonal stability}, + journal = {Journal of Business \& Economic Statistics}, + volume = {13}, + number = {3}, + pages = {237-252}, + year = {1995}, + publisher = {Taylor & Francis}, + doi = {10.1080/07350015.1995.10524598} +} +@article{box1978, + author = {Ljung, G.M. and Box, G.E.P.}, + title = {On a measure of lack of fit in time series models}, + journal = {Biometrika}, + volume = {65}, + number = {2}, + pages = {297-303}, + year = {1978}, + month = {08}, + issn = {0006-3444}, + doi = {10.1093/biomet/65.2.297} +} +@book{Box1990, + author = {Box, George Edward Pelham and Jenkins, Gwilym}, + title = {Time series analysis, forecasting and control}, + year = {1990}, + isbn = {0816211043}, + publisher = {Holden-Day, Inc.}, + address = {USA}, + url = {https://www.wiley.com/en-us/Time+Series+Analysis} +} +@article{Donoho2006, + author = {Donoho, David and Jin, Jiashun}, + year = {2006}, + month = {02}, + title = {Asymptotic minimaxity of false discovery rate thresholding for sparse exponential data, technical report}, + volume = {34}, + journal = {Annals of Mathematical Statistics}, + doi = {10.1214/009053606000000920} +} +@article{Benjamin2001, + ISSN = {00905364}, + URL = {http://www.jstor.org/stable/2674075}, + author = {Yoav Benjamini and Daniel Yekutieli}, + journal 
= {The Annals of Statistics}, + number = {4}, + pages = {1165--1188}, + publisher = {Institute of Mathematical Statistics}, + title = {The control of the false discovery rate in multiple testing under dependency}, + volume = {29}, + year = {2001} +} +@article{Benjamin1995, + ISSN = {00359246}, + URL = {http://www.jstor.org/stable/2346101}, + author = {Yoav Benjamini and Yosef Hochberg}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + number = {1}, + pages = {289--300}, + publisher = {Royal Statistical Society, Wiley}, + title = {Controlling the false discovery rate: A practical and powerful approach to multiple testing}, + volume = {57}, + year = {1995} +} +@article{efron1987, + ISSN = {01621459}, + URL = {http://www.jstor.org/stable/2289144}, + author = {Bradley Efron}, + journal = {Journal of the American Statistical Association}, + number = {397}, + pages = {171--185}, + publisher = {American Statistical Association, Taylor & Francis, Ltd.}, + title = {Better bootstrap confidence intervals}, + volume = {82}, + year = {1987} +} +@article{efron1979, + author = {Efron, B.}, + doi = {10.1214/aos/1176344552}, + journal = {The Annals of Statistics}, + month = {01}, + number = {1}, + pages = {1--26}, + publisher = {The Institute of Mathematical Statistics}, + title = {Bootstrap methods: Another look at the jackknife}, + volume = {7}, + year = {1979} +} +@article{Buhlmann1997, + ISSN = {13507265}, + URL = {http://www.jstor.org/stable/3318584}, + author = {Peter Bühlmann}, + journal = {Bernoulli}, + number = {2}, + pages = {123--148}, + publisher = {International Statistical Institute (ISI) and Bernoulli Society for Mathematical Statistics and Probability}, + title = {Sieve bootstrap for time series}, + volume = {3}, + year = {1997} +} +@article{Dagostino1987, + author = {Ralph B. D'Agostino and Michael A. 
Stephens}, + title = {Goodness-of-fit techniques}, + journal = {Quality and Reliability Engineering International}, + volume = {3}, + number = {1}, + pages = {71-71}, + doi = {10.1002/qre.4680030121}, + year = {1986} +} +@book{Ts2010, + address = {Chicago}, + author = {Tsay, R.}, + doi = {10.1002/0471264105}, + edition = {Second}, + isbn = {978-0470414354}, + issn = {0040-1706}, + pages = {605}, + pmid = {10118702}, + publisher = {Wiley-Interscience}, + title = {Analysis of financial time series}, + year = {2010} +} +@book{shumway2010, + title={Time series analysis and its applications: with {`R`} examples}, + author={Shumway, R.H. and Stoffer, D.S.}, + isbn={9781441978646}, + lccn={2011287083}, + series={Springer Texts in Statistics}, + url={https://books.google.es/books?id=dbS5IQ8P5gYC}, + year={2010}, + publisher={Springer New York} +} +@book{Casella, + added-at = {2009-10-28T04:42:52.000+0100}, + author = {Casella, George and Berger, Roger}, + isbn = {0534243126}, + url = {http://www.amazon.fr/exec/obidos/ASIN/0534243126/citeulike04-21}, + publisher = {Duxbury Resource Center}, + title = {Statistical inference}, + year = {2001} +} +@book{degroot2012, + title={Probability and statistics}, + author={DeGroot, M.H.
and Schervish, M.J.}, + isbn={9780321500465}, + lccn={2010001486}, + url={https://books.google.es/books?id=4TlEPgAACAAJ}, + year={2012}, + publisher={Addison-Wesley} +} +@Inbook{Johnstone1987, + author={Johnstone, David}, + editor={Viertl, R.}, + title={On the interpretation of hypothesis tests following Neyman and Pearson}, + year={1987}, + publisher={Springer US}, + address={Boston, MA}, + pages={267--277}, + isbn={978-1-4613-1885-9}, + doi={10.1007/978-1-4613-1885-9_28} +} +@book{W2006, + address = {New York}, + author = {Wasserman, Larry.}, + doi = {10.1007/0-387-30623-4}, + isbn = {9780387251455}, + issn = {01621459}, + pages = {272}, + pmid = {10911016}, + publisher = {Springer}, + title = {All of nonparametric statistics}, + year = {2006} +} +@book{muller2015, + title={Bayesian nonparametric data analysis}, + author={Muller, P. and Quintana, F.A. and Jara, A. and Hanson, T.}, + isbn={9783319189680}, + lccn={2015943065}, + series={Springer Series in Statistics}, + url={https://books.google.nl/books?id=Ht\_yCQAAQBAJ}, + year={2015}, + publisher={Springer International Publishing} +} +@book{gelman2013, + title={Bayesian data analysis, third edition}, + author={Gelman, A. and Carlin, J.B. and Stern, H.S. and Dunson, D.B. and Vehtari, A. and Rubin, D.B.}, + isbn={9781439840955}, + lccn={2013039507}, + series={Chapman \& Hall/CRC Texts in Statistical Science}, + url={https://books.google.nl/books?id=ZXL6AQAAQBAJ}, + year={2013}, + publisher={Taylor \& Francis} +} +@book{west2006, + title={Bayesian forecasting and dynamic models}, + author={West, M. 
and Harrison, J.}, + isbn={9780387227771}, + lccn={96038166}, + series={Springer Series in Statistics}, + url={https://books.google.nl/books?id=0mPgBwAAQBAJ}, + year={2006}, + publisher={Springer New York} +} +@book{Hyndman2008, + title = {Forecasting with exponential smoothing: The state space approach}, + author = {Hyndman, Robin John and Koehler, Anne B and Ord, J Keith and Snyder, Ralph David}, + year = {2008}, + language = {English}, + isbn = {9783540719168}, + publisher = {Springer}, + doi = {10.1111/j.1751-5823.2009.00085_17} +} +@book{davison1997, + place={Cambridge}, + series={Cambridge Series in Statistical and Probabilistic Mathematics}, + title={Bootstrap methods and their application}, + doi={10.1017/CBO9780511802843}, + publisher={Cambridge University Press}, + author={Davison, A. C. and Hinkley, D. V.}, + year={1997}, + collection={Cambridge Series in Statistical and Probabilistic Mathematics} +} +@Manual{R, + title = {{`R`}: A language and environment for statistical computing}, + author = {{`R`} Core Team}, + organization = { {`R`} Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2018}, + url = {https://www.R-project.org/} +} +@article{Rob2007, + author = {Rob Hyndman and Yeasmin Khandakar}, + title = {Automatic time series forecasting: The `forecast` package for {`R`}}, + journal = {Journal of Statistical Software, Articles}, + volume = {27}, + number = {3}, + year = {2008}, + issn = {1548-7660}, + pages = {1--22}, + doi = {10.18637/jss.v027.i03} +} +@Manual{fGarch, + title = {`fGarch`: Rmetrics - autoregressive conditional heteroskedastic modelling}, + author = {Diethelm Wuertz and Tobias Setz and Yohan Chalabi and Chris Boudt and Pierre Chausse and Michal Miklovac}, + year = {2017}, + note = {{`R`} package version 3042.83}, + url = {https://CRAN.R-project.org/package=fGarch} +} +@Book{ggplot2, + author = {Hadley Wickham}, + title = {`ggplot2`: Elegant graphics for data analysis}, + publisher = {Springer-Verlag New 
York}, + year = {2009}, + isbn = {978-0-387-98140-6}, + url = {http://ggplot2.org} +} +@Manual{aTSA, + title = {`aTSA`: Alternative time series analysis}, + author = {Debin Qiu}, + year = {2015}, + note = {{`R`} package version 3.1.2}, + url = {https://CRAN.R-project.org/package=aTSA} +} +@article{Petris2010, + author = {Giovanni Petris}, + title = {An {`R`} package for dynamic linear models}, + journal = {Journal of Statistical Software, Articles}, + volume = {36}, + number = {12}, + year = {2010}, + issn = {1548-7660}, + pages = {1--16}, + doi = {10.18637/jss.v036.i12} +} +@Manual{astsa, + title = {`astsa`: Applied statistical time series analysis}, + author = {David Stoffer}, + year = {2020}, + note = {{`R`} package version 1.10}, + url = {https://CRAN.R-project.org/package=astsa} +} +@Manual{nortest2015, + title = {`nortest`: Tests for normality}, + author = {Juergen Gross and Uwe Ligges}, + year = {2015}, + note = {{`R`} package version 1.0-4}, + url = {https://CRAN.R-project.org/package=nortest} +} +@article{Mortaza2014, + title = {`MissMech`: An {`R`} package for testing homoscedasticity, Multivariate normality, and missing completely at random (MCAR)}, + author = {Mortaza Jamshidian and Siavash Jalal and Camden Jansen}, + journal = {Journal of Statistical Software}, + year = {2014}, + volume = {56}, + number = {6}, + pages = {1-31}, + url = {http://www.jstatsoft.org/v56/i06/} +} +@Manual{mvnormtest2012, + title = {`mvnormtest`: Normality test for multivariate variables}, + author = {Slawomir Jarek}, + year = {2012}, + note = {{`R`} package version 0.1-9}, + url = {https://CRAN.R-project.org/package=mvnormtest} +} +@Manual{tseries, + title = {`tseries`: Time series analysis and computational finance}, + author = {Adrian Trapletti and Kurt Hornik}, + year = {2019}, + note = {{`R`} package version 0.10-47.}, + url = {https://CRAN.R-project.org/package=tseries} +} +@Manual{uroot, + title = {`uroot`: Unit root tests for seasonal time series}, + author = {Javier 
López-de-Lacalle}, + year = {2019}, + note = {{`R`} package version 2.1-0}, + url = {https://CRAN.R-project.org/package=uroot} +} +@article{Holt2004, + title = {Forecasting seasonals and trends by exponentially weighted moving averages}, + journal = {International Journal of Forecasting}, + author = {Charles C. Holt}, + volume = {20}, + number = {1}, + pages = {5 - 10}, + year = {2004}, + issn = {0169-2070}, + doi = {10.1016/j.ijforecast.2003.09.015} +} +@article{Gabry2019, + author = {Gabry, Jonah and Simpson, Daniel and Vehtari, Aki and Betancourt, Michael and Gelman, Andrew}, + title = {Visualization in Bayesian workflow}, + journal = {Journal of the Royal Statistical Society: Series A (Statistics in Society)}, + volume = {182}, + number = {2}, + pages = {389-402}, + doi = {10.1111/rssa.12378}, + year = {2019} +} +@article{Guttman1967, + ISSN = {00359246}, + URL = {http://www.jstor.org/stable/2984569}, + author = {Irwin Guttman}, + journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, + number = {1}, + pages = {83--100}, + publisher = {[Royal Statistical Society, Wiley]}, + title = {The use of the concept of a future observation in goodness-of-fit problems}, + volume = {29}, + year = {1967} +} +@article{Vehtari2016, + title={Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC}, + volume={27}, + ISSN={1573-1375}, + doi={10.1007/s11222-016-9696-4}, + number={5}, + journal={Statistics and Computing}, + publisher={Springer Science and Business Media LLC}, + author={Vehtari, Aki and Gelman, Andrew and Gabry, Jonah}, + year={2016}, + month={Aug}, + pages={1413–1432} +} +@Article{bridgsampling2020, + title = {`bridgesampling`: An {`R`} package for estimating normalizing constants}, + author = {Quentin F. 
Gronau and Henrik Singmann and Eric-Jan Wagenmakers}, + journal = {Journal of Statistical Software}, + year = {2020}, + volume = {92}, + number = {10}, + pages = {1--29}, + doi = {10.18637/jss.v092.i10}, + } +@misc{gronau2017, + title={A tutorial on bridge sampling}, + author={Quentin F. Gronau and Alexandra Sarafoglou and Dora Matzke and Alexander Ly and Udo Boehm and Maarten Marsman and David S. Leslie and Jonathan J. Forster and Eric-Jan Wagenmakers and Helen Steingroever}, + year={2017}, + eprint={1703.05984}, + archivePrefix={arXiv}, + primaryClass={stat.CO} +} +@article{bayesfactor, + ISSN = {01621459}, + URL = {http://www.jstor.org/stable/2291091}, + author = {Robert E. Kass and Adrian E. Raftery}, + journal = {Journal of the American Statistical Association}, + number = {430}, + pages = {773--795}, + publisher = {American Statistical Association, Taylor & Francis, Ltd.}, + title = {Bayes factors}, + volume = {90}, + year = {1995} +} +@article{watanabe, + author = {Sumio Watanabe}, + journal = {Journal of Machine Learning Research}, + volume ={11}, + year = {2010}, + title={Asymptotic equivalence of Bayes cross validation and widely applicable information criterion in singular learning theory}, + url = {http://www.jmlr.org/papers/volume11/watanabe10a/watanabe10a.pdf} +} +@article{David, + author = {Spiegelhalter, David J. and Best, Nicola G. and Carlin, Bradley P. 
and Van Der Linde, Angelika}, + title = {Bayesian measures of model complexity and fit}, + journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume = {64}, + number = {4}, + pages = {583-639}, + doi = {10.1111/1467-9868.00353}, + year = {2002} +} +@misc{vehtari, + title={Pareto smoothed importance sampling}, + author={Aki Vehtari and Daniel Simpson and Andrew Gelman and Yuling Yao and Jonah Gabry}, + year={2015}, + eprint={1507.02646}, + archivePrefix={arXiv}, + primaryClass={stat.CO} +} +@article{plummer, + author = {Plummer, Martyn}, + title = {Penalized loss functions for Bayesian model comparison}, + journal = {Biostatistics}, + volume = {9}, + number = {3}, + pages = {523-539}, + year = {2008}, + month = {01}, + issn = {1465-4644}, + doi = {10.1093/biostatistics/kxm049} +} +@article{vanderLinde, + author = {van der Linde, Angelika}, + title = {DIC in variable selection}, + journal = {Statistica Neerlandica}, + volume = {59}, + number = {1}, + pages = {45-56}, + doi = {10.1111/j.1467-9574.2005.00278.x} +} +@article{johnson2004, + author = {Johnson, Valen E.}, + doi = {10.1214/009053604000000616}, + journal = {Annals of Statistics}, + month = {12}, + number = {6}, + pages = {2361--2384}, + publisher = {The Institute of Mathematical Statistics}, + title = {A Bayesian Chi test for goodness-of-fit}, + volume = {32}, + year = {2004} +} +@article{hoffman14, + author = {Matthew D. 
Hoffman and Andrew Gelman}, + title = {The No-U-Turn Sampler: Adaptively setting path lengths in Hamiltonian Monte Carlo}, + journal = {Journal of Machine Learning Research}, + year = {2014}, + volume = {15}, + pages = {1593-1623}, + url = {http://jmlr.org/papers/v15/hoffman14a.html} +} +@misc{betancourt2017, + title={A conceptual introduction to Hamiltonian Monte Carlo}, + author={Michael Betancourt}, + year={2017}, + eprint={1701.02434}, + archivePrefix={arXiv}, + primaryClass={stat.ME} +} +@article{Duane1987, + title = {Hybrid Monte Carlo}, + journal = {Physics Letters B}, + volume = {95}, + number = {2}, + pages = {216 - 222}, + year = {1987}, + issn = {0370-2693}, + doi = {https://doi.org/10.1016/0370-2693(87)91197-X"}, + author = {Duane, S, et al.} +} +@article{Pettit1986, + ISSN = {00390526, 14679884}, + URL = {http://www.jstor.org/stable/2987522}, + author = {L. I. Pettit}, + journal = {Journal of the Royal Statistical Society. Series D (The Statistician)}, + number = {2}, + pages = {183--190}, + publisher = {Royal Statistical Society, Wiley}, + title = {Diagnostics in Bayesian model choice}, + volume = {35}, + year = {1986} +} +@misc{matamoros2020, + title={varstan: An {`R`} package for Bayesian analysis of structured time series models with Stan}, + author={Izhar Asael Alonzo Matamoros and Cristian Andres Cruz Torres}, + year={2020}, + eprint={2005.10361}, + archivePrefix={arXiv}, + primaryClass={stat.CO} +} +@article{Nieto-Reyes:2022-1, + author = {Nieto-Reyes, Alicia}, + title = {On the non-{G}aussianity of the height of sea waves}, + journal = {Journal of Marine Science and Engineering}, + volume = {9}, + year = {2021}, + number = {12}, + article-number = {1446}, + url = {https://www.mdpi.com/2077-1312/9/12/1446}, + issn = {2077-1312}, +} +@article{Nieto-Reyes:2022-2, + author = {Nieto-Reyes, Alicia}, + title = {On the non-{G}aussianity of sea surface elevations}, + journal = {Journal of Marine Science and Engineering}, + volume = {10}, + year = 
{2022}, + number = {9}, + article-number = {1303}, + url = {https://www.mdpi.com/2077-1312/10/9/1303}, + issn = {2077-1312}, + doi = {10.3390/jmse10091303} +} +@Manual{mvntest, + title = {`mvnTest`: Goodness of fit tests for multivariate normality}, + author = {Natalya Pya and Vassilly Voinov and Rashid Makarov and Yevgeniy Voinov}, + year = {2016}, + note = {{`R`} package version 1.1-0}, + url = {https://CRAN.R-project.org/package=mvnTest} +} +@article{Royston1993, + author = {Royston, Patrick}, + title = {A pocket-calculator algorithm for the {S}hapiro-{F}rancia test for non-normality: An application to medicine}, + journal = {Statistics in Medicine}, + volume = {12}, + number = {2}, + pages = {181-184}, + doi = {10.1002/sim.4780120209}, + url ={https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.4780120209}, + year = {1993} +} +@article{Wilkinson1986, + author = {Gerard E. Dallal and Leland Wilkinson}, + title = {An analytic approximation to the distribution of Lilliefors's test statistic for normality}, + journal = {The American Statistician}, + volume = {40}, + number = {4}, + pages = {294-296}, + year = {1986}, + publisher = {Taylor & Francis}, + doi = {10.1080/00031305.1986.10475419}, + URL = {https://www.tandfonline.com/doi/abs/10.1080/00031305.1986.10475419} +} +@Article{DH2008, + author={Jurgen A. Doornik and Henrik Hansen}, + title={{An omnibus test for univariate and multivariate normality}}, + journal={Oxford Bulletin of Economics and Statistics}, + year=2008, + volume={70}, + number={s1}, + pages={927-939}, + month={December}, + keywords={}, + doi={10.1111/j.1468-0084.2008.}, + url={https://ideas.repec.org/a/bla/obuest/v70y2008is1p927-939.html} +} +@article{HZ1990, + author = {N. Henze and B. 
Zirkler}, + title = {A class of invariant consistent tests for multivariate normality}, + journal = {Communications in Statistics - Theory and Methods}, + volume = {19}, + number = {10}, + pages = {3595-3617}, + year = {1990}, + publisher = {Taylor & Francis}, + doi = {10.1080/03610929008830400}, + URL = {https://doi.org/10.1080/03610929008830400} +} +@article{S2_2016, + author = {Vassilly Voinov, Natalie Pya, Rashid Makarov and Yevgeniy Voinov}, + title = {New invariant and consistent Chi-squared type goodness-of-fit tests for multivariate normality and a related comparative simulation study}, + journal = {Communications in Statistics - Theory and Methods}, + volume = {45}, + number = {11}, + pages = {3249-3263}, + year = {2016}, + publisher = {Taylor & Francis}, + doi = {10.1080/03610926.2014.901370}, + URL = {https://doi.org/10.1080/03610926.2014.901370} +} +@Manual{cowplot, + title = {`cowplot`: Streamlined plot theme and plot annotations for `ggplot2`}, + author = {Claus O. Wilke}, + year = {2020}, + note = {{`R`} package version 1.1.1}, + url = {https://CRAN.R-project.org/package=cowplot} +} diff --git a/_articles/RJ-2024-008/RJwrapper.tex b/_articles/RJ-2024-008/RJwrapper.tex new file mode 100644 index 0000000000..a8ceaf8ec0 --- /dev/null +++ b/_articles/RJ-2024-008/RJwrapper.tex @@ -0,0 +1,70 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + + +% tightlist command for lists without linebreak +\providecommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} + +\usepackage{longtable} + +% Always define CSL refs as bib entries are contained in separate doc +% Pandoc citation processing +%From Pandoc 3.1.8 +% definitions for citeproc citations +\NewDocumentCommand\citeproctext{}{} +\NewDocumentCommand\citeproc{mm}{% + \begingroup\def\citeproctext{#2}\cite{#1}\endgroup} +\makeatletter + % allow citations to break across 
lines + \let\@cite@ofmt\@firstofone + % avoid brackets around text for \cite: + \def\@biblabel#1{} + \def\@cite#1#2{{#1\if@tempswa , #2\fi}} +\makeatother +\newlength{\cslhangindent} +\setlength{\cslhangindent}{1.5em} +\newlength{\csllabelwidth} +\setlength{\csllabelwidth}{3em} +\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing + {\begin{list}{}{% + \setlength{\itemindent}{0pt} + \setlength{\leftmargin}{0pt} + \setlength{\parsep}{0pt} + % turn on hanging indent if param 1 is 1 + \ifodd #1 + \setlength{\leftmargin}{\cslhangindent} + \setlength{\itemindent}{-1\cslhangindent} + \fi + % set entry spacing + \setlength{\itemsep}{#2\baselineskip}}} + {\end{list}} +\usepackage{calc} +\newcommand{\CSLBlock}[1]{#1\hfill\break} +\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} +\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} +\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} + + + +\begin{document} + + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{135} + +\begin{article} + \input{RJ-2024-008} +\end{article} + + +\end{document} diff --git a/_articles/RJ-2024-008/data/r_sim.Rdata b/_articles/RJ-2024-008/data/r_sim.Rdata new file mode 100644 index 0000000000..a82f38fed7 Binary files /dev/null and b/_articles/RJ-2024-008/data/r_sim.Rdata differ diff --git a/_articles/RJ-2024-008/data/runtime.Rdata b/_articles/RJ-2024-008/data/runtime.Rdata new file mode 100644 index 0000000000..58a3010138 Binary files /dev/null and b/_articles/RJ-2024-008/data/runtime.Rdata differ diff --git a/_articles/RJ-2024-008/figures/fig1-interactive-1.png b/_articles/RJ-2024-008/figures/fig1-interactive-1.png new file mode 100644 index 0000000000..135f1823a9 Binary files /dev/null and b/_articles/RJ-2024-008/figures/fig1-interactive-1.png differ diff --git a/_articles/RJ-2024-008/figures/fig1-interactive.svg 
b/_articles/RJ-2024-008/figures/fig1-interactive.svg new file mode 100644 index 0000000000..92c0ce4950 --- /dev/null +++ b/_articles/RJ-2024-008/figures/fig1-interactive.svg @@ -0,0 +1,596 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-008/figures/fig1-static-1.png b/_articles/RJ-2024-008/figures/fig1-static-1.png new file mode 100644 index 0000000000..400d022fa4 Binary files /dev/null and b/_articles/RJ-2024-008/figures/fig1-static-1.png differ diff --git a/_articles/RJ-2024-008/figures/fig1-static.svg b/_articles/RJ-2024-008/figures/fig1-static.svg new file mode 100644 index 0000000000..ed6a9691f3 --- /dev/null +++ b/_articles/RJ-2024-008/figures/fig1-static.svg @@ -0,0 +1,751 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-008/figures/fig2-interactive-1.png b/_articles/RJ-2024-008/figures/fig2-interactive-1.png new file mode 100644 index 0000000000..2b43840002 Binary files /dev/null and b/_articles/RJ-2024-008/figures/fig2-interactive-1.png differ diff --git a/_articles/RJ-2024-008/figures/fig2-interactive.svg b/_articles/RJ-2024-008/figures/fig2-interactive.svg new file mode 100644 index 0000000000..f93431feb1 --- /dev/null +++ b/_articles/RJ-2024-008/figures/fig2-interactive.svg @@ -0,0 +1,1592 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-008/figures/fig2-static-1.png b/_articles/RJ-2024-008/figures/fig2-static-1.png new file mode 100644 index 0000000000..96a16f435c Binary files /dev/null and b/_articles/RJ-2024-008/figures/fig2-static-1.png differ diff --git a/_articles/RJ-2024-008/figures/fig2-static.svg b/_articles/RJ-2024-008/figures/fig2-static.svg new file mode 100644 index 0000000000..073c63ea87 --- /dev/null +++ b/_articles/RJ-2024-008/figures/fig2-static.svg @@ -0,0 +1,1271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-008/figures/fig3-dynamic-1.png b/_articles/RJ-2024-008/figures/fig3-dynamic-1.png new file mode 100644 index 0000000000..d2878f5ad7 Binary files /dev/null and b/_articles/RJ-2024-008/figures/fig3-dynamic-1.png differ diff --git 
a/_articles/RJ-2024-008/figures/fig3-dynamic.svg b/_articles/RJ-2024-008/figures/fig3-dynamic.svg new file mode 100644 index 0000000000..fcc426f783 --- /dev/null +++ b/_articles/RJ-2024-008/figures/fig3-dynamic.svg @@ -0,0 +1,474 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_articles/RJ-2024-008/nortsTest.R b/_articles/RJ-2024-008/nortsTest.R new file mode 100644 index 0000000000..4b6ba4b0e5 --- /dev/null +++ b/_articles/RJ-2024-008/nortsTest.R @@ -0,0 +1,226 @@ +# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand +# Please edit nortsTest.Rmd to modify this file + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = FALSE, + warning = FALSE, + message = FALSE +) + +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "figures/", + dev = "png", + dpi = 150, + fig.asp = 0.8, + fig.width = 8, + fig.height = 4, + out.width = "60%", + fig.align = "center" +) + +library(kableExtra) +library(nortsTest) +library(fGarch) +library(knitr) +library(forecast) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ar =c(0.5,0.2)), + rand.gen = rbeta,shape1 = 9,shape2 = 1) + +# Asymptotic Epps test +epps.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2))) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +epps_bootstrap.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)), + rand.gen = rgamma, rate = 3, shape = 6) 
+# Asymptotic Lobato & Velasco +lobato.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +lobato_bootstrap.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(3468) +library(fGarch) +spec = garchSpec(model = list(alpha = 0.2, beta = 0.3)) +x = ts(garchSim(spec, n = 300)) +rp.test(x) + + +## ----echo = TRUE-------------------------------------------------------------- +set.seed(298) +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34)) +# Default, Psaradakis and Vavra's procedure +vavra.test(x, seed = 298) + + +## ----echo = TRUE-------------------------------------------------------------- +vavra.test(x, normality = "cvm", seed = 298) + + +## ----echo=TRUE---------------------------------------------------------------- +set.seed(23890) +x = arima.sim(250,model = list(ar = 0.2)) +y = arima.sim(250,model = list(ar = c(0.4,0,.1))) +elbouch.test(y = y,x = x) + + +## ----tab1-static, eval = knitr::is_latex_output(),warning = FALSE------------- +load("data/r_sim.Rdata") +phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") + +r1 = results1[,2:14] +colnames(r1) = c("phi", phi, phi) + +kable(r1, "latex", booktabs = TRUE,digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) + + +## ----tab1-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +#> load("data/r_sim.Rdata") +#> phi = c("-0.4","-0.25","0.0","0.25","0.4","max.phi") +#> +#> r1 = results1[,2:14] +#> colnames(r1) = c("phi", phi, phi) +#> +#> kable(r1, "html", booktabs = TRUE, digits = 3, caption = "Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. 
For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +#> kable_styling(latex_options = c("hold_position", "scale_down"))%>% +#> add_header_above(c(" " = 1, "n = 100" = 6, "n = 250" = 6))%>% +#> pack_rows("Lobato and Velasco", 1, 5) %>% +#> pack_rows("Epps", 6, 10) %>% +#> pack_rows("Random Projections", 11, 15) %>% +#> pack_rows("Psaradakis and Vavra", 16, 20)%>% +#> pack_rows("Bootstrap Lobato", 21, 25)%>% +#> pack_rows("Bootstrap Epps", 26, 30)%>% +#> pack_rows("El Bouch", 31, 35) + + +## ----tab2-static, eval = knitr::is_latex_output(),warning = FALSE------------- +r2 = results2[,2:14] +colnames(r2) = c("phi", phi, phi) + +kable(r2, "latex", booktabs = TRUE, digits = 3, caption = "Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +kable_styling(latex_options = c("hold_position", "scale_down"))%>% +add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +pack_rows("Lobato and Velasco", 1, 5) %>% +pack_rows("Epps", 6, 10) %>% +pack_rows("Random Projections", 11, 15) %>% +pack_rows("Psaradakis and Vavra", 16, 20)%>% +pack_rows("Bootstrap Lobato", 21, 25)%>% +pack_rows("Bootstrap Epps", 26, 30)%>% +pack_rows("El Bouch", 31, 35) + + +## ----tab2-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +#> r2 = results2[,2:14] +#> colnames(r2) = c("phi", phi, phi) +#> +#> kable(r2, "html", booktabs = TRUE, digits = 3, caption = "Part 2. 
Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.") %>% +#> kable_styling(latex_options = c("hold_position", "scale_down"))%>% +#> add_header_above(c(" " = 1, "n = 500" = 6, "n = 1,000" = 6))%>% +#> pack_rows("Lobato and Velasco", 1, 5) %>% +#> pack_rows("Epps", 6, 10) %>% +#> pack_rows("Random Projections", 11, 15) %>% +#> pack_rows("Psaradakis and Vavra", 16, 20)%>% +#> pack_rows("Bootstrap Lobato", 21, 25)%>% +#> pack_rows("Bootstrap Epps", 26, 30)%>% +#> pack_rows("El Bouch", 31, 35) + + +## ----tab3-static, eval = knitr::is_latex_output(),warning = FALSE------------- +load("data/runtime.Rdata") + +kable(runtime, "latex", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). 
Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") + + +## ----tab3-interactive, eval = knitr::is_html_output(),warning = FALSE--------- +#> load("data/runtime.Rdata") +#> +#> kable(runtime,"html", booktabs = TRUE, digits = 4, caption = "Average running time in seconds, over 1000 iterations, to compute the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column). Each iteration makes use of a Gaussian AR(1) process with parameter $phi = 0.5.$") + + +## ----fig1-static, fig.cap = "Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.", eval = knitr::is_latex_output(), fig.alt="(ref:demo-caption1)", out.width = "75%"---- +library(astsa) +g1 = autoplot(cardox, main = "CO2 levels at Mauna Loa", + xlab = "years", ylab = "CO2 (ppm)") +g2 = autoplot(forecast(ets(cardox), h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: CO2 Levels at Mauna Loa") +cowplot::plot_grid(g1,g2,ncol = 2) + + +## ----fig1-interactive, echo = knitr::is_html_output(), eval = knitr::is_html_output(),fig.cap="CO2 Levels at Mauna Loa, time-series plot. 
The cardox data show a positive tendency and strong seasonality."---- +#> library(astsa) +#> +#> autoplot(cardox, main = "Carbon Dioxide levels at Mauna Loa", +#> xlab = "years", ylab = "CO2 (ppm)") + + +## ----echo = TRUE-------------------------------------------------------------- +library(forecast) +library(astsa) +model = ets(cardox) +summary(model) + + +## ----echo = TRUE, eval = FALSE------------------------------------------------ +#> check_residuals(model,unit_root = "adf",normality = "rp", +#> plot = TRUE) + + +## ----echo = FALSE, eval = TRUE------------------------------------------------ +check_residuals(model,unit_root = "adf",normality = "rp", plot = FALSE) + + +## ----fig2-interactive, eval = knitr::is_html_output(), fig.cap = "Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity."---- +#> check_plot(model) + + +## ----fig2-static, fig.cap = "Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. 
The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.", eval = knitr::is_latex_output(), fig.alt= "(ref:demo-caption2)", out.width = "100%"---- +check_plot(model) + + +## ----fig3-dynamic, echo = knitr::is_html_output(), eval = knitr::is_html_output(), fig.cap = "Forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour."---- +#> autoplot(forecast(model,h = 12),include = 100, +#> xlab = "years",ylab = "CO2 (ppm)", +#> main = "Forecast: Carbon Dioxide Levels at Mauna Loa") + + +## ----echo = knitr::is_latex_output(), eval = FALSE, fig.cap = "(ref:demo-caption3)"---- +#> autoplot(forecast(model,h = 12),include = 100, +#> xlab = "years",ylab = "CO2 (ppm)", +#> main = "Forecast: Carbon Dioxide Levels at Mauna Loa") + + +## ----echo = TRUE, eval = FALSE------------------------------------------------ +#> if (!requireNamespace("remotes")) install.packages("remotes") +#> remotes::install_github("asael697/nortsTest",dependencies = TRUE) + diff --git a/_articles/RJ-2024-008/nortsTest.tex b/_articles/RJ-2024-008/nortsTest.tex new file mode 100644 index 0000000000..81c2508062 --- /dev/null +++ b/_articles/RJ-2024-008/nortsTest.tex @@ -0,0 +1,1095 @@ +% !TeX root = RJwrapper.tex +\title{nortsTest: An R Package for Assessing Normality of Stationary Processes} + + +\author{by Asael Alonzo Matamoros, Alicia Nieto-Reyes, and Claudio Agostinelli} + +\maketitle + +\abstract{% +Normality is the central assumption for analyzing dependent data in several time series models, and the literature has widely studied normality tests. However, the implementations of these tests are limited. The nortsTest package is dedicated to fill this void. 
The package performs the asymptotic and bootstrap versions of the tests of Epps and Lobato and Velasco and the tests of Psaradakis and Vavra, random projections and El Bouch for normality of stationary processes. These tests are for univariate stationary processes but for El Bouch that also allows bivariate stationary processes. In addition, the package offers visual diagnostics for checking stationarity and normality assumptions for the most used time series models in several R packages. This work aims to show the package's functionality, presenting each test performance with simulated examples and the package utility for model diagnostic in time series analysis. +} + +\hypertarget{introduction}{% +\section{Introduction}\label{introduction}} + +Normality (\emph{a set of observations sampled from a Gaussian process}) is an essential assumption in various statistical models. Therefore, developing procedures for testing this assumption is a topic that has gained popularity over several years. Most existing literature and implementation is dedicated to independent and identically distributed random variables (D'Agostino and Stephens 1986); no results show that these tests are consistent when applied to stationary processes. For this context, several tests have been proposed over the years, but as far as we know, no \texttt{R} package or consistent implementation exists. + +The proposed \CRANpkg{nortsTest} package provides seven test implementations to check normality of stationary processes. This work aims to present a review of these tests and introduce the package functionality. Thus, its novelty lies in being the first package and paper dedicated to the implementation of normality tests for stationary processes. 
The implemented tests are: (i) the asymptotic \emph{Epps} test, (Epps 1987) and (Nieto-Reyes, Cuesta-Albertos, and Gamboa 2014), based on the characteristic function and (ii) its sieve bootstrap approximation (Psaradakis and Vávra 2020), (iii) the corrected \emph{Skewness-Kurtosis} (SK) test implemented by Lobato and Velasco (2004) as an asymptotic test and (iv) by Psaradakis and Vávra (2020) with a sieve bootstrap approximation, (v) the \emph{random projections test} proposed by Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014), which makes use of the tests in (i) and (iii), (vi) the \emph{Psadarakis and Vávra test} (Psaradakis and Vávra 2017) that uses a bootstrap approximation of the Anderson and Darling (1952) test statistic for stationary linear processes and (vii) a normality test by El Bouch, Michel, and Comon (2022) for multivariate dependent samples. Tests (i) to (vi) are for univariate stationary processes. + +Furthermore, we propose the \texttt{check\_residual()} function for checking time-series models' assumptions. This function returns a report for stationarity, seasonality, normality tests and visual diagnostics. \texttt{check\_residual()} supports models from the most used packages for time-series analysis, such as the packages \CRANpkg{forecast} (R. Hyndman and Khandakar 2008) and \CRANpkg{aTSA} (Qiu 2015) and even functions in the base \texttt{R} (Team 2018); for instance, it supports the \texttt{HoltWinters} (stats \texttt{R} package) function for the Holt and Winters method (Holt 2004). In addition, the proposed \CRANpkg{nortsTest} package has already been applied in the literature, see Nieto-Reyes (2021) and Nieto-Reyes (2022). + +Section 2 provides the theoretical background, including preliminary concepts and results. Section 3 introduces the normality tests for stationary processes, each subsection introducing a test framework and including examples of the tests functions with simulated data. 
Section 4 provides numerical experiments with simulated data and a real-world application: Subsection 4.1 reports a simulation study for the implemented normality tests and Subsection 4.2 the package's functionality for model checking in a real data application. The \emph{carbon dioxide} data measured in the Malua Loa Observatory (Stoffer 2020) is analyzed using a state space model from the \CRANpkg{forecast} package, evaluating the model's assumptions using our proposed \texttt{check\_residuals()} function. Section 5 discusses the package functionality and provides our conclusions. Furthermore, we mention our future intended work on the package. + +\hypertarget{preliminary-concepts}{% +\section{Preliminary concepts}\label{preliminary-concepts}} + +This section provides some theoretical aspects of stochastic processes that are a necessary theoretical framework for the following sections. Shumway and Stoffer (2010) and Tsay (2010) give more details of the following definitions and results below. + +For the purpose of this work, \(T\) is a set of real values denoted as time, \(T \subseteq \mathbb{R},\) for instance \(T=\mathbb{N}\) or \(T=\mathbb{Z},\) the naturals or integer numbers respectively. We denote by \(X:=\{X_t\}_{t\in T}\) a \textit{stochastic process} with \(X_t\) a real random variable for each \(t\in T.\) Following this notation, a \textit{time-series} is just a finite collection of ordered observations of \(X\) (Shumway and Stoffer 2010). An important measure for a stochastic process is its mean function \(\mu(t) := E[X_t]\) for each \(t \in T\), where \(E[\cdot]\) denotes the usual expected value of a random variable. A generalization of this measure is the k-th order centered moment function \(\mu_k(t) := E[(X_t -\mu(t))^k]\) for each \(t \in T\) and \(k > 1;\) with the process variance function being the second order centered moment, \(\sigma^2(t) := \mu_2(t)\). 
Other important measures are the auto-covariance and auto-correlation functions, which measure the linear dependency between two different time points of a given process. For any \(t,s \in T,\) they are, respectively, +\[ +\gamma(t,s) := E[(X_t -\mu(t))(X_s - \mu(s))] \mbox{ and } \rho(t,s) := \dfrac{\gamma(t,s)}{\sqrt{\mu_2(t)}\sqrt{\mu_2(s)}}. +\] +Other widely used measure functions for the analysis of processes are the skewness and kurtosis functions, defined as \(s(t) := \mu_3(t)/[\mu_2(t)]^{3/2}\) and \(k(t) := \mu_4(t)/[\mu_2(t)]^2\) for each \(t\in T,\) respectively. + +A generally used assumption for stochastic processes is stationarity. It has a key role in forecasting procedures of classic time-series modeling (Tsay 2010) or as a principal assumption in de-noising methods for signal theory (Wasserman 2006). + +\hypertarget{definition-1}{% +\paragraph{Definition 1}\label{definition-1}} + +A stochastic process \(X\) is said to be \emph{strictly stationary} if, for every collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T\) and \(h > 0\), the joint distribution of \(\{X_t\}_{t \in \tau}\) is identical to that of \(\{X_{t+h}\}_{t \in \tau}.\) + +The previous definition is strong for applications. A milder version of it, which makes use of the process' first two moments, is weak stationarity. + +\hypertarget{definition-2}{% +\paragraph{Definition 2}\label{definition-2}} + +A stochastic process \(X\) is said to be \emph{weakly stationary} if its mean function is constant in time, \(\mu(t) = \mu\), its auto-covariance function only depends on the difference between times, \(\gamma(s,t) = \sigma|t-s|\) for a \(\sigma\in \mathbb{R}\), and it has a finite variance function, \(\mu_2(t) = \mu_2 < \infty\). + +For the rest of this work, the term \emph{stationary} will be used to specify a weakly stationary process. A direct consequence of the stationarity assumption is that the previous measure functions get simplified. 
Thus, given a stationary stochastic process \(X,\) its mean function, \(k\)-th order centered moment, for \(k>1,\) and auto-covariance function are respectively, +\[ + \mu = E[X_{t_1}]\mbox{, } \mu_k = E[(X_{t_1} -\mu)^k] \mbox{ and } \gamma(h) = E[(X_{t_1+h}-\mu)(X_{t_1}-\mu)], +\] +which are independent of \(t_1\in T.\) + +Given a sample \(x_1, \ldots, x_n,\) \(n\in\mathbb{N},\) of equally spaced observations of \(X,\) their corresponding estimators, sample mean, sample \(k\)-th order centered moment and sample auto-covariance, are respectively +\[ + \widehat{\mu} := n^{-1}\sum_{i=1}^nx_i\mbox{, } \widehat{\mu}_k := n^{-1}\sum_{i=1}^n(x_i - \widehat{\mu})^k \mbox{ and }\widehat{\gamma}(h) := n^{-1}\sum_{i = 1}^{n-h}(x_{i+h} - \widehat{\mu})(x_i - \widehat{\mu}). +\] + +A particular case in which stationarity implies strictly stationarity is a Gaussian process. + +\hypertarget{definition-3}{% +\paragraph{Definition 3}\label{definition-3}} + +A stochastic process \(X\) is said to be a \emph{Gaussian process} if for every finite collection \(\tau = \{t_1,t_2,\ldots, t_k\} \subset T,\) the joint distribution of \(\{X_t\}_{t \in \tau}\) has a multivariate normal distribution. + +A series of mean zero uncorrelated random variables with finite constant variance is known as \emph{white noise}. If additionally, it is formed of independent and identically distributed (i.i.d) normal random variables, it is known as \emph{Gaussian white noise}; which is a particular case of stationary Gaussian process. For the rest of the work, \(X_t \sim N(\mu,\sigma^2)\) denotes that the random variable \(X_t\) is normally distributed with mean \(\mu\) and variance \(\sigma^2\) and \(\chi^2(v)\) denotes the Chi square distribution with \(v\) degrees of freedom. + +Other classes of stochastic processes can be defined using collections of white noise, for instance, the linear process. 
+ +\hypertarget{definition-4}{% +\paragraph{Definition 4}\label{definition-4}} + +Let \(X\) be a stochastic process. \(X\) is said to be \emph{linear} if it can be written as +\[ +X_t = \mu + \sum_{i\in\mathbb{Z}}\phi_i\epsilon_{t-i}, +\] +where \(\{\epsilon_i\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables and \(\{\phi_i\}_{i\in\mathbb{Z}}\) is a set of real values such that \(\sum_{i\in\mathbb{Z}} |\phi_i| < \infty.\) + +An important class of processes is the \emph{auto-regressive moving average} (\(ARMA\)). George Edward Pelham Box and Jenkins (1990) introduced it for time series analysis and forecast, becoming very well-known in the 90s and early 21st century. + +\hypertarget{definition-5}{% +\paragraph{Definition 5}\label{definition-5}} + +For any non-negative integers \(p,q,\) a stochastic process \(X\) is an \(ARMA(p,q)\) process if it is a stationary process and +\begin{equation} + X_t = \sum_{i=0}^p \phi_iX_{t-i} +\sum_{i=0}^q \theta_i\epsilon_{t-i}, \label{eq:ARMA} +\end{equation} +where \(\{\phi_i\}_{i=0}^p\) and \(\{\theta_i\}_{i=0}^q\) are sequences of real values with \(\phi_0= 0,\) \(\phi_p\neq 0,\) \(\theta_0=1\) and \(\theta_q\neq 0\) and \(\{\epsilon_{i}\}_{i\in\mathbb{Z}}\) is a collection of white noise random variables. + +Particular cases of \(ARMA\) processes are those known as auto-regressive (\(AR(p) := ARMA(p,0)\)) and moving average (\(MA(q) := ARMA(0,q)\)) processes. Additionally, a \emph{random walk} is a non stationary AR(1) +process satisfying \eqref{eq:ARMA} with \(p=1,\) \(\phi_1 = 1\) and \(q=0.\) Several properties of an \(ARMA\) process can be extracted from its structure. 
For that, the \(AR\) and \(MA\) polynomials are introduced +\[ + AR:\text{ } \phi(z) = 1-\sum_{i=0}^p \phi_i z^i \text{ and } MA:\text{ } \theta(z) = \sum_{i=0}^q \theta_i z^i, +\] +where \(z\) is a complex number and, as before, \(\phi_0 = 0,\) \(\phi_p\neq 0,\) \(\theta_0= 1\) and \(\theta_q\neq 0.\) Conditions for stationarity, order selection and process behavior are properties studied from these two polynomials. + +For modeling volatility in financial data, Bollerslev (1986) proposed the \emph{generalized auto-regressive conditional heteroscedastic} (GARCH) class of processes as a generalization of the \emph{auto-regressive conditional heteroscedastic} (ARCH) processes (Engle 1982). + +\hypertarget{definition-6}{% +\paragraph{Definition 6}\label{definition-6}} + +For any \(p,q \in \mathbb{N}\), a stochastic process \(X\) is a \(GARCH(p,q)\) process if it satisfies \(X_t = \mu + \sigma_{t}\epsilon_t\) with +\[ +\sigma_t^2 = \alpha_0 +\sum_{i=1}^p\alpha_i \epsilon_{t-i}^2 +\sum_{i=1}^q \beta_{i}\sigma^2_{t-i}. +\] +\(\mu\) is the process mean, \(\alpha_0\) is a positive constant value, \(\{\alpha_i\}_{i=1}^p\) and \(\{\beta_i\}_{i=1}^q\) are non-negative sequences of real values and \(\{\epsilon_{t}\}_{t \in T}\) is a collection of i.i.d. random variables. + +A more general class of processes are the \emph{state-space models} (\(SSMs\)), which have gained popularity over the years because they do not impose on the process common restrictions such as linearity or stationarity and are flexible in incorporating the process' different characteristics (Petris, Petrone, and Campagnoli 2007). They are widely used for smoothing (West and Harrison 2006) and forecasting (R. Hyndman and Khandakar 2008) in time series analysis. The main idea is to model the process dependency with two equations: the \emph{state equation}, which models how parameters change over time, and the \emph{innovation equation}, which models the process in terms of the parameters. 
Some particular SSMs that analyze the level, trend and seasonal components of the process are known as \emph{error, trend, and seasonal} (ETS) models. There are over 32 different variations of ETS models (R. J. Hyndman et al. 2008). One of them is the \emph{multiplicative error, additive trend-seasonality} \((ETS(M,A,A))\) model. + +\hypertarget{definition-7}{% +\paragraph{Definition 7}\label{definition-7}} + +A SSM process \(X\) follows an ETS(M,A,A) model, if the process accepts\\ +\[ +X_t = [L_{t-1} +T_{t-1} + S_{t-1}](1 + \epsilon_t) +\] +as innovation equation and +\begin{eqnarray*}L_t &= &L_{t-1} +T_{t-1} +\alpha (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + T_t &= &T_{t-1} + \beta (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t\\ + S_t &= &S_{t-m} + \gamma (L_{t-1} +T_{t-1} +S_{t-m})\epsilon_t, +\end{eqnarray*}\\ +as state equations. +\(\alpha, \beta,\gamma \in [0,1]\), \(m\in\mathbb{N}\) denotes the period of the series and \(\{\epsilon_t\}\) are i.i.d normal random variables. For each \(t\in\mathbb{Z},\) \(L_t\), \(T_t\) and \(S_t\) represent respectively the level, trend and seasonal components. + +\hypertarget{normality-tests-for-stationary-processes}{% +\section{Normality tests for stationary processes}\label{normality-tests-for-stationary-processes}} + +Extensive literature exists on goodness of fit tests for normality under the assumption of independent and identically distributed random variables, including, among others, Pearson's chi-squared test (Pearson and Henrici 1895), Kolmogorov-Smirnov test (Smirnov 1948), Anderson-Darling test (Anderson and Darling 1952), SK test (Jarque and Bera 1980) and Shapiro-Wilk test, (Shapiro and Wilk 1965) and (J. P. Royston 1982). These procedures have been widely used in many studies and applications, see D'Agostino and Stephens (1986) for further details. There are no results, however, showing that the above tests are consistent in the context of stationary processes, in which case the independence assumption is violated. 
For instance, Gasser (1975) provides a simulation study where Pearson's chi-squared test has an excessive rejection rate under the null hypothesis for dependent data. For this matter, several tests for stationary processes have been proposed over the years, a selection of which we reference here. Epps (1987) provides a test based on the characteristic function, Hinich (1982) proposes a similar test based on the process' spectral density function (see Berg, Paparoditis, and Politis 2010, for further insight). Gasser (1975) gives a correction of the SK test, with several modifications made in Lobato and Velasco (2004), Bai and Ng (2005) and Psaradakis (2017), which are popular in many financial applications. Bontemps and Meddahi (2005) constructs a test based on Stein's characterization of a Gaussian distribution. Using the random projection method (Cuesta-Albertos et al. 2007), Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) build a test that upgrades the performance of Epps (1987) and Lobato and Velasco (2004) procedures. Furthermore, Psaradakis and Vávra (2017) adapts the Anderson and Darling (1952) statistic for stationary linear processes approximating its sample distribution with a sieve bootstrap procedure. + +Despite the existing literature, consistent implementations of goodness of fit tests for normality of stationary processes in programming languages such as \texttt{R} or \texttt{Python} are limited. This is not the case for normality of independent data: the \CRANpkg{nortest} package (Gross and Ligges 2015) implements tests such as Lilliefors (Dallal and Wilkinson 1986), Shapiro-Francia (P. Royston 1993), Pearson's chi-squared, Cramér-von Mises (Anderson 1962) and Anderson-Darling. For a multivariate counterpart, the \CRANpkg{mvnTest} package (Pya et al. 2016) implements the multivariate Shapiro-Wilk, Anderson-Darling, Cramér-von Mises, Royston (J. P. 
Royston 1992), Doornik and Hansen (Doornik and Hansen 2008), Henze and Zirkler (Henze and Zirkler 1990) and the multivariate Chi square test (Vassilly Voinov and Voinov 2016). For the case of dependent data, we present here the \CRANpkg{nortsTest} package. Type within \texttt{R} \texttt{install.packages("nortsTest",\ dependencies\ =\ TRUE)} to install its latest released version from \texttt{CRAN}. \CRANpkg{nortsTest} performs the tests proposed in Epps (1987), Lobato and Velasco (2004), Psaradakis and Vávra (2020), Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014), Psaradakis and Vávra (2017) and El Bouch, Michel, and Comon (2022). + +Additionally, the package offers visualization functions for descriptive time series analysis and several diagnostic methods for checking stationarity and normality assumptions for the most used time series models of several \texttt{R} packages. To elaborate on this, Subsection 3.1 introduces the package functionality and software and Subsection 3.2 provides an overview of tests for checking stationarity and seasonality. Finally, Subsections 3.3-3.5 present a general framework of each of the implemented normality tests and their functionality by providing simulated data examples. + +\hypertarget{software}{% +\subsection{Software}\label{software}} + +The package works as an extension of the \CRANpkg{nortest} package (Gross and Ligges 2015), which performs normality tests in random samples but for independent data. 
The building block functions of the \CRANpkg{nortsTest} package are: + +\begin{itemize} +\item + \texttt{epps.test()}, function that implements the test of Epps, +\item + \texttt{epps\_bootstrap.test()}, function that implements a bootstrap approximation of the test of Epps, +\item + \texttt{lobato.test()}, function that implements the asymptotic test of Lobato and Velasco, +\item + \texttt{lobato\_bootstrap.test()}, function that implements a bootstrap approximation of the test of Lobato and Velasco, +\item + \texttt{rp.test()}, function that implements the random projection test of Nieto-Reyes, Cuesta-Albertos and Gamboa, +\item + \texttt{vavra.test()}, function that implements the test of Psaradakis and Vávra, and +\item + \texttt{elbouch.test()}, function that implements the test of El Bouch, Michel and Comon. +\end{itemize} + +Each of these functions accepts a \texttt{numeric} (\emph{numeric}) or \texttt{ts} (\emph{time series}) class object for storing data, and returns an \texttt{htest} (\emph{hypothesis test}) class object with the main results for the test. To guarantee the accuracy of the results, each test performs unit root tests for checking stationarity and seasonality (see Subsection 3.2) and displays a warning message if any of them is not satisfied. + +For visual diagnostics, the package offers different plot functions based on the \CRANpkg{ggplot2} package (Wickham 2009): the \texttt{autoplot()} function plots \texttt{numeric}, \texttt{ts} and \texttt{mts} (\emph{multivariate time series}) classes while the \texttt{gghist()} and \texttt{ggnorm()} functions are for plotting histogram and qq-plots respectively; and on the \CRANpkg{forecast} package (R. Hyndman and Khandakar 2008): \texttt{ggacf()} and \texttt{ggPacf()} for the display of the auto-correlation and partial auto-correlations functions respectively. 
+ +Furthermore, inspired by the function \texttt{checkresiduals()} of the \CRANpkg{forecast} package, we provide the \texttt{check\_residuals()} function to test the model assumptions using the estimated residuals. The upgrade of our proposal is that, besides providing plots for visual diagnosis (setting the \texttt{plot} option as \texttt{TRUE}), it does check stationarity, seasonality (\emph{Subsection 3.2}) and normality, presenting a report of the used tests and conclusions for assessing the model's assumptions. An illustration of these functions is provided in Subsection 4.2, where we show the details of the functions and their utility for assumptions commonly checked in time series modeling. + +\hypertarget{tests-for-stationarity}{% +\subsection{Tests for stationarity}\label{tests-for-stationarity}} + +For checking stationarity, the \CRANpkg{nortsTest} package uses \textit{unit root} and \textit{seasonal unit root} tests. These tests work similarly, checking whether a specific process follows a random walk model, which clearly is a non-stationary process. + +\hypertarget{unit-root-tests}{% +\subsubsection{Unit root tests}\label{unit-root-tests}} + +A linear stochastic process \(X\) that follows a random walk model is non stationary. Its AR polynomial is \(\phi(z) = 1 - z\), whose solution (root) is unique and equal to one. Thus, it is common to test the non stationarity of a linear process by checking whether its AR polynomial has a unit root (a root equal to one). + +The most commonly used tests for unit root testing are \emph{Augmented Dickey-Fuller} (Said and Dickey 1984), \emph{Phillips-Perron} (Perron 1988), \emph{KPSS} (Kwiatkowski et al. 1992) and \textit{Ljung-Box} (G. E. P. Box and Pierce 1970). In particular, the \emph{Ljung-Box} test contrasts the null auto-correlation hypothesis of identically distributed Gaussian random variables, which is equivalent to testing stationarity. 
The \texttt{uroot.test()} and \texttt{check\_residuals()} functions perform these tests, making use of the \CRANpkg{tseries} package (Trapletti and Hornik 2019). + +\hypertarget{seasonal-unit-root-tests}{% +\subsubsection{Seasonal unit root tests}\label{seasonal-unit-root-tests}} + +Let \(X\) be a stationary process and \(m\) its period. Note that for observed data, \(m\) generally corresponds to the number of observations per unit of time. \(X\) follows a seasonal random walk if it can be written as +\[ + X_t = X_{t-m} + \epsilon_t, +\] +where \(\{\epsilon_t\}\) is a collection of i.i.d random variables. In a similar way, the process \(X\) is non-stationary if it follows a seasonal random walk; equivalently, \(X\) is non stationary if the seasonal AR(1) polynomial (\(\phi_m(z) = 1 - \phi z^m\)) has a unit root. The \texttt{seasonal.test()} and \texttt{check\_residuals()} functions perform the \emph{OCSB test} (Osborn et al. 1988) from the \CRANpkg{forecast} package and the \emph{HEGY} (Beaulieu and Miron 1993) and \emph{CH} (Canova and Hansen 1995) tests from the \CRANpkg{uroot} package (López-de-Lacalle 2019). + +\hypertarget{tests-of-epps}{% +\subsection{Tests of Epps}\label{tests-of-epps}} + +The \(\chi^2\) test for normality proposed by Epps (1987) compares the empirical characteristic function of the one-dimensional marginal of the process with the one of a normally distributed random variable evaluated at certain points on the real line. Several authors, including Lobato and Velasco (2004), Psaradakis and Vávra (2017) and El Bouch, Michel, and Comon (2022), point out that the greatest challenge in the Epps' test is its implementation procedure, which we address with the \CRANpkg{nortsTest} package. Other existing tests based on the empirical characteristic function of the one-dimensional marginal of the process include Hong (1999) and the references therein. This test differs, however, in that it uses spectral analysis and derivatives. 
+ +Furthermore, Meintanis (2016) reviews testing procedures based on the empirical characteristic function. There, it is commented about the random projection test (Nieto-Reyes, Cuesta-Albertos, and Gamboa 2014, and here below) as a recent development of Epps' test. In fact, in Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) the consistency of Epps' test is improved by taking at random the elements at which the characteristic function is evaluated. Additionally, Psaradakis and Vávra (2020) propose a sieve bootstrap modification of the Epps' test. In addition to the classical asymptotic Epps' test, we include these last two approaches here, and in the package, see the Example below and the paragraph before it. Let us provide now the foundation behind the Epps' tests. + +Let \(X\) be a stationary stochastic process that satisfies +\begin{equation} + \sum_{t=-\infty}^{\infty}|t|^k|\gamma(t)| <\infty \mbox{ for some } k >0. \label{eq:a} +\end{equation} +The null hypothesis is that the one-dimensional marginal distribution of \(X\) is Gaussian. The procedure for constructing the test consists of defining a function \(g\), estimating its inverse spectral matrix function, minimizing the generated quadratic function in terms of the unknown parameters of the random variable and, finally, obtaining the test statistic, which converges in distribution to a \(\chi^2.\) + +Given \(N \in\mathbb{N}\) with \(N \geq 2,\) let +\[ +\Lambda :=\{\lambda:=(\lambda_1, \ldots, \lambda_N) \in \mathbb{R}^N: \lambda_i \leq \lambda_{i+1} \text{ and } \lambda_i > 0, \text{ for } i = 1,2,\ldots, N \}, +\] +and \(g:\mathbb{R}\times \Lambda \rightarrow \mathbb{R}^{2N}\) be a measurable function, where +\[ + g(x,\lambda):= [\cos(\lambda_1x),\sin(\lambda_1x),\ldots,\cos(\lambda_Nx),\sin(\lambda_Nx)]. 
+\] +Additionally, let \(g_\theta:\Lambda \rightarrow \mathbb{R}^N\) be a function defined by +\[ + g_\theta(\lambda) := \left[\mbox{Re}(\Phi_\theta(\lambda_1)),\mbox{Im}(\Phi_\theta(\lambda_1)),\ldots,\mbox{Re}(\Phi_\theta(\lambda_N)),\mbox{Im}(\Phi_\theta(\lambda_N)) \right]^t, +\] +where the \(\mbox{Re}(\cdot)\) and \(\mbox{Im}(\cdot)\) are the real and imaginary components of a complex number and \(\Phi_\theta\) is the characteristic function of a normal random variable with parameters \(\theta := (\mu,\sigma^2)\in \Theta,\) an open bounded set contained in \(\mathbb{R}\times \mathbb{R}^+\). For any \(\lambda\in\Lambda,\) let us also denote +\[ + \widehat{g}(\lambda) := \dfrac{1}{n}\sum_{t=1}^n [\cos(\lambda_1 x_t),\sin(\lambda_1x_t),\ldots,\cos(\lambda_N x_t),\sin(\lambda_N x_t)]^t. +\] +Let \(f(v;\theta,\lambda)\) be the spectral density matrix of \(\{g(X_t,\lambda)\}_{t \in\mathbb{Z}}\) at a frequency \(v.\) +Then, for \(v = 0\), it can be estimated by +\[ + \widehat{f}(0;\theta,\lambda) := \dfrac{1}{2\pi n}\left(\sum_{t=1}^n \widehat{G}(x_{t,0},\lambda) +2\sum_{i=1}^{\lfloor n^{2/5}\rfloor}(1 -i/\lfloor n^{2/5} \rfloor)\sum_{t=1}^{n-i}\widehat{G}(x_{t,i},\lambda) \right), +\] +where \(\widehat{G}(x_{t,i},\lambda) = (\widehat{g}(\lambda) -g(x_{t},\lambda))(\widehat{g}(\lambda) -g(x_{t+i},\lambda))^t\) and \(\lfloor \cdot \rfloor\) denotes the floor function. The test statistic general form under \(H_0\) is +\[ + Q_n(\lambda) := \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +with +\[ + Q_n(\theta,\lambda):=(\widehat{g}(\lambda)-g_\theta(\lambda))^tG_n^+(\lambda)(\widehat{g}(\lambda)-g_\theta(\lambda)), +\] +where \(G^{+}_n\) is the generalized inverse of the spectral density matrix \(2 \pi \widehat{f}(0;\theta,\lambda)\). 
Let +\[ + \widehat{\theta} := \arg \min_{\theta \in \Theta} \left\{ Q_n(\theta,\lambda) \right\}, +\] +be the argument that minimizes \(Q_n(\theta,\lambda)\) such that \(\widehat{\theta}\) is in a neighborhood of \(\widehat{\theta}_n := (\widehat{\mu},\widehat{\gamma}(0))\). To guarantee its existence and uniqueness, the following assumptions are required. We refer to them as assumption \((A.)\). + +\((A.)\) Let \(\theta_0\) be the true value of \(\theta\) under \(H_0\), then for every \(\lambda \in \Lambda\) the following conditions are satisfied. + +\begin{itemize} +\item + \(f(0;\theta,\lambda)\) is positive definite. +\item + \(\Phi_\theta(\lambda)\) is twice differentiable with respect to \(\theta\) in a neighborhood of \(\theta_0\). +\item + The matrix \(D(\theta_0,\lambda) = \dfrac{\partial \Phi_\theta(\lambda)}{\partial\theta |_{\theta = \theta_0}} \in \mathbb{R}^{N\times 2}\) has rank 2. +\item + The set \(\Theta_0(\lambda) := \{ \theta \in \Theta: \Phi_\theta(\lambda_i) = \Phi_{\theta_0}(\lambda_i), i=1, \ldots,N\}\) is a finite bounded set in \(\Theta\), and \(\Theta\) is a bounded subset of \(\mathbb{R}\times \mathbb{R}^+\). +\item + \(f(0;\theta,\lambda) = f(0;\theta_0,\lambda)\) and \(D(\theta_0,\lambda) = D(\theta,\lambda)\) for all \(\theta \in \Theta_0(\lambda)\). +\end{itemize} + +Under these assumptions, Epps' main result is presented as follows. + +\hypertarget{theorem-1-epps1987-theorem-2.1}{% +\paragraph{Theorem 1 (Epps 1987, Theorem 2.1)}\label{theorem-1-epps1987-theorem-2.1}} + +Let \(X\) be a stationary Gaussian process such that \eqref{eq:a} and \((A.)\) are satisfied, then \(nQ_n(\lambda)\to_d \chi^2(2N - 2)\) for every \(\lambda \in \Lambda\). + +The current \CRANpkg{nortsTest} version uses \(\Lambda := \{\verb|lambda|/\widehat{\gamma}(0)\}\) as the values to evaluate the empirical characteristic function, where \(\widehat{\gamma}(0)\) is the sample variance. By default \texttt{lambda\ =\ c(1,\ 2)}. 
Therefore, the implemented test statistic converges to a \(\chi^2\) distribution with two degrees of freedom. The user can change these \(\Lambda\) values as desired by simply specifying the function's \texttt{lambda} argument, as we show in the Example below. + +\hypertarget{example-1}{% +\paragraph{Example 1}\label{example-1}} + +A stationary \(AR(2)\) process is drawn using a beta distribution with \texttt{shape1\ =\ 9} and \texttt{shape2\ =\ 1} parameters, and performed the implementation of the test of Epps, \texttt{epps.test()}. At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected. + +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ar =c(0.5,0.2)), + rand.gen = rbeta,shape1 = 9,shape2 = 1) + +# Asymptotic Epps test +epps.test(x) +#> +#> Epps test +#> +#> data: x +#> epps = 22.576, df = 2, p-value = 1.252e-05 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Asymptotic Epps test with random Lambda values as proposed in Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014). + +\begin{verbatim} +set.seed(298) +epps.test(x, lambda = abs(rnorm(mean = c(1, 2), 2))) +#> +#> Epps test +#> +#> data: x +#> epps = 25.898, df = 2, p-value = 2.379e-06 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximated sieve bootstrap Epps test using 1000 repetitions of 250 units. + +\begin{verbatim} +set.seed(298) +epps_bootstrap.test(x, seed = 298) +#> +#> Sieve-Bootstrap epps test +#> +#> data: y +#> bootstrap-epps = 22.576, p-value < 2.2e-16 +#> alternative hypothesis: y does not follow a Gaussian Process +\end{verbatim} + +\hypertarget{tests-of-lobato-and-velasco}{% +\subsection{Tests of Lobato and Velasco}\label{tests-of-lobato-and-velasco}} + +Lobato and Velasco (2004) provides a consistent estimator for the corrected SK test statistic for stationary processes, see Lomnicki (1961) and Gasser (1975) for further insight. 
Note that the SK test is also known as the Jarque-Bera test (Jarque and Bera 1980), which is already available in several R packages (Trapletti and Hornik 2019, for instance). The improvement of this proposal over those implementations is a correction in the skewness and kurtosis estimates by the process' auto-covariance function, resulting in a consistent test statistic under the assumption of correlated data. The test in Lobato and Velasco (2004) is asymptotic, which is computationally efficient, as opposed to a bootstrap based test. Psaradakis and Vávra (2020) show that the bootstrap modification of Lobato and Velasco's test is a fair competitor against the original asymptotic test, beating other tests for normality of the one-dimensional marginal distribution in terms of power. Thus, the package incorporates both the asymptotic \texttt{lobato.test()} and its bootstrap version \texttt{lobato\_bootstrap.test()}. + +The general framework for the test is presented in what follows. Contrary to the test of Epps, this proposal does not require additional parameters for the computation of the test sample statistic. + +Let \(X\) be a stationary stochastic process that satisfies + +\begin{equation} + \sum_{t=0}^{\infty}|\gamma(t)| <\infty. \label{eq:aLV} +\end{equation} + +The null hypothesis is that the one-dimensional marginal distribution of \(X\) is normally distributed, that is +\[ +H_0: X_t \sim N(\mu,\sigma^2) \text{ for all } t \in T. +\] +Let \(k_q(j_1,j_2,\ldots,j_{q-1})\) be the q-th order cumulant of \(X_{1},X_{1+j_1},\ldots,X_{1+j_{q-1}}\). \(H_0\) is fulfilled if all the marginal cumulants above the second order are zero. In practice, it is tested just for the third and fourth order marginal cumulants. Equivalently, in terms of moments, the marginal distribution is normal by testing whether \(\mu_3 = 0\) and \(\mu_4 = 3 \mu_2^2\). 
For non-correlated data, the SK test compares the SK statistic against upper critical values from a \(\chi^2(2)\) distribution (Bai and Ng 2005). For a Gaussian process \(X\) satisfying \eqref{eq:aLV}, the following limiting result holds +\[ + \sqrt{n} \binom{\widehat{\mu}_3}{\widehat{\mu}_4 -3\widehat{\mu}^2_2} \to_d N(0_2,\Sigma_F), +\] +where \(0_2 := (0,0)^t \in \mathbb{R}^2\) and \(\Sigma_F := \mbox{diag}(6F^{(3)}, \text{ } 24F^{(4)}) \in \mathbb{R}^{2\times 2}\) is a diagonal matrix with \(F^{(k)} := \sum_{j = -\infty}^{\infty}\gamma(j)^k\) for \(k=3,4\) (Gasser 1975). + +The following consistent estimator in terms of the auto-covariance function is proposed in Lobato and Velasco (2004) +\[ + \widehat{F}^{(k)} := \sum_{t = 1-n}^{n-1}\widehat{\gamma}(t)[\widehat{\gamma}(t) +\widehat{\gamma}(n-|t|)]^{k-1}, +\] +to build a \emph{generalized SK test} statistic +\[ + G := \dfrac{n \widehat{\mu}_3^2}{6 \widehat{F}^{(3)}} + \dfrac{n(\widehat{\mu}_4 -3\widehat{\mu}_2^2)^2}{24\widehat{F}^{(4)}}. +\] +Similar to the SK test for non-correlated data, the \(G\) statistic is compared against upper critical values from a \(\chi^2(2)\) distribution. This is seen in the below result that establishes the asymptotic properties of the test statistics, so that the general test procedure can be constructed. The result requires the following assumptions, denoted by \((B.),\) for the process \(X.\) + +(B.) 
+ +\begin{itemize} +\item + \(E[X_t^{16}] < \infty\) for \(t \in T.\) +\item + \(\sum_{j_1 = -\infty}^{\infty}\cdots \sum_{j_{q-1} = -\infty}^{\infty} |k_q(j_1,\ldots,j_{q-1})| < \infty \text{ for } q=2,3,\ldots,16.\) +\item + \(\sum_{j=1}^{\infty}\left(E \left[\text{ } E[(X_0-\mu)^k|B_j] -\mu_k\right]^2 \right)^{1/2} < \infty \text{ for } k = 3,4,\) where \(B_j\) denotes the \(\sigma\)-field generated by \(X_t\), \(t \leq -j.\) +\item + \(E\left[Z_k \right]^2 +2\sum_{j=1}^{\infty}E\left(\left[Z_k \right] \left[ (X_j -\mu)^k -\mu_k \right] \right) > 0\) for \(k = 3,4,\) with \(Z_k=(X_0 -\mu)^k -\mu_k.\) +\end{itemize} + +Note that these assumptions imply that the higher-order spectral densities up to order 16 are continuous and bounded. + +\hypertarget{theorem-2-lobato2004-theorem-1}{% +\paragraph{Theorem 2 (Lobato and Velasco 2004, Theorem 1)}\label{theorem-2-lobato2004-theorem-1}} + +Let \(X\) be a stationary process. If \(X\) is Gaussian and satisfies \eqref{eq:aLV} then \(G \to_d \chi^2(2)\), and under assumption (B.), the test statistic G diverges whenever \(\mu_3 \neq 0\) or \(\mu_4 \neq 3\mu_2^2.\) + +\hypertarget{example-2}{% +\paragraph{Example 2}\label{example-2}} + +A stationary \(MA(3)\) process is drawn using a gamma distribution with \texttt{rate\ =\ 3} and \texttt{shape\ =\ 6} parameters. The \texttt{lobato.test()} function performs the test of \emph{Lobato and Velasco} to the simulated data. At significance level \(\alpha = 0.05\), the null hypothesis of normality is correctly rejected. + +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ma = c(0.2, 0.3, -0.4)), + rand.gen = rgamma, rate = 3, shape = 6) +# Asymptotic Lobato & Velasco +lobato.test(x) +#> +#> Lobato and Velasco's test +#> +#> data: x +#> lobato = 65.969, df = 2, p-value = 4.731e-15 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximated sieve bootstrap Lobato and Velasco test using 1000 repetitions of 250 units. 
+ +\begin{verbatim} +lobato_bootstrap.test(x, seed = 298) +#> +#> Sieve-Bootstrap lobato test +#> +#> data: y +#> bootstrap-lobato = 65.969, p-value < 2.2e-16 +#> alternative hypothesis: y does not follow a Gaussian Process +\end{verbatim} + +\hypertarget{the-random-projections-test}{% +\subsection{The Random Projections test}\label{the-random-projections-test}} + +The previous proposals only test for the normality of the one-dimensional marginal distribution of the process, which is inconsistent against alternatives whose one-dimensional marginal is Gaussian. Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) provides a procedure to fully test normality of a stationary process using a Cramér-Wold type result (Cuesta-Albertos et al. 2007) that uses random projections to differentiate among distributions. In Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) existing tests for the normality of the one dimensional marginal are applied to the random projections and the resulting p-values are combined using the false discovery rate for dependent data (Benjamini and Yekutieli 2001). The \CRANpkg{nortsTest} package improves on this test by allowing the use of the less conservative false discovery rate in Benjamini and Hochberg (1995). + +We show the Cramér-Wold type result below. The result works for separable Hilbert spaces, however here, for its later application, we restrict it to \(l^2,\) the space of square summable sequences over \(\mathbb{N},\) with inner product \(\langle \cdot,\cdot \rangle.\) + +\hypertarget{theorem-3-cuesta2007-theorem-3.6}{% +\paragraph{Theorem 3 (Cuesta-Albertos et al. 2007, Theorem 3.6)}\label{theorem-3-cuesta2007-theorem-3.6}} + +Let \(\eta\) be a dissipative distribution on \(l^2\) and \(Z\) a \(l^2\)-valued random element, then \(Z\) is Gaussian if and only if +\[ + \eta\{h \in l^2: \langle Z,h \rangle \text{ has a Gaussian distribution}\} > 0. 
+\] +A dissipative distribution (Nieto-Reyes, Cuesta-Albertos, and Gamboa 2014, Definition 2.1) is a generalization of the concept of absolutely continuous distribution to the infinite-dimensional space. A Dirichlet process (Gelman et al. 2013) produces random elements with a dissipative distribution in \(l^2\). In practice, generate draws of \(h \in l^2\) with a stick-breaking process that makes use of beta distributions. + +Let \(X = \{X_t\}_{t\in\mathbb{Z}}\) be a stationary process. As \(X\) is normally distributed if the process \(X^{(t)} := \{X_k\}_{k \leq t}\) is Gaussian for each \(t\in\mathbb{Z},\) using the result above, Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) provides a procedure for testing that \(X\) is a Gaussian process by testing whether the process \(Y^h = \{Y^h_t\}_{t \in \mathbb{Z}}\) is Gaussian. +\begin{equation} + Y^h_t := \sum_{i=0}^\infty h_i X_{t-i} = \langle X^{ (t) },h \rangle, \label{eq:proj} +\end{equation} +where \(\langle X^{(t)},h \rangle\) is a real random variable for each \(t \in \mathbb{Z}\) and \(h\in l^2\). Thus, \(Y^h\) is a stationary process constructed by the projection of \(X^{(t)}\) on the space generated by \(h.\) Therefore, \(X\) is a Gaussian process if and only if the one dimensional marginal distribution of \(Y^{h}\) is normally distributed. Additionally, the hypothesis of the tests \emph{Lobato and Velasco} or \emph{Epps}, such as \eqref{eq:a}, \eqref{eq:aLV}, \((A)\) and \((B)\), imposed on \(X\) are inherited by \(Y^h\). Then, those tests can be applied to evaluate the normality of the one dimensional marginal distribution of \(Y^h\). Further considerations include the specific beta parameters used to construct the distribution from which to draw \(h\) and selecting a proper number of combinations to establish the number of projections required to improve the method performance. All of these details are discussed in Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014). 
+
+Next, we summarize the test of random projections in practice:
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\item
+  Select \(k,\) which results in \(2k\) independent random projections (\emph{by default} \texttt{k\ =\ 1}).
+\item
+  Draw the \(2k\) random elements to project the process from a dissipative distribution that uses a particular beta distribution. By default, use a \(\beta(2,7)\) for the first \(k\) projections and a \(\beta(100,1)\) for the latter \(k\).
+\item
+  Apply the tests of \emph{Lobato and Velasco} to the even projected processes and \emph{Epps} to the odd projections.
+\item
+  Combine the obtained \(2k\) \texttt{p-values} using the false discovery rate. By default, use the Benjamini and Yekutieli (2001) procedure.
+\end{enumerate}
+
+The \texttt{rp.test()} function implements the above procedure. The user might provide optional parameters such as the number of projections \texttt{k}, the parameters of the first beta distribution \texttt{pars1} and those of the second \texttt{pars2}. The next example illustrates the application of the \texttt{rp.test()} to a stationary GARCH(1,1) process drawn using normal random variables.
+
+\hypertarget{example-3}{%
+\paragraph{Example 3}\label{example-3}}
+
+A stationary \texttt{GARCH(1,1)} process is drawn with a standard normal distribution and parameters \(\alpha_0 = 0,\) \(\alpha_1 = 0.2\) and \(\beta_1 = 0.3\) using the \CRANpkg{fGarch} package (Wuertz et al. 2017). Note that a \texttt{GARCH(1,1)} process is stationary if the parameters \(\alpha_1\) and \(\beta_1\) satisfy the inequality \(\alpha_1 + \beta_1 < 1\) (Bollerslev 1986).
+
+\begin{verbatim}
+set.seed(3468)
+library(fGarch)
+spec = garchSpec(model = list(alpha = 0.2, beta = 0.3))
+x = ts(garchSim(spec, n = 300))
+rp.test(x)
+#>
+#> k random projections test.
+#>
+#> data: x
+#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1
+#> alternative hypothesis: x does not follow a Gaussian Process
+\end{verbatim}
+
+At significance level \(\alpha = 0.05,\) the applied \emph{random projections} test with \texttt{k\ =\ 1} as the number of projections shows no evidence to reject the null hypothesis of normality.
+
+\hypertarget{the-psaradakis-and-vavras-test}{%
+\subsection{The Psaradakis and Vavra's test}\label{the-psaradakis-and-vavras-test}}
+
+Psaradakis and Vávra (2017) adapted a distance test for normality for a one-dimensional marginal distribution of a stationary process. Initially, the test was based on the Anderson (1952) test statistic and used an auto-regressive sieve bootstrap approximation to the null distribution of the sample test statistic. Later, Psaradakis and Vávra (2020) considered this test as the ultimate normality test based on the empirical distribution function, and adapted its methodology to a wide range of tests, including Shapiro-Wilk (Shapiro and Wilk 1965), Jarque-Bera (Jarque and Bera 1980), Cramer von Mises (Anderson 1962), Epps, and Lobato-Velasco. Their experiments show that the Lobato-Velasco and Jarque-Bera test's bootstrap version performs best in small samples.
+
+Although the test is said to be applicable to a wide class of non-stationary processes by transforming them into stationary by means of a fractional difference operator, no theoretical result was apparently provided to sustain this transformation. This work restricts the presentation of the original procedure to stationary processes.
+
+Let \(X\) be a stationary process satisfying
+\begin{equation}
+  X_t = \sum_{i=0}^{\infty}\theta_i \epsilon_{t-i} + \mu_0, \ t \in \mathbb{Z}, \label{eq:aPV}
+\end{equation}
+where \(\mu_0 \in \mathbb{R}\), \(\{\theta_i\}_{i=0}^\infty\in l^2\) with \(\theta_0 = 1\) and \(\{\epsilon_t\}_{t\in\mathbb{Z}}\) is a collection of mean zero i.i.d random variables.
The null hypothesis is that the one dimensional marginal distribution of \(X\) is normally distributed, +\[ + H_0: F(\mu_0 +\sqrt{\gamma(0)}x)-F_N(x) = 0, \text{ for all } x\in \mathbb{R}, +\] +where F is the cumulative distribution function of \(X_0\), and \(F_N\) denotes the standard normal cumulative distribution function. Note that if \(\epsilon_0\) is normally distributed, then the null hypothesis is satisfied. Conversely, if the null hypothesis is satisfied, then \(\epsilon_0\) is normally distributed and, consequently, \(X_0\).\\ +The considered test for \(H_0\) is based on the Anderson-Darling distance statistic +\begin{equation} + A_d = \int_{-\infty}^{\infty}\dfrac{[{F_n}(\widehat{\mu}+\sqrt{\widehat{\gamma}(0)}x)-F_N(x)]^2}{F_N(x)[1-F_N(x)]}dF_N(x), \label{eq:aPV1} +\end{equation} +where \({F_n}(\cdot)\) is the empirical distribution function associated to \(F\) based on a simple random sample of size \(n\). Psaradakis and Vávra (2017) proposes an auto-regressive sieve bootstrap procedure to approximate the sampling properties of \(A_d\) arguing that making use of classical asymptotic inference for \(A_d\) is problematic and involved. This scheme is motivated by the fact that under some assumptions for \(X,\) including \eqref{eq:aPV}, \(\epsilon_t\) admits the representation +\begin{equation} + \epsilon_t = \sum_{i=1}^{\infty}\phi_i(X_{t-i} - \mu_0), \ t \in \mathbb{Z}, \label{eq:ePV} +\end{equation} +for certain type of \(\{\phi_i\}_{i=1}^\infty\in l^2\). The main idea behind this approach is to generate a bootstrap sample \(\epsilon_t^*\) to approximate \(\epsilon_t\) with a finite-order auto-regressive model. This is because the distribution of the processes \(\epsilon_t\) and \(\epsilon_t^*\) coincide asymptotically if the order of the auto-regressive approximation grows simultaneously with \(n\) at an appropriate rate (Bühlmann 1997). 
The procedure makes use of the \(\epsilon_t^{*'}s\) to obtain the \(X_t^{*'}s\) through the bootstrap analog of \eqref{eq:ePV}. Then, generate a bootstrap sample of the \(A_d\) statistic, \(A_d^{*},\) making use of the bootstrap analog of \eqref{eq:aPV}. + +The \texttt{vavra.test()} function implements Psaradakis and Vávra (2020) procedure. By default, it generates 1,000 sieve-bootstrap replications of the Anderson-Darling statistic. The user can provide different test procedures, such as the \emph{Shapiro-Wilk, Jarque-Bera, Cramer von Mises, Epps} or \emph{Lobato-Velasco} test, by specifying a text value to the \texttt{normality} argument. The presented values are Monte Carlo estimates of the \(A_d\) statistic and \texttt{p.value}. + +\hypertarget{example-4}{% +\paragraph{Example 4}\label{example-4}} + +A stationary \(ARMA\)(1,1) process is simulated using a standard normal distribution and performs \emph{Psaradakis and Vávra} procedure using Anderson-Darling and Cramer von Mises test statistics. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality. 
+ +\begin{verbatim} +set.seed(298) +x = arima.sim(250,model = list(ar = 0.2, ma = 0.34)) +# Default, Psaradakis and Vavra's procedure +vavra.test(x, seed = 298) +#> +#> Psaradakis-Vavra test +#> +#> data: x +#> bootstrap-ad = 0.48093, p-value = 0.274 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +Approximate Cramer von Mises test for the Psaradakis and Vavra's procedure + +\begin{verbatim} +vavra.test(x, normality = "cvm", seed = 298) +#> +#> Sieve-Bootstrap cvm test +#> +#> data: x +#> bootstrap-cvm = 0.056895, p-value = 0.49 +#> alternative hypothesis: x does not follow a Gaussian Process +\end{verbatim} + +\hypertarget{the-multivariate-kurtosis-test}{% +\subsection{The multivariate kurtosis test}\label{the-multivariate-kurtosis-test}} + +The literature contains some procedures to test the null hypothesis that a multivariate stochastic process is Gaussian. Those include Moulines, Choukri, and Sharbit (1992), a test based on the characteristic function, and Steinberg and Zeitouni (1992), a test based on properties of the entropy of Gaussian processes that does not make use of cumulant computations. According to El Bouch, Michel, and Comon (2022), these tests may hardly be executable in real time. Consequently, they propose a test based on multivariate kurtosis (Mardia 1970). The proposed procedure is for \(p=1,2,\) and we elaborate on it in what follows. In Section 6.3 of El Bouch, Michel, and Comon (2022), they suggest to apply random projections for higher dimensions but they do not investigate the procedure any further. + +The p-value of this test is obtained as \(2(1-F_N(z))\) where, as above, \(F_N\) denotes the standard normal cumulative distribution function. There, +\[ + z:=(\hat{B}_p-E[\hat{B}_p])/\sqrt{E[(\hat{B}_p-E[\hat{B}_p])^2]}, + \] +where +\[ + \hat{B}_p:=n^{-1}\sum_{t=1}^n(x_t^t \hat{S}^{-1}x_t)^2, + \] +and +\[ + \hat{S}:=n^{-1}\sum_{t=1}^n x_t x_t^t. 
+\]
+In El Bouch, Michel, and Comon (2022), the reader can find the exact computations of \(E[\hat{B}_p]\) and \(E[(\hat{B}_p-E[\hat{B}_p])^2].\)
+
+This test is implemented in the \texttt{elbouch.test()} function. By default, the function computes the univariate El Bouch test. If the user provides a secondary data set, the function computes the bivariate counterpart.
+
+\hypertarget{example-5}{%
+\paragraph{Example 5}\label{example-5}}
+
+Simulate a two-dimensional stationary VAR(2) process using independent AR(1) and AR(2) processes with standard normal distributions and apply the bivariate El Bouch test. At significance level \(\alpha = 0.05\), there is no evidence to reject the null hypothesis of normality.
+
+\begin{verbatim}
+set.seed(23890)
+x = arima.sim(250,model = list(ar = 0.2))
+y = arima.sim(250,model = list(ar = c(0.4,0,.1)))
+elbouch.test(y = y,x = x)
+#>
+#> El Bouch, Michel & Comon's test
+#>
+#> data: w = (y, x)
+#> Z = 0.92978, p-value = 0.1762
+#> alternative hypothesis: w = (y, x) does not follow a Gaussian Process
+\end{verbatim}
+
+\hypertarget{simulations-and-data-analysis}{%
+\section{Simulations and data analysis}\label{simulations-and-data-analysis}}
+
+\hypertarget{numerical-experiments}{%
+\subsection{Numerical experiments}\label{numerical-experiments}}
+
+Inspired by the simulation studies in Psaradakis and Vávra (2017) and Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014), we propose here a procedure that involves drawing data from the \(AR(1)\) process
+\begin{equation}
+  X_t = \phi X_{t-1} + \epsilon_t, \ t \in\mathbb{Z}, \text{ for } \phi \in \{ 0,\pm 0.25,\pm 0.4\}, \label{eq:eqAR}
+\end{equation}
+where the \(\{\epsilon_t\}_{t\in\mathbb{Z}}\) are i.i.d random variables.
For the distribution of the \(\epsilon_t\) we consider different scenarios: standard normal (\(N\)), standard log-normal (\(\log N\)), Student t with 3 degrees of freedom (\(t_3\)), chi-squared with 10 degrees of freedom (\(\chi^2(10)\)) and gamma with \((7, 1)\) shape and scale parameters (\(\Gamma(7,1)\)). + +\begin{table}[!h] +\centering +\caption{\label{tab:tab1-static}Part 1. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ in { 0, 0.25, 0.4}, n in {100, 250}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.} +\centering +\resizebox{\ifdim\width>\linewidth\linewidth\else\width\fi}{!}{ +\begin{tabular}[t]{lrrrrrrrrrrrr} +\toprule +\multicolumn{1}{c}{ } & \multicolumn{6}{c}{n = 100} & \multicolumn{6}{c}{n = 250} \\ +\cmidrule(l{3pt}r{3pt}){2-7} \cmidrule(l{3pt}r{3pt}){8-13} +phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi\\ +\midrule +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Lobato and Velasco}}\\ +\hspace{1em}N & 0.041 & 0.044 & 0.047 & 0.032 & 0.035 & 0.769 & 0.059 & 0.037 & 0.054 & 0.040 & 0.037 & 0.646\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.610 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.653\\ +\hspace{1em}t3 & 0.797 & 0.853 & 0.902 & 0.875 & 0.829 & 0.627 & 0.990 & 0.994 & 0.998 & 0.999 & 0.983 & 0.674\\ +\hspace{1em}chisq10 & 0.494 & 0.698 & 0.770 & 0.707 & 0.610 & 0.620 & 0.930 & 0.995 & 0.998 & 0.997 & 0.977 & 0.657\\ +\hspace{1em}Gamma(7,1) & 0.995 & 1.000 & 0.999 & 0.996 & 0.988 & 0.634 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.665\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Epps}}\\ +\hspace{1em}N & 0.056 & 0.051 
& 0.062 & 0.060 & 0.063 & 0.695 & 0.048 & 0.058 & 0.053 & 0.066 & 0.063 & 0.736\\ +\hspace{1em}logN & 0.908 & 0.917 & 0.972 & 0.985 & 0.984 & 0.729 & 1.000 & 1.000 & 1.000 & 0.999 & 1.000 & 0.777\\ +\hspace{1em}t3 & 0.243 & 0.291 & 0.370 & 0.317 & 0.248 & 0.722 & 0.776 & 0.872 & 0.908 & 0.881 & 0.780 & 0.769\\ +\hspace{1em}chisq10 & 0.267 & 0.440 & 0.548 & 0.469 & 0.360 & 0.699 & 0.611 & 0.850 & 0.930 & 0.866 & 0.721 & 0.739\\ +\hspace{1em}Gamma(7,1) & 0.866 & 0.961 & 0.996 & 0.993 & 0.965 & 0.722 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.782\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Random Projections}}\\ +\hspace{1em}N & 0.051 & 0.042 & 0.045 & 0.039 & 0.050 & 1.301 & 0.045 & 0.033 & 0.046 & 0.038 & 0.050 & 1.905\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.330 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.906\\ +\hspace{1em}t3 & 0.790 & 0.863 & 0.879 & 0.823 & 0.727 & 1.320 & 0.982 & 0.994 & 0.995 & 0.991 & 0.975 & 1.949\\ +\hspace{1em}chisq10 & 0.589 & 0.730 & 0.757 & 0.640 & 0.542 & 1.295 & 0.957 & 0.994 & 0.994 & 0.969 & 0.888 & 1.926\\ +\hspace{1em}Gamma(7,1) & 0.998 & 1.000 & 1.000 & 0.998 & 0.989 & 1.308 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.963\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Psaradakis and Vavra}}\\ +\hspace{1em}N & 0.052 & 0.048 & 0.051 & 0.058 & 0.050 & 17.905 & 0.061 & 0.046 & 0.038 & 0.051 & 0.045 & 22.115\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 17.149 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 21.841\\ +\hspace{1em}t3 & 0.700 & 0.799 & 0.851 & 0.780 & 0.695 & 17.503 & 0.960 & 0.979 & 0.991 & 0.977 & 0.960 & 22.183\\ +\hspace{1em}chisq10 & 0.498 & 0.673 & 0.804 & 0.689 & 0.550 & 18.029 & 0.902 & 0.983 & 0.997 & 0.988 & 0.933 & 22.197\\ +\hspace{1em}Gamma(7,1) & 0.989 & 1.000 & 1.000 & 1.000 & 0.998 & 18.467 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 22.292\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Lobato}}\\ +\hspace{1em}N & 0.057 & 0.052 & 0.047 & 0.059 
& 0.052 & 37.141 & 0.035 & 0.049 & 0.048 & 0.058 & 0.049 & 40.532\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 32.509 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 40.793\\ +\hspace{1em}t3 & 0.797 & 0.867 & 0.899 & 0.869 & 0.809 & 32.755 & 0.989 & 0.994 & 0.996 & 0.996 & 0.989 & 41.158\\ +\hspace{1em}chisq10 & 0.567 & 0.729 & 0.801 & 0.745 & 0.649 & 32.242 & 0.942 & 0.990 & 1.000 & 0.994 & 0.963 & 40.950\\ +\hspace{1em}Gamma(7,1) & 0.999 & 1.000 & 1.000 & 0.998 & 0.991 & 31.763 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 41.277\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Epps}}\\ +\hspace{1em}N & 0.047 & 0.053 & 0.048 & 0.052 & 0.044 & 57.749 & 0.058 & 0.052 & 0.053 & 0.048 & 0.043 & 65.367\\ +\hspace{1em}logN & 0.846 & 0.877 & 0.963 & 0.974 & 0.959 & 56.756 & 1.000 & 1.000 & 1.000 & 1.000 & 0.999 & 65.968\\ +\hspace{1em}t3 & 0.183 & 0.238 & 0.313 & 0.230 & 0.196 & 57.350 & 0.752 & 0.863 & 0.913 & 0.841 & 0.754 & 65.699\\ +\hspace{1em}chisq10 & 0.252 & 0.364 & 0.527 & 0.450 & 0.358 & 56.627 & 0.596 & 0.813 & 0.913 & 0.854 & 0.685 & 65.369\\ +\hspace{1em}Gamma(7,1) & 0.816 & 0.948 & 0.993 & 0.979 & 0.931 & 56.986 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 65.315\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{El Bouch}}\\ +\hspace{1em}N & 0.040 & 0.047 & 0.044 & 0.033 & 0.050 & 0.798 & 0.040 & 0.054 & 0.052 & 0.061 & 0.059 & 1.020\\ +\hspace{1em}logN & 0.990 & 0.998 & 0.998 & 0.995 & 0.980 & 0.805 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.025\\ +\hspace{1em}t3 & 0.833 & 0.883 & 0.928 & 0.886 & 0.846 & 0.824 & 0.996 & 0.999 & 0.998 & 0.998 & 0.991 & 1.044\\ +\hspace{1em}chisq10 & 0.041 & 0.152 & 0.281 & 0.155 & 0.046 & 0.812 & 0.062 & 0.386 & 0.597 & 0.388 & 0.065 & 1.031\\ +\hspace{1em}Gamma(7,1) & 0.833 & 0.905 & 0.929 & 0.898 & 0.818 & 0.818 & 0.993 & 0.998 & 0.999 & 0.995 & 0.989 & 1.042\\ +\bottomrule +\end{tabular}} +\end{table} + +As in Psaradakis and Vávra (2017), \(m=1,000\) independent draws of the above process are 
generated for each pair of parameter \(\phi\) and distribution. Each draw is taken of length \(past+n,\) with \(past=500\) and \(n \in \{100,250,500,1000 \}\). The first 500 data points of each realization are then discarded in order to eliminate start-up effects. The \(n\) remaining data points are used to compute the value of the test statistic of interest. In each particular scenario, the rejection rate is obtained by computing the proportion of times that the test is rejected among the \(m\) trials. + +\begin{table}[!h] +\centering +\caption{\label{tab:tab2-static}Part 2. Rejection rate estimates over $m=1,000$ trials of the seven studied goodness of fit test for the null hypothesis of normality. The data is drawn using the process defined in (8) for different values of $phi$ and $n$ displayed in the columns and different distributions for $epsilon_t$ in the rows. $phi$ is in { 0, 0.25, 0.4} and n in {500, 1000}. For each test and distribution, max.phi represents the maximum rejection rate's running time in seconds among the different values of the AR parameter.} +\centering +\resizebox{\ifdim\width>\linewidth\linewidth\else\width\fi}{!}{ +\begin{tabular}[t]{lrrrrrrrrrrrr} +\toprule +\multicolumn{1}{c}{ } & \multicolumn{6}{c}{n = 500} & \multicolumn{6}{c}{n = 1,000} \\ +\cmidrule(l{3pt}r{3pt}){2-7} \cmidrule(l{3pt}r{3pt}){8-13} +phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi & -0.4 & -0.25 & 0.0 & 0.25 & 0.4 & max.phi\\ +\midrule +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Lobato and Velasco}}\\ +\hspace{1em}N & 0.041 & 0.035 & 0.052 & 0.035 & 0.049 & 0.729 & 0.048 & 0.050 & 0.040 & 0.062 & 0.040 & 1.065\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.743 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.076\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.844 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.116\\ +\hspace{1em}chisq10 & 0.999 & 1.000 & 1.000 & 1.000 & 1.000 & 0.824 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.082\\ 
+\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.825 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.105\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Epps}}\\ +\hspace{1em}N & 0.048 & 0.046 & 0.056 & 0.065 & 0.050 & 0.905 & 0.034 & 0.038 & 0.046 & 0.033 & 0.059 & 1.182\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.931 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.294\\ +\hspace{1em}t3 & 0.991 & 0.994 & 0.996 & 0.997 & 0.985 & 0.936 & 1.000 & 0.998 & 1.000 & 1.000 & 0.999 & 1.235\\ +\hspace{1em}chisq10 & 0.924 & 0.991 & 0.999 & 0.991 & 0.969 & 0.917 & 0.997 & 1.000 & 1.000 & 1.000 & 1.000 & 1.202\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.873 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.239\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Random Projections}}\\ +\hspace{1em}N & 0.044 & 0.043 & 0.040 & 0.040 & 0.048 & 2.723 & 0.021 & 0.027 & 0.043 & 0.043 & 0.047 & 4.544\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.759 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.588\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.755 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.531\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 0.998 & 2.782 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.520\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.843 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 4.527\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Psaradakis and Vavra}}\\ +\hspace{1em}N & 0.048 & 0.050 & 0.045 & 0.053 & 0.039 & 26.957 & 0.055 & 0.045 & 0.047 & 0.043 & 0.033 & 37.993\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.209 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.282\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 26.599 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.642\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.418 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 37.731\\ +\hspace{1em}Gamma(7,1) & 
1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 27.659 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 38.232\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Lobato}}\\ +\hspace{1em}N & 0.055 & 0.048 & 0.053 & 0.037 & 0.035 & 53.110 & 0.050 & 0.046 & 0.067 & 0.049 & 0.047 & 72.528\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.632 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 71.845\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.763 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 71.454\\ +\hspace{1em}chisq10 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 52.455 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 73.413\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 53.204 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 72.253\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{Bootstrap Epps}}\\ +\hspace{1em}N & 0.051 & 0.043 & 0.033 & 0.043 & 0.051 & 78.920 & 0.055 & 0.054 & 0.056 & 0.044 & 0.064 & 101.883\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 78.194 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 101.753\\ +\hspace{1em}t3 & 0.979 & 0.995 & 0.998 & 0.996 & 0.985 & 79.735 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 100.766\\ +\hspace{1em}chisq10 & 0.911 & 0.986 & 0.996 & 0.995 & 0.945 & 80.841 & 0.997 & 1.000 & 1.000 & 1.000 & 0.998 & 101.250\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 78.688 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 101.360\\ +\addlinespace[0.3em] +\multicolumn{13}{l}{\textbf{El Bouch}}\\ +\hspace{1em}N & 0.065 & 0.053 & 0.047 & 0.061 & 0.059 & 1.419 & 0.055 & 0.064 & 0.051 & 0.048 & 0.045 & 2.467\\ +\hspace{1em}logN & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.435 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.500\\ +\hspace{1em}t3 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 1.453 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.492\\ +\hspace{1em}chisq10 & 0.100 & 0.609 & 0.871 & 0.609 & 0.076 & 1.439 & 0.176 & 0.858 & 0.984 & 0.865 & 0.173 & 2.470\\ +\hspace{1em}Gamma(7,1) & 1.000 & 1.000 
& 1.000 & 1.000 & 1.000 & 1.444 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 2.483\\ +\bottomrule +\end{tabular}} +\end{table} + +Tables \ref{tab:tab1-static} and \ref{tab:tab2-static} present the rejection rate estimates. For every process of length \(n,\) the columns represent the used \(AR(1)\) parameter and the rows the distribution used to draw the process. The obtained results are consistent with those obtained in the publications where the different tests were proposed. As expected, rejection rates are around 0.05 when the data is drawn from a standard normal distribution, as in this case the data is drawn from a Gaussian process. Conversely, high rejection rates are registered for the other distributions. Low rejection rates are observed, however, for the \(\chi^2(10)\) distribution when making use of some of the tests. For instance, the \emph{Epps} and \emph{bootstrap Epps} tests, although they consistently tend to 1 when the length of the process, \(n,\) increases. Another case is the El Bouch test. However, this one maintains low rates for large values of \(|\phi|\) when \(n\) increases. Furthermore, for the random projections test, the number of projections used in this study is the default \(k = 1,\) which is by far a lower number than the recommended by Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014). However, even in these conditions, the obtained results are satisfactory, with the random projection test having even better performance than the tests of Epps (1987) or Psaradakis and Vávra (2017). + +An important aspect in selecting a procedure is its computation time. Thus, for each length of the process, \(n,\) there is an additional column, max.phi, in \emph{Tables} \ref{tab:tab1-static} and \ref{tab:tab2-static}. Each entry in this column refers to a different distribution and contains the maximum running time in seconds to obtain the rejection rate among the different values of the AR parameter. 
That is, for a fixed distribution, the rejection rates are computed for each of the five possibilities of \(\phi\) and the time that it takes is recorded. The running time in the table is the largest among the five. Furthermore, in \textit{Table} \ref{tab:tab3-static} we show the time in seconds that each studied test takes to check whether a given process is Gaussian. In particular, the table contains the average running time over 1,000 trials that it takes to generate and check a Gaussian AR(1) process with parameter \(\phi = 0.5\). This is done for different sample sizes, \(n \in \{1000, 2000, 3000, 4000, 5000\}.\) According to the table, the asymptotic tests (Lobato and Velasco, Epps, random projections and El Bouch) have similar running times. On the contrary, the bootstrap based tests (Psaradakis and Vavra, Bootstrap Epps and Lobato and Velasco) have, as expected, higher running times on average. Furthermore, Tables \ref{tab:tab1-static} and \ref{tab:tab2-static} show similar results in time performance. There, the maximum running time of the bootstrap based tests exceeds by more than ten seconds the time obtained with the asymptotic based tests. It is worth saying that the tables have been obtained with R version 4.3.1 (2023-06-16) and platform aarch64-apple-darwin20 (64-bit), running under macOS Sonoma 14.2.1.
+
+\begin{table}
+
+\caption{\label{tab:tab3-static}Average running time in seconds, over 1000 iterations, to test the null hypothesis of Gaussianity for each of the studied tests (first column) and different sample sizes, $n=1000$ (second column), $n=2000$ (third column), $n=3000$ (fourth column), $n=4000$ (fifth column) and $n=5000$ (sixth column).
Each iteration makes use of a Gaussian AR(1) process with parameter $\phi = 0.5.$}
+\centering
+\begin{tabular}[t]{lrrrrr}
+\toprule
+tests & n = 1000 & n = 2000 & n = 3000 & n = 4000 & n = 5000\\
+\midrule
+Lobato and Velasco & 0.0010 & 0.0014 & 0.0020 & 0.0026 & 0.0035\\
+Epps & 0.0010 & 0.0015 & 0.0021 & 0.0027 & 0.0035\\
+Random Projections & 0.0026 & 0.0045 & 0.0063 & 0.0082 & 0.0105\\
+El Bouch & 0.0023 & 0.0046 & 0.0074 & 0.0109 & 0.0152\\
+Psaradakis and Vavra & 0.0286 & 0.0429 & 0.0565 & 0.0012 & 0.0014\\
+\addlinespace
+Bootstrap Lobato & 0.0542 & 0.0014 & 0.0019 & 0.0025 & 0.0032\\
+Bootstrap Epps & 0.0013 & 0.0018 & 0.0023 & 0.0029 & 0.0037\\
+\bottomrule
+\end{tabular}
+\end{table}
+
+\hypertarget{real-data-application}{%
+\subsection{Real data application}\label{real-data-application}}
+
+As an illustrative example, we analyze the monthly mean carbon dioxide, in parts per million (\emph{ppm}), measured at the Mauna Loa Observatory, in Hawaii, from March 1958 to November 2018. The carbon dioxide data measured as the mole fraction in dry air on Mauna Loa constitute the longest record of direct measurements of \(CO_2\) in the atmosphere. This dataset is available in the \CRANpkg{astsa} package (Stoffer 2020) under the name \emph{cardox} data and it is displayed in the left panel of Figure \ref{fig:fig1-static}. The plot's grid is created using the \CRANpkg{cowplot} package (Wilke 2020).
+
+The objective of this subsection is to propose a model to analyze this time series and check the assumptions on the residuals of the model using our implemented \texttt{check\_residuals()} function. The time series clearly has trend and seasonal components (see left panel of Figure \ref{fig:fig1-static}), therefore, an adequate model that filters both components has to be selected. We make use of an ETS model. For its implementation, we make use of the \texttt{ets()} function from the \CRANpkg{forecast} package (R. Hyndman and Khandakar 2008).
This function fits 32 different ETS models and selects the best model according to information criteria such as \emph{Akaike's information criterion} (AIC) or \emph{Bayesian Information criteria} (BIC) (Chen and Chen 2008).
+The results provided by the \texttt{ets()} function are:
+
+\begin{figure}

+
+{\centering \includegraphics[width=0.75\linewidth]{figures/fig1-static-1}
+
+}
+
+\caption{Left panel: CO2 Levels at Mauna Loa, time-series plot. The cardox data show a positive tendency and strong seasonality. Right panel: forecast of the next 12 months for the CO2 levels at Mauna Loa, the model's predictions capture the time-series behaviour.}\label{fig:fig1-static}
+\end{figure}
+
+
+
+\begin{verbatim}
+library(forecast)
+library(astsa)
+model = ets(cardox)
+summary(model)
+#> ETS(M,A,A)
+#>
+#> Call:
+#> ets(y = cardox)
+#>
+#> Smoothing parameters:
+#> alpha = 0.5451
+#> beta = 0.0073
+#> gamma = 0.1076
+#>
+#> Initial states:
+#> l = 314.4546
+#> b = 0.0801
+#> s = 0.6986 0.0648 -0.8273 -1.8999 -3.0527 -2.7629
+#> -1.2769 0.7015 2.1824 2.6754 2.3317 1.165
+#>
+#> sigma: 9e-04
+#>
+#> AIC AICc BIC
+#> 3429.637 3430.439 3508.867
+#>
+#> Training set error measures:
+#> ME RMSE MAE MPE MAPE MASE
+#> Training set 0.018748 0.3158258 0.2476335 0.005051657 0.06933903 0.152935
+#> ACF1
+#> Training set 0.09308391
+\end{verbatim}
+
+The resulting model, proposed by the \texttt{ets()} function, for analyzing the \emph{carbon dioxide} data in \emph{Mauna Loa} is an \(ETS[M,A,A]\) model. The parameters \(\alpha, \beta \text{ and } \gamma\) (see Definition 1) have been estimated using the least squares method. If the assumptions on the model are satisfied, then the errors of the model behave like a Gaussian stationary process. To check it, we make use of the function \texttt{check\_residuals()}. For more details on the compatibility of this function with the models obtained by other packages see the \CRANpkg{nortsTest} repository.
In the following, we display the results of using the \emph{Augmented Dickey-Fuller} test (\emph{Subsection 3.1}) to check the stationary assumption and the \emph{random projection} test with \texttt{k\ =\ 1} projections to check the normality assumption. For the other test options see the function's documentation. + +\begin{verbatim} +check_residuals(model,unit_root = "adf",normality = "rp", + plot = TRUE) +\end{verbatim} + +\begin{verbatim} +#> +#> *************************************************** +#> +#> Unit root test for stationarity: +#> +#> Augmented Dickey-Fuller Test +#> +#> data: y +#> Dickey-Fuller = -9.8935, Lag order = 9, p-value = 0.01 +#> alternative hypothesis: stationary +#> +#> +#> Conclusion: y is stationary +#> *************************************************** +#> +#> Goodness of fit test for Gaussian Distribution: +#> +#> k random projections test. +#> +#> data: y +#> k = 1, p.value adjust = Benjamini & Yekutieli, p-value = 1 +#> alternative hypothesis: y does not follow a Gaussian Process +#> +#> +#> Conclusion: y follows a Gaussian Process +#> +#> *************************************************** +\end{verbatim} + +The obtained results indicate that the null hypothesis of non stationarity is rejected at significance level \(\alpha = 0.01.\) Additionally, there is no evidence to reject the null hypothesis of normality at significance level \(\alpha = 0.05.\) Consequently, we conclude that the residuals follow a stationary Gaussian process, having that the resulting \(ETS[M,A,A]\) model adjusts well to the \emph{carbon dioxide} data in \emph{Mauna Loa}. + +In the above displayed \texttt{check\_residuals()} function, the \texttt{plot} argument is set to \texttt{TRUE}. The resulting plots are shown in Figure \ref{fig:fig2-static}. The plot in the \emph{top} panel and the auto-correlation plots in the bottom panels insinuate that the residuals have a stationary behavior. 
The \emph{top} panel plot shows slight oscillations around zero and the auto-correlations functions in the \emph{bottom} panels have values close to zero in every lag. The histogram and qq-plot in the \emph{middle} panels suggest that the marginal distribution of the residuals is normally distributed. Therefore, Figure \ref{fig:fig2-static} agrees with the reported results, indicating that the assumptions of the model are satisfied. + +\begin{figure} + +{\centering \includegraphics[width=1\linewidth]{figures/fig2-static-1} + +} + +\caption{Check residuals plot for the ETS(M,A,A) model. The upper panel shows the residuals time-series plot, showing small oscillations around zero, which insinuates stationarity. The middle plots are the residuals histogram (middle-left) and quantile-quantile plot (middle-right), both plots suggest that the residuals have a normal distribution. The lower panel shows the autocorrelation functions, for both plots, the autocorrelations are close to zero giving the impression of stationarity.}\label{fig:fig2-static} +\end{figure} + + + +As the assumptions of the model have been checked, it can be used for instance to forecast. The result of applying the following function is displayed in Figure \ref{fig:fig1-static}. It presents the carbon dioxide data for the last 8 years and a forecast of the next 12 months. It is observable from the plot that the model captures the process trend and periodicity. 
+ +\begin{verbatim} +autoplot(forecast(model,h = 12),include = 100, + xlab = "years",ylab = "CO2 (ppm)", + main = "Forecast: Carbon Dioxide Levels at Mauna Loa") +\end{verbatim} + + + +\hypertarget{conclusions}{% +\section{Conclusions}\label{conclusions}} + +For independent data, the \CRANpkg{nortest} package (Gross and Ligges 2015) provides five different tests for normality, the \CRANpkg{mvnormtest} package (Jarek 2012) performs the Shapiro-Wilks test for multivariate data and the \CRANpkg{MissMech} package (Jamshidian, Jalal, and Jansen 2014) provides tests for normality in multivariate incomplete data. To test the normality of dependent data, some authors such as Psaradakis and Vávra (2017) and Nieto-Reyes, Cuesta-Albertos, and Gamboa (2014) have available undocumented \texttt{Matlab} code, which is almost only helpful in re-doing their simulation studies. + +To our knowledge, no consistent implementation or package of tests for normality of stationary processes has been done before. Therefore, the \CRANpkg{nortsTest} is the first package to implement normality tests in stationary processes. This work gives a general overview of a careful selection of tests for normality in the stationary process, which consists of the most available types of tests. It additionally provides examples that illustrate each of the test implementations. + +For checking the model's assumptions, the \CRANpkg{forecast} and \CRANpkg{astsa} packages contain functions for visual diagnostic. Following the same idea, \CRANpkg{nortsTest} provides similar diagnostic methods; it also reports the results of testing stationarity and normality, the main assumptions for the residuals in time series analysis. + +\hypertarget{future-work-and-projects}{% +\section{Future work and projects}\label{future-work-and-projects}} + +A further version of the \CRANpkg{nortsTest} package will incorporate additional tests such as Bispectral (Hinich 1982) and Stein's characterization (Bontemps and Meddahi 2005). 
Further future work will include a Bayesian version of a \emph{residuals check} procedure that uses the random projection method. Any future version under development can be installed from \texttt{GitHub} using the following code. + +\begin{verbatim} +if (!requireNamespace("remotes")) install.packages("remotes") +remotes::install_github("asael697/nortsTest",dependencies = TRUE) +\end{verbatim} + +\hypertarget{acknowledgment}{% +\section*{Acknowledgment}\label{acknowledgment}} +\addcontentsline{toc}{section}{Acknowledgment} + +This work was supported by grant PID2022-139237NB-I00 funded by ``ERDF A way of making Europe'' and MCIN/AEI/10.13039/501100011033. + +\hypertarget{references}{% +\section*{References}\label{references}} +\addcontentsline{toc}{section}{References} + +\hypertarget{refs}{} +\begin{CSLReferences}{1}{0} +\leavevmode\vadjust pre{\hypertarget{ref-vonMisses1962}{}}% +Anderson, T. W. 1962. {``{On the distribution of the two-sample Cramer-von Mises criterion}.''} \emph{The Annals of Mathematical Statistics} 33 (3): 1148--59. \url{https://doi.org/10.1214/aoms/1177704477}. + +\leavevmode\vadjust pre{\hypertarget{ref-anderson1952}{}}% +Anderson, T. W., and D. A. Darling. 1952. {``Asymptotic Theory of Certain Goodness of Fit Criteria Based on Stochastic Processes.''} \emph{Annals of Mathematical Statistics} 23 (2): 193--212. \url{https://doi.org/10.1214/aoms/1177729437}. + +\leavevmode\vadjust pre{\hypertarget{ref-bai2005}{}}% +Bai, Jushan, and Serena Ng. 2005. {``Tests for Skewness, Kurtosis, and Normality for Time Series Data.''} \emph{Journal of Business \& Economic Statistics} 23 (1): 49--60. \url{https://doi.org/10.1198/073500104000000271}. + +\leavevmode\vadjust pre{\hypertarget{ref-Hegy1993}{}}% +Beaulieu, Joseph, and Jeffrey A. Miron. 1993. {``Seasonal Unit Roots in Aggregate {U.S.} Data.''} \emph{Journal of Econometrics} 55 (1): 305--28. \url{https://doi.org/10.1016/0304-4076(93)90018-Z}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-Benjamin1995}{}}% +Benjamini, Yoav, and Yosef Hochberg. 1995. {``Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing.''} \emph{Journal of the Royal Statistical Society. Series B (Methodological)} 57 (1): 289--300. \url{http://www.jstor.org/stable/2346101}. + +\leavevmode\vadjust pre{\hypertarget{ref-Benjamin2001}{}}% +Benjamini, Yoav, and Daniel Yekutieli. 2001. {``The Control of the False Discovery Rate in Multiple Testing Under Dependency.''} \emph{The Annals of Statistics} 29 (4): 1165--88. \url{http://www.jstor.org/stable/2674075}. + +\leavevmode\vadjust pre{\hypertarget{ref-Berg2010}{}}% +Berg, Arthur, Efstathios Paparoditis, and Dimitris N. Politis. 2010. {``A Bootstrap Test for Time Series Linearity.''} \emph{Journal of Statistical Planning and Inference} 140 (12): 3841--57. \url{https://doi.org/10.1016/j.jspi.2010.04.047}. + +\leavevmode\vadjust pre{\hypertarget{ref-Bollerslev1986}{}}% +Bollerslev, Tim. 1986. {``Generalized Autoregressive Conditional Heteroskedasticity.''} \emph{Journal of Econometrics} 31 (3): 307--27. \url{https://doi.org/10.1016/0304-4076(86)90063-1}. + +\leavevmode\vadjust pre{\hypertarget{ref-Meddahi2005}{}}% +Bontemps, Christian, and Nour Meddahi. 2005. {``Testing Normality: A GMM Approach.''} \emph{Journal of Econometrics} 124 (1): 149--86. \url{https://doi.org/10.1016/j.jeconom.2004.02.014}. + +\leavevmode\vadjust pre{\hypertarget{ref-Box}{}}% +Box, G. E. P., and David A. Pierce. 1970. {``Distribution of Residual Autocorrelations in Autoregressive-Integrated Moving Average Time Series Models.''} \emph{Journal of the American Statistical Association} 65 (332): 1509--26. \url{https://doi.org/10.1080/01621459.1970.10481180}. + +\leavevmode\vadjust pre{\hypertarget{ref-Box1990}{}}% +Box, George Edward Pelham, and Gwilym Jenkins. 1990. \emph{Time Series Analysis, Forecasting and Control}. USA: Holden-Day, Inc. 
\url{https://www.wiley.com/en-us/Time+Series+Analysis}. + +\leavevmode\vadjust pre{\hypertarget{ref-Buhlmann1997}{}}% +Bühlmann, Peter. 1997. {``Sieve Bootstrap for Time Series.''} \emph{Bernoulli} 3 (2): 123--48. \url{http://www.jstor.org/stable/3318584}. + +\leavevmode\vadjust pre{\hypertarget{ref-ch1995}{}}% +Canova, Fabio, and Bruce E. Hansen. 1995. {``Are Seasonal Patterns Constant over Time? A Test for Seasonal Stability.''} \emph{Journal of Business \& Economic Statistics} 13 (3): 237--52. \url{https://doi.org/10.1080/07350015.1995.10524598}. + +\leavevmode\vadjust pre{\hypertarget{ref-BIC2006}{}}% +Chen, Jiahua, and Zehua Chen. 2008. {``Extended Bayesian Information Criteria for Model Selection with Large Model Spaces.''} \emph{Biometrika} 95 (3): 759--71. \url{https://doi.org/10.1093/biomet/asn034}. + +\leavevmode\vadjust pre{\hypertarget{ref-Cuesta2007}{}}% +Cuesta-Albertos, J. A., E. del Barrio, R. Fraiman, and C. Matrán. 2007. {``The Random Projection Method in Goodness of Fit for Functional Data.''} \emph{Computational Statistics \& Data Analysis} 51 (10): 4814--31. \url{https://doi.org/10.1016/j.csda.2006.09.007}. + +\leavevmode\vadjust pre{\hypertarget{ref-Dagostino1987}{}}% +D'Agostino, Ralph B., and Michael A. Stephens. 1986. {``Goodness-of-Fit Techniques.''} \emph{Quality and Reliability Engineering International} 3 (1): 71--71. \url{https://doi.org/10.1002/qre.4680030121}. + +\leavevmode\vadjust pre{\hypertarget{ref-Wilkinson1986}{}}% +Dallal, Gerard E., and Leland Wilkinson. 1986. {``An Analytic Approximation to the Distribution of Lilliefors's Test Statistic for Normality.''} \emph{The American Statistician} 40 (4): 294--96. \url{https://doi.org/10.1080/00031305.1986.10475419}. + +\leavevmode\vadjust pre{\hypertarget{ref-DH2008}{}}% +Doornik, Jurgen A., and Henrik Hansen. 2008. {``{An omnibus test for univariate and multivariate normality}.''} \emph{Oxford Bulletin of Economics and Statistics} 70 (s1): 927--39. 
\url{https://doi.org/10.1111/j.1468-0084.2008.} + +\leavevmode\vadjust pre{\hypertarget{ref-el2022normality}{}}% +El Bouch, Sara, Olivier Michel, and Pierre Comon. 2022. {``A Normality Test for Multivariate Dependent Samples.''} \emph{Signal Processing} 201: 108705. \url{https://doi.org/10.1016/j.sigpro.2022.108705}. + +\leavevmode\vadjust pre{\hypertarget{ref-engle1982}{}}% +Engle, Robert F. 1982. {``Autoregressive Conditional Heteroscedasticity with Estimates of the Variance of United Kingdom Inflation.''} \emph{Econometrica} 50 (4): 987--1007. \url{http://www.jstor.org/stable/1912773}. + +\leavevmode\vadjust pre{\hypertarget{ref-epps1987}{}}% +Epps, T. W. 1987. {``Testing That a Stationary Time Series Is {G}aussian.''} \emph{The Annals of Statistics} 15 (4): 1683--98. \url{https://doi.org/10.1214/aos/1176350618}. + +\leavevmode\vadjust pre{\hypertarget{ref-Gasser1975}{}}% +Gasser, Theo. 1975. {``Goodness-of-Fit Tests for Correlated Data.''} \emph{Biometrika} 62 (3): 563--70. \url{http://www.jstor.org/stable/2335511}. + +\leavevmode\vadjust pre{\hypertarget{ref-gelman2013}{}}% +Gelman, A., J. B. Carlin, H. S. Stern, D. B. Dunson, A. Vehtari, and D. B. Rubin. 2013. \emph{Bayesian Data Analysis, Third Edition}. Chapman \& Hall/CRC Texts in Statistical Science. Taylor \& Francis. \url{https://books.google.nl/books?id=ZXL6AQAAQBAJ}. + +\leavevmode\vadjust pre{\hypertarget{ref-nortest2015}{}}% +Gross, Juergen, and Uwe Ligges. 2015. \emph{`Nortest`: Tests for Normality}. \url{https://CRAN.R-project.org/package=nortest}. + +\leavevmode\vadjust pre{\hypertarget{ref-HZ1990}{}}% +Henze, N., and B. Zirkler. 1990. {``A Class of Invariant Consistent Tests for Multivariate Normality.''} \emph{Communications in Statistics - Theory and Methods} 19 (10): 3595--3617. \url{https://doi.org/10.1080/03610929008830400}. + +\leavevmode\vadjust pre{\hypertarget{ref-Hinich1982}{}}% +Hinich, Melvin J. 1982. 
{``Testing for {G}aussianity and Linearity of a Stationary Time Series.''} \emph{Journal of Time Series Analysis} 3 (3): 169--76. \url{https://doi.org/10.1111/j.1467-9892.1982.tb00339}. + +\leavevmode\vadjust pre{\hypertarget{ref-Holt2004}{}}% +Holt, Charles C. 2004. {``Forecasting Seasonals and Trends by Exponentially Weighted Moving Averages.''} \emph{International Journal of Forecasting} 20 (1): 5--10. \url{https://doi.org/10.1016/j.ijforecast.2003.09.015}. + +\leavevmode\vadjust pre{\hypertarget{ref-hong1999hypothesis}{}}% +Hong, Yongmiao. 1999. {``Hypothesis Testing in Time Series via the Empirical Characteristic Function: A Generalized Spectral Density Approach.''} \emph{Journal of the American Statistical Association} 94 (448): 1201--20. \url{https://doi.org/10.2307/2669935}. + +\leavevmode\vadjust pre{\hypertarget{ref-Hyndman2008}{}}% +Hyndman, Robin John, Anne B Koehler, J Keith Ord, and Ralph David Snyder. 2008. \emph{Forecasting with Exponential Smoothing: The State Space Approach}. Springer. \url{https://doi.org/10.1111/j.1751-5823.2009.00085_17}. + +\leavevmode\vadjust pre{\hypertarget{ref-Rob2007}{}}% +Hyndman, Rob, and Yeasmin Khandakar. 2008. {``Automatic Time Series Forecasting: The `Forecast` Package for {`R`}.''} \emph{Journal of Statistical Software, Articles} 27 (3): 1--22. \url{https://doi.org/10.18637/jss.v027.i03}. + +\leavevmode\vadjust pre{\hypertarget{ref-Mortaza2014}{}}% +Jamshidian, Mortaza, Siavash Jalal, and Camden Jansen. 2014. {```MissMech`: An {`R`} Package for Testing Homoscedasticity, Multivariate Normality, and Missing Completely at Random (MCAR).''} \emph{Journal of Statistical Software} 56 (6): 1--31. \url{http://www.jstatsoft.org/v56/i06/}. + +\leavevmode\vadjust pre{\hypertarget{ref-mvnormtest2012}{}}% +Jarek, Slawomir. 2012. \emph{`Mvnormtest`: Normality Test for Multivariate Variables}. \url{https://CRAN.R-project.org/package=mvnormtest}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-jarque1980}{}}% +Jarque, Carlos M., and Anil K. Bera. 1980. {``Efficient Tests for Normality, Homoscedasticity and Serial Independence of Regression Residuals.''} \emph{Economics Letters} 6 (3): 255--59. \url{https://doi.org/10.1016/0165-1765(80)90024-5}. + +\leavevmode\vadjust pre{\hypertarget{ref-KppsI1992}{}}% +Kwiatkowski, Denis, Peter C. B. Phillips, Peter Schmidt, and Yongcheol Shin. 1992. {``Testing the Null Hypothesis of Stationarity Against the Alternative of a Unit Root: How Sure Are We That Economic Time Series Have a Unit Root?''} \emph{Journal of Econometrics} 54 (1): 159--78. \url{https://doi.org/10.1016/0304-4076(92)90104-Y}. + +\leavevmode\vadjust pre{\hypertarget{ref-Lobato2004}{}}% +Lobato, Ignacio, and Carlos Velasco. 2004. {``A Simple Test of Normality for Time Series.''} \emph{Econometric Theory} 20 (August): 671--89. \url{https://doi.org/10.1017/S0266466604204030}. + +\leavevmode\vadjust pre{\hypertarget{ref-Lomincki1961}{}}% +Lomnicki, Z. 1961. {``Tests for Departure from Normality in the Case of Linear Stochastic Processes.''} \emph{Metrika: International Journal for Theoretical and Applied Statistics} 4 (1): 37--62. \url{https://EconPapers.repec.org/RePEc:spr:metrik:v:4:y:1961:i:1:p:37-62}. + +\leavevmode\vadjust pre{\hypertarget{ref-uroot}{}}% +López-de-Lacalle, Javier. 2019. \emph{`Uroot`: Unit Root Tests for Seasonal Time Series}. \url{https://CRAN.R-project.org/package=uroot}. + +\leavevmode\vadjust pre{\hypertarget{ref-mardia1970measures}{}}% +Mardia, Kanti V. 1970. {``Measures of Multivariate Skewness and Kurtosis with Applications.''} \emph{Biometrika} 57 (3): 519--30. \url{http://www.jstor.org/stable/2334770}. + +\leavevmode\vadjust pre{\hypertarget{ref-meintanis2016review}{}}% +Meintanis, Simos G. 2016. {``A Review of Testing Procedures Based on the Empirical Characteristic Function.''} \emph{South African Statistical Journal} 50 (1): 1--14. \url{https://doi.org/10.10520/EJC186846}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-moulines1992testing}{}}% +Moulines, E, K Choukri, and M Sharbit. 1992. {``Testing That a Multivariate Stationary Time-Series Is {G}aussian.''} In \emph{{[}1992{]} IEEE Sixth SP Workshop on Statistical Signal and Array Processing}, 185--88. IEEE. \url{https://doi.org/10.1109/SSAP.1992.246818}. + +\leavevmode\vadjust pre{\hypertarget{ref-Nieto-Reyes:2022-1}{}}% +Nieto-Reyes, Alicia. 2021. {``On the Non-{G}aussianity of the Height of Sea Waves.''} \emph{Journal of Marine Science and Engineering} 9 (12). \url{https://www.mdpi.com/2077-1312/9/12/1446}. + +\leavevmode\vadjust pre{\hypertarget{ref-Nieto-Reyes:2022-2}{}}% +---------. 2022. {``On the Non-{G}aussianity of Sea Surface Elevations.''} \emph{Journal of Marine Science and Engineering} 10 (9). \url{https://doi.org/10.3390/jmse10091303}. + +\leavevmode\vadjust pre{\hypertarget{ref-nietoreyes2014}{}}% +Nieto-Reyes, Alicia, Juan Antonio Cuesta-Albertos, and Fabrice Gamboa. 2014. {``A Random-Projection Based Test of {G}aussianity for Stationary Processes.''} \emph{Computational Statistics \& Data Analysis} 75: 124--41. \url{https://doi.org/10.1016/j.csda.2014.01.013}. + +\leavevmode\vadjust pre{\hypertarget{ref-ocsb1988}{}}% +Osborn, Denise R., A. P. L. Chui, Jeremy P. Smith, and C. R. Birchenhall. 1988. {``Seasonality and the Order of Integration for Consumption.''} \emph{Oxford Bulletin of Economics and Statistics} 50 (4): 361--77. \url{https://doi.org/10.1111/j.1468-0084.1988.mp50004002.x}. + +\leavevmode\vadjust pre{\hypertarget{ref-Pearson1895}{}}% +Pearson, Karl, and Olaus Magnus Friedrich Erdmann Henrici. 1895. {``X. {C}ontributions to the Mathematical Theory of Evolution.-{II} {S}kew Variation in Homogeneous Material.''} \emph{Philosophical Transactions of the Royal Society of London. (A.)} 186: 343--414. \url{https://doi.org/10.1098/rsta.1895.0010}. + +\leavevmode\vadjust pre{\hypertarget{ref-Perron1988}{}}% +Perron, Pierre. 1988. 
{``Trends and Random Walks in Macroeconomic Time Series: Further Evidence from a New Approach.''} \emph{Journal of Economic Dynamics and Control} 12 (2): 297--332. \url{https://doi.org/10.1016/0165-1889(88)90043-7}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-OBrien2010}{}}%
+Petris, Giovanni, Sonia Petrone, and Patrizia Campagnoli. 2007. {``Dynamic Linear Models with {`R`}.''} Berlin: Springer. \url{https://doi.org/10.1111/j.1751-5823.2010.00109_26.x}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-MarianZach2017}{}}%
+Psaradakis, Zacharias. 2017. {``Normality Tests for Dependent Data.''} Working and Discussion Papers WP 12/2017. Research Department, National Bank of Slovakia. \url{https://ideas.repec.org/p/svk/wpaper/1053.html}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-vavra2017}{}}%
+Psaradakis, Zacharias, and Marián Vávra. 2017. {``A Distance Test of Normality for a Wide Class of Stationary Processes.''} \emph{Econometrics and Statistics} 2: 50--60. \url{https://doi.org/10.1016/j.ecosta.2016.11.005}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-psaradakis2020normality}{}}%
+---------. 2020. {``Normality Tests for Dependent Data: Large-Sample and Bootstrap Approaches.''} \emph{Communications in Statistics-Simulation and Computation} 49 (2): 283--304. \url{https://doi.org/10.1080/03610918.2018.1485941}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-mvntest}{}}%
+Pya, Natalya, Vassilly Voinov, Rashid Makarov, and Yevgeniy Voinov. 2016. \emph{`mvnTest`: Goodness of Fit Tests for Multivariate Normality}. \url{https://CRAN.R-project.org/package=mvnTest}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-aTSA}{}}%
+Qiu, Debin. 2015. \emph{`aTSA`: Alternative Time Series Analysis}. \url{https://CRAN.R-project.org/package=aTSA}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-Royston1982}{}}%
+Royston, J. P. 1982. {``An Extension of {S}hapiro and {W}ilk's {W} Test for Normality to Large Samples.''} \emph{Journal of the Royal Statistical Society. 
Series C (Applied Statistics)} 31 (2): 115--24. \url{http://www.jstor.org/stable/2347973}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-Royston1992}{}}%
+---------. 1992. {``Approximating the Shapiro-Wilk {W}-Test for Non-Normality.''} \emph{Journal of Statistics and Computing} 2 (3): 117--19. \url{https://doi.org/10.1007/BF01891203}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-Royston1993}{}}%
+Royston, Patrick. 1993. {``A Pocket-Calculator Algorithm for the {S}hapiro-{F}rancia Test for Non-Normality: An Application to Medicine.''} \emph{Statistics in Medicine} 12 (2): 181--84. \url{https://doi.org/10.1002/sim.4780120209}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-dickey1984}{}}%
+Said, Said E., and David A. Dickey. 1984. {``Testing for Unit Roots in Autoregressive-Moving Average Models of Unknown Order.''} \emph{Biometrika} 71 (3): 599--607. \url{https://doi.org/10.1093/biomet/71.3.599}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-SWtest1965}{}}%
+Shapiro, S. S., and M. B. Wilk. 1965. {``An Analysis of Variance Test for Normality (Complete Samples).''} \emph{Biometrika} 52 (3-4): 591--611. \url{https://doi.org/10.1093/biomet/52.3-4.591}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-shumway2010}{}}%
+Shumway, R. H., and D. S. Stoffer. 2010. \emph{Time Series Analysis and Its Applications: With {`R`} Examples}. Springer Texts in Statistics. Springer New York. \url{https://books.google.es/books?id=dbS5IQ8P5gYC}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-Smirnov1948}{}}%
+Smirnov, N. 1948. {``Table for Estimating the Goodness of Fit of Empirical Distributions.''} \emph{Annals of Mathematical Statistics} 19 (2): 279--81. \url{https://doi.org/10.1214/aoms/1177730256}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-Steinberg1992}{}}%
+Steinberg, Y., and O. Zeitouni. 1992. {``On Tests for Normality.''} \emph{IEEE Transactions on Information Theory} 38 (6): 1779--87. \url{https://doi.org/10.1109/18.165450}. 
+ +\leavevmode\vadjust pre{\hypertarget{ref-astsa}{}}% +Stoffer, David. 2020. \emph{`Astsa`: Applied Statistical Time Series Analysis}. \url{https://CRAN.R-project.org/package=astsa}. + +\leavevmode\vadjust pre{\hypertarget{ref-R}{}}% +Team, `R` Core. 2018. \emph{{`R`}: A Language and Environment for Statistical Computing}. Vienna, Austria: {`R`} Foundation for Statistical Computing. \url{https://www.R-project.org/}. + +\leavevmode\vadjust pre{\hypertarget{ref-tseries}{}}% +Trapletti, Adrian, and Kurt Hornik. 2019. \emph{`Tseries`: Time Series Analysis and Computational Finance}. \url{https://CRAN.R-project.org/package=tseries}. + +\leavevmode\vadjust pre{\hypertarget{ref-Ts2010}{}}% +Tsay, R. 2010. \emph{Analysis of Financial Time Series}. Second. Chicago: Wiley-Interscience. \url{https://doi.org/10.1002/0471264105}. + +\leavevmode\vadjust pre{\hypertarget{ref-S2_2016}{}}% +Vassilly Voinov, Rashid Makarov, Natalie Pya, and Yevgeniy Voinov. 2016. {``New Invariant and Consistent Chi-Squared Type Goodness-of-Fit Tests for Multivariate Normality and a Related Comparative Simulation Study.''} \emph{Communications in Statistics - Theory and Methods} 45 (11): 3249--63. \url{https://doi.org/10.1080/03610926.2014.901370}. + +\leavevmode\vadjust pre{\hypertarget{ref-W2006}{}}% +Wasserman, Larry. 2006. \emph{All of Nonparametric Statistics}. New York: Springer. \url{https://doi.org/10.1007/0-387-30623-4}. + +\leavevmode\vadjust pre{\hypertarget{ref-west2006}{}}% +West, M., and J. Harrison. 2006. \emph{Bayesian Forecasting and Dynamic Models}. Springer Series in Statistics. Springer New York. \url{https://books.google.nl/books?id=0mPgBwAAQBAJ}. + +\leavevmode\vadjust pre{\hypertarget{ref-ggplot2}{}}% +Wickham, Hadley. 2009. \emph{`Ggplot2`: Elegant Graphics for Data Analysis}. Springer-Verlag New York. \url{http://ggplot2.org}. + +\leavevmode\vadjust pre{\hypertarget{ref-cowplot}{}}% +Wilke, Claus O. 2020. 
\emph{`Cowplot`: Streamlined Plot Theme and Plot Annotations for `Ggplot2`}. \url{https://CRAN.R-project.org/package=cowplot}.
+
+\leavevmode\vadjust pre{\hypertarget{ref-fGarch}{}}%
+Wuertz, Diethelm, Tobias Setz, Yohan Chalabi, Chris Boudt, Pierre Chausse, and Michal Miklovac. 2017. \emph{`fGarch`: Rmetrics - Autoregressive Conditional Heteroskedastic Modelling}. \url{https://CRAN.R-project.org/package=fGarch}.
+
+\end{CSLReferences}
+
+\bibliography{RJreferences.bib}
+
+\address{%
+Asael Alonzo Matamoros\\
+Aalto University\\%
+Department of Computer Science\\ Espoo, Finland\\
+%
+\url{https://asael697.github.io}\\%
+%
+\href{mailto:izhar.alonzomatamoros@aalto.fi}{\nolinkurl{izhar.alonzomatamoros@aalto.fi}}%
+}
+
+\address{%
+Alicia Nieto-Reyes\\
+Universidad de Cantabria\\%
+Departamento de Matemáticas, Estadística y Computación\\ Avd. de los Castros s/n.~39005 Santander, Spain\\
+%
+\url{https://orcid.org/0000-0002-0268-3322}\\%
+%
+\href{mailto:alicia.nieto@unican.es}{\nolinkurl{alicia.nieto@unican.es}}%
+}
+
+\address{%
+Claudio Agostinelli\\
+University of Trento\\%
+Department of Mathematics\\ Via Sommarive, 14 - 38123 Povo\\
+%
+\url{https://orcid.org/0000-0001-6702-4312}\\%
+%
+\href{mailto:claudio.agostinelli@unitn.it}{\nolinkurl{claudio.agostinelli@unitn.it}}%
+}
diff --git a/_articles/RJ-2024-008/scripts/runtime.R b/_articles/RJ-2024-008/scripts/runtime.R
new file mode 100644
index 0000000000..4877c86237
--- /dev/null
+++ b/_articles/RJ-2024-008/scripts/runtime.R
@@ -0,0 +1,60 @@
+#################################################################
+# runtime simulations
+#################################################################
+
+library(nortsTest)
+
+lobato = c(nortsTest:::rejection_rate(n = 1000, htest = "lobato", seed = 1975)[2],
+           nortsTest:::rejection_rate(n = 2000, htest = "lobato", seed = 1975)[2],
+           nortsTest:::rejection_rate(n = 3000, htest = "lobato", seed = 1975)[2],
+           nortsTest:::rejection_rate(n = 4000, htest = "lobato", seed = 
1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "lobato", seed = 1975)[2])/1000 + +epps = c(nortsTest:::rejection_rate(n = 1000, htest = "epps", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "epps", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "epps", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "epps", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "epps", seed = 1975)[2])/1000 + +rp = c(nortsTest:::rejection_rate(n = 1000, htest = "rp", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "rp", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "rp", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "rp", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "rp", seed = 1975)[2])/1000 + +elbouch = c(nortsTest:::rejection_rate(n = 1000, htest = "elbouch", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "elbouch", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "elbouch", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "elbouch", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "elbouch", seed = 1975)[2])/1000 + +vavra = c(nortsTest:::rejection_rate(n = 1000, htest = "vavra", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "vavra", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "vavra", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "vavra", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "vavra", seed = 1975)[2])/1000 + +lobato_sb = c(nortsTest:::rejection_rate(n = 1000, htest = "lobato_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "lobato_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "lobato_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "lobato_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = 
"lobato_bootstrap", seed = 1975)[2])/1000 + +epps_sb = c(nortsTest:::rejection_rate(n = 1000, htest = "epps_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 2000, htest = "epps_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 3000, htest = "epps_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 4000, htest = "epps_bootstrap", seed = 1975)[2], + nortsTest:::rejection_rate(n = 5000, htest = "epps_bootstrap", seed = 1975)[2])/1000 + +runtime = matrix(c(lobato, epps, rp, elbouch, vavra, lobato_sb, epps_sb), + nrow = 7, ncol = 5, byrow = TRUE) + +tests = c("Lobato and Velasco", "Epps", "Random Projections","El Bouch", + "Psaradakis and Vavra", "Bootstrap Lobato", "Bootstrap Epps") + +runtime = as.data.frame(runtime) +colnames(runtime) = paste("n =",c(1000,2000,3000,4000,5000)) +runtime = cbind(tests, runtime) + +rm(lobato, epps, rp, elbouch, vavra, lobato_sb, epps_sb,tests) +save.image("~/Documents/nortsTest_paper/data/runtime.Rdata") diff --git a/_articles/RJ-2024-008/scripts/simulations.R b/_articles/RJ-2024-008/scripts/simulations.R new file mode 100644 index 0000000000..e37f1f6628 --- /dev/null +++ b/_articles/RJ-2024-008/scripts/simulations.R @@ -0,0 +1,124 @@ +################################################################# +# Simulations and Data Analysis +################################################################# + +library(nortsTest) + +## subsection: Numerical experiments + +lobato100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "lobato") +epps100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "epps") +rp100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "rp",k = 1) +vavra100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "vavra") +elbouch100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "elbouch") +l_sb100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = "lobato_bootstrap") +e_sb100 = nortsTest:::rejection_table(reps = 1000,n = 100,htest = 
"epps_bootstrap") + +rr100 = rbind(lobato100,epps100) +rr100 = rbind(rr100,rp100) +rr100 = rbind(rr100,vavra100) +rr100 = rbind(rr100, l_sb100) +rr100 = rbind(rr100, e_sb100) +rr100 = rbind(rr100, elbouch100) + +row.names(rr100) = NULL +colnames(rr100) = c("-0.4","-0.25","0.0","0.25","0.4","max time") +rr100 = data.frame(rr100) + +rr100$distribution = rep(c("N","logN","t3","chisq10","Gamma(7,1)"),7) +rr100$test = c(rep("Lobato and Velasco",5),rep("Epps",5),rep("Random projection k = 1",5), + rep("Psaradakis and Vavra",5),rep("Bootstrap Lobato",5),rep("Bootstrap Epps",5), + rep("El bouch",5)) + +rrr100 = rr100[,c(8,7,1:6)] + +################################ Rejection Rates m = 250 ################################# + +lobato100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "lobato") +epps100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "epps") +rp100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "rp",k = 1) +vavra100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "vavra") +elbouch100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "elbouch") +l_sb100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "lobato_bootstrap") +e_sb100 = nortsTest:::rejection_table(reps = 1000,n = 250,htest = "epps_bootstrap") + +rr100 = rbind(lobato100,epps100) +rr100 = rbind(rr100,rp100) +rr100 = rbind(rr100,vavra100) +rr100 = rbind(rr100, l_sb100) +rr100 = rbind(rr100, e_sb100) +rr100 = rbind(rr100, elbouch100) + +row.names(rr100) = NULL +colnames(rr100) = c("-0.4","-0.25","0.0","0.25","0.4","max time") +rr100 = data.frame(rr100) + +rr100$distribution = rep(c("N","logN","t3","chisq10","Gamma(7,1)"),7) +rr100$test = c(rep("Lobato and Velasco",5),rep("Epps",5),rep("Random projection k = 1",5), + rep("Psaradakis and Vavra",5),rep("Bootstrap Lobato",5),rep("Bootstrap Epps",5), + rep("El bouch",5)) + +rrr250 = rr100[,c(8,7,1:6)] + + +################################ Rejection Rates m = 500 ################################ + 
+lobato100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "lobato") +epps100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "epps") +rp100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "rp",k = 1) +vavra100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "vavra") +elbouch100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "elbouch") +l_sb100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "lobato_bootstrap") +e_sb100 = nortsTest:::rejection_table(reps = 1000,n = 500,htest = "epps_bootstrap") + +rr100 = rbind(lobato100,epps100) +rr100 = rbind(rr100,rp100) +rr100 = rbind(rr100,vavra100) +rr100 = rbind(rr100, l_sb100) +rr100 = rbind(rr100, e_sb100) +rr100 = rbind(rr100, elbouch100) + +row.names(rr100) = NULL +colnames(rr100) = c("-0.4","-0.25","0.0","0.25","0.4","max time") +rr100 = data.frame(rr100) + +rr100$distribution = rep(c("N","logN","t3","chisq10","Gamma(7,1)"),7) +rr100$test = c(rep("Lobato and Velasco",5),rep("Epps",5),rep("Random projection k = 1",5), + rep("Psaradakis and Vavra",5),rep("Bootstrap Lobato",5),rep("Bootstrap Epps",5), + rep("El bouch",5)) + +rrr500 = rr100[,c(8,7,1:6)] + +################################ Rejection Rates m = 1000 ################################## + +lobato100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "lobato") +epps100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "epps") +rp100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "rp",k = 1) +vavra100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "vavra") +elbouch100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "elbouch") +l_sb100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "lobato_bootstrap") +e_sb100 = nortsTest:::rejection_table(reps = 1000,n = 1000,htest = "epps_bootstrap") + +rr100 = rbind(lobato100,epps100) +rr100 = rbind(rr100,rp100) +rr100 = rbind(rr100,vavra100) +rr100 = rbind(rr100, l_sb100) +rr100 = rbind(rr100, e_sb100) 
+rr100 = rbind(rr100, elbouch100) + +row.names(rr100) = NULL +colnames(rr100) = c("-0.4","-0.25","0.0","0.25","0.4","max time") +rr100 = data.frame(rr100) + +rr100$distribution = rep(c("N","logN","t3","chisq10","Gamma(7,1)"),7) +rr100$test = c(rep("Lobato and Velasco",5),rep("Epps",5),rep("Random projection k = 1",5), + rep("Psaradakis and Vavra",5),rep("Bootstrap Lobato",5),rep("Bootstrap Epps",5), + rep("El bouch",5)) + +rrr1000 = rr100[,c(8,7,1:6)] + +results1 = cbind(rrr100,rrr250[,3:8]) +results2 = cbind(rrr500,rrr1000[,3:8]) + +rm(lobato100, epps100, rp100, vavra100, elbouch100, l_sb100, e_sb100) +save.image("~/Documents/nortsTest_paper/data/r_sim.Rdata") diff --git a/_articles/RJ-2024-009/RJ-2024-009.R b/_articles/RJ-2024-009/RJ-2024-009.R new file mode 100644 index 0000000000..f9d7bee53f --- /dev/null +++ b/_articles/RJ-2024-009/RJ-2024-009.R @@ -0,0 +1,149 @@ +# Generated by `rjournal_pdf_article()` using `knitr::purl()`: do not edit by hand +# Please edit RJ-2024-009.Rmd to modify this file + +## ----eval = FALSE, echo = FALSE----------------------------------------------- +# # to be added to the yaml header when compiling to latex +# output: +# rjtools::rjournal_pdf_article: +# self_contained: yes +# toc: no + + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, comment = NA) + +library(shinymgr) +library(fs) +library(rjtools) +library(dplyr) + + +## ----eval = FALSE------------------------------------------------------------- +# include=knitr::is_latex_output(), eval=knitr::is_latex_output() + + +## ----fig1, echo = F, out.width = "100%", fig.cap = "Stages of a reproducible workflow, a process that moves an inquiry from raw data to insightful contribution."---- +knitr::include_graphics('images/figure1.png') + + +## ----fig2, echo = F, out.width = "100%", fig.cap = "Top: The \"iris\\_explorer\" app guides a user through an analysis of the iris dataset in 
a tab-based sequence. Bottom: A blueprint of the \"iris\\_explorer\" app shows the 5 tabs, each containing a single module identified by name within blue ovals. Some of the shiny modules require inputs and generate outputs as identified in gray polygons.", fig.pos = "h"---- +knitr::include_graphics('images/figure2.png') + + +## ----eval = FALSE, echo =TRUE------------------------------------------------- +# install.packages("shinymgr") + + +## ----eval = FALSE, echo =TRUE------------------------------------------------- +# remotes::install_gitlab( +# repo = "vtcfwru/shinymgr", +# auth_token = Sys.getenv("GITLAB_PAT"), +# host = "code.usgs.gov", +# build_vignettes = FALSE) + + +## ----warning=FALSE, eval = FALSE, echo = TRUE--------------------------------- +# # set the directory path that will house the shinymgr project +# parentPath <- getwd() +# +# # set up raw directories and fresh database +# shinymgr_setup( +# parentPath = parentPath, +# demo = TRUE) + + +## ----comment = NA, echo = FALSE----------------------------------------------- +fs::dir_tree( + path = "shinymgr", + recurse = TRUE) + + +## ----eval = FALSE, echo = TRUE------------------------------------------------ +# # launch shinymgr +# launch_shinymgr(shinyMgrPath = paste0(parentPath, "/shinymgr")) + + +## ----fig3, echo = F, out.width = "100%", fig.cap = "The shinymgr Developer Portal consists of a sidebar panel where developers can create new shiny modules and new apps, and test-drive analyses and reports from the user's perspective. The main panel shows the \"Build App\" tab within the \"Developer Tools\" section."---- +knitr::include_graphics('images/figure3.png') + + +## ----fig4, echo = FALSE, out.width = "100%", fig.cap = "The 11 tables of the shinymgr SQLite database. Lines indicate how the tables are related to each other.", fig.pos = "b"---- +knitr::include_graphics('images/figure4.jpg') + + +#> #!! ModName = iris_cluster +#> #!! ModDisplayName = Iris K-Means Clustering +#> #!! 
ModDescription = Clusters iris data based on 2 attributes +#> #!! ModCitation = Baggins, Bilbo. (2023). iris_cluster. [Source code]. +#> #!! ModNotes = Demo module for the shinymgr package. +#> #!! ModActive = 1 +#> #!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame + +## ----fig5, echo = FALSE, out.width = "100%", fig.cap= "The shinymgr Developer Portal layout, showing the app builder in the Developer Tools.", fig.pos = "h"---- + +knitr::include_graphics('images/figure5.png') + + +## ----echo = TRUE-------------------------------------------------------------- +# look at the appTabs table in the database +qry_app_flow("iris_explorer", shinyMgrPath = paste0(getwd(),"/shinymgr")) + + +## ----eval = TRUE, echo = FALSE------------------------------------------------ +fs::dir_tree( + path = "shinymgr", + recurse = FALSE, + regexp = '(modules)|(global)|(data$)|(reports)|(server)|(ui)|(www)' +) + + +## ----fig6, echo = FALSE, out.width = "100%", fig.cap= "An example of a deployed shinymgr app. The deployed version excludes the Developers Tools tab and is an example of what the end user sees when using a deployed app."---- + +knitr::include_graphics('images/figure6.png') + + + +## ----eval = TRUE, echo = TRUE------------------------------------------------- +rds_filepath <- paste0(getwd(),"/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS") +old_analysis <- readRDS(rds_filepath) +str(old_analysis, max.level = 2, nchar.max = 20, vec.len = 15) + + +## ----echo = TRUE, eval = FALSE------------------------------------------------ +# rerun_analysis(analysis_path = rds_filepath) + + +## ----fig7, echo = F, out.width = "100%", fig.cap = "A screenshot of the rerun\\_analysis() function, as called on the saved analysis from the iris\\_explorer app (RDS file). The active tab, called \"The App\", allows a user to rerun a previously executed analysis. 
The \"Analysis Summary\" tab displays the values of all module arguments and returns, captured when the analysis was saved, along with a detailed description of the app, its modules, the App's source code, and all package dependencies."---- +knitr::include_graphics('images/figure7.png') + + +## ----comment = '', echo=FALSE, class.output='r'------------------------------- +rmd_filepath <- paste0(getwd(),"/shinymgr/reports/iris_explorer/iris_explorer_report.Rmd") +cat(paste(readLines(rmd_filepath), collapse = '\n')) + + +## ----warning = FALSE---------------------------------------------------------- +learnr::available_tutorials( + package = "shinymgr") %>% + dplyr::arrange(title) + + + +## ----eval = FALSE, echo = TRUE------------------------------------------------ +# learnr::run_tutorial( +# name = "modules", +# package = "shinymgr") + + +## ----eval = FALSE, echo = TRUE------------------------------------------------ +# browseURL(paste0(find.package("shinymgr"), "/extdata/shinymgr_cheatsheet.pdf")) + + +## ----fig8, echo = F, out.width = "85%", fig.cap = "Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules. The database consists of 11 tables. Primary keys are referenced with a \"pk\" prefix, while foreign keys are referenced with an \"fk\" prefix. 
A full description of the database is contained in the \"database\" learnr tutorial that comes with the shinymgr package.", fig.pos = 'h'---- +knitr::include_graphics('images/figure8.png') + + +## ----comment = '', echo=FALSE, class.output='r'------------------------------- +rmd_filepath <- paste0(getwd(),"/shinymgr/modules/iris_cluster.R") +cat(paste(readLines(rmd_filepath), collapse = '\n')) + diff --git a/_articles/RJ-2024-009/RJ-2024-009.Rmd b/_articles/RJ-2024-009/RJ-2024-009.Rmd new file mode 100644 index 0000000000..911bde9002 --- /dev/null +++ b/_articles/RJ-2024-009/RJ-2024-009.Rmd @@ -0,0 +1,414 @@ +--- +title: 'shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules + into Reproducible Workflows' +draft: no +URL: https://code.usgs.gov/vtcfwru/shinymgr +BugReports: https://code.usgs.gov/vtcfwru/shinymgr/issues +LazyData: yes +RoxygenNote: 7.2.1 +type: package +disclaimer: This draft manuscript is distributed solely for purposes of scientific + peer review. Its content is deliberative and predecisional, so it must not be disclosed + or released by reviewers. Because the manuscript has not yet been approved for publication + by the U.S. Geological Survey (USGS), it does not represent any official USGS finding + or policy. +bibliography: shinymgrFinal.bib +abstract: | + The R package shinymgr provides a unifying framework that allows Shiny developers to create, manage, and deploy a master Shiny application comprised of one or more "apps", where an "app" is a tab-based workflow that guides end-users through a step-by-step analysis. Each tab in a given "app" consists of one or more Shiny modules. The shinymgr app builder allows developers to "stitch" Shiny modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. Apps developed using shinymgr can be incorporated into R packages or deployed on a server, where they are accessible to end-users. 
Users of shinymgr apps can save analyses as an RDS file that fully reproduces the analytic steps and can be ingested into an RMarkdown or Quarto report for rapid reporting. In short, developers use the shinymgr framework to write Shiny modules and seamlessly combine them into Shiny apps, and end-users of these apps can execute reproducible analyses that can be incorporated into reports for rapid dissemination. A comprehensive overview of the package is provided by 12 learnr tutorials. +author: +- name: Laurence A. Clarfeld + affiliation: Vermont Cooperative Fish and Wildlife Research Unit + address: + - 302 Aiken Center, University of Vermont + - Burlington, VT 05405 USA + orcid: 0000-0002-3927-9411 + email: laurence.clarfeld@uvm.edu +- name: Caroline Tang + email: 17ct24@queensu.ca + orcid: 0000-0001-7966-5854 + affiliation: Queen's University + address: + - Biology Department + - 116 Barrie St, Kingston, ON K7L 3N6 +- name: Therese Donovan + email: tdonovan@uvm.edu + affiliation: U.S. Geological Survey, Vermont Cooperative Fish and Wildlife Research + Unit + address: + - 302 Aiken Center, University of Vermont + - Burlington, VT 05405 USA + orcid: 0000-0001-8124-9251 +output: + rjtools::rjournal_web_article: + self_contained: yes + toc: no + rjtools::rjournal_pdf_article: + toc: no +date: '2025-01-10' +date_received: '2022-11-03' +volume: 16 +issue: 1 +slug: RJ-2024-009 +journal: + lastpage: 174 + firstpage: 157 + +--- + + +```{r, eval = FALSE, echo = FALSE} +# to be added to the yaml header when compiling to latex +output: + rjtools::rjournal_pdf_article: + self_contained: yes + toc: no +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE, comment = NA) + +library(shinymgr) +library(fs) +library(rjtools) +library(dplyr) +``` + + +# Introduction {#intro} + +The \CRANpkg{shiny} R package allows users to build interactive web apps straight from R, without advanced knowledge of HTML or JavaScript [@shiny]. 
A *shiny* web app can permit an expedient analysis pipeline or workflow. Ideally, the pipeline can produce outputs that are fully reproducible [@Peng; @Gentleman; @Alston]. Moreover, the pipeline can permit rapid reporting to convey the results of an analysis workflow to a target audience [@stoudt2021principles] (Figure \@ref(fig:fig1)). + +```{r, eval = FALSE} +include=knitr::is_latex_output(), eval=knitr::is_latex_output() +``` + +```{r fig1, echo = F, out.width = "100%", fig.cap = "Stages of a reproducible workflow, a process that moves an inquiry from raw data to insightful contribution."} +knitr::include_graphics('images/figure1.png') +``` + +*shiny* applications range from simple to complex, each with an intended purpose developed for an intended user audience. Several R packages provide a development framework for building multi-faceted master applications, including \CRANpkg{shinipsum} for prototyping [@shinipsum], \CRANpkg{golem} [@golum], and \CRANpkg{rhino} [@rhino]. + +From the developer's perspective, complex *shiny* applications can result in many lines of code, creating challenges for collaborating, debugging, streamlining, and maintaining the overall product. *shiny* modules are a solution to this problem. As stated by Winston Chang [@shinyblog], "A *shiny* module is a piece of a *shiny* app. It can't be directly run, as a *shiny* app can. Instead, it is included as part of a larger app . . . Once created, a *shiny* module can be easily reused – whether across different apps, or multiple times in a single app." *shiny* modules, and modularization in general, are a core element of agile software development practices [@larman2004agile]. Several authors have contributed R packages for distributing pre-written *shiny* modules for general use, including the \CRANpkg{datamods} [@datamods], \CRANpkg{shiny.reglog} [@reglog], \CRANpkg{periscope} [@periscope], \CRANpkg{shinyauthr} [@shinyauthr], and \CRANpkg{jsmodule} [@jsmodule] packages. 
+ +However, as the number of available modules increases, there is a pressing need for documenting available *shiny* modules and easily incorporating them into new workflows. For example, consider a toy modular-based app that guides a user through an analysis of the famous "Iris Dataset," which contains 150 records of 3 species of iris, including measurements of the length and width of the flowers' sepals and petals [@fisher1936use]. The app, called "Iris Explorer," consists of 5 tabs to be worked through in sequence (Figure \@ref(fig:fig2), top). + +Tab 1 displays instructions for use, while tab 2 performs a *k*-means clustering of the data, where *k* is specified by the user. The resulting clusters are displayed with two variables of the user's choosing as depicted in Figure \@ref(fig:fig2). In tab 3, the user will choose a value *n*, indicating the number of rows by which to randomly subset the data, and in tab 4 the user selects a single variable to be plotted as a bar chart. Finally, in tab 5 the user can save their outputs as an RDS file. This contrived example includes some key elements of a typical workflow in that the five tabs introduce a dataset, guide the user through light data wrangling, produce analysis outputs, and offer the ability to save the results. + +The app's blueprint (Figure \@ref(fig:fig2), bottom) identifies the *shiny* modules in each tab, showing how outputs from one module can serve as inputs to the next. Note that while this example shows a single module in each tab with differing inputs/outputs, in the general case tabs can contain an arbitrary number of *shiny* modules (including multiple instances of the same module) and each module can have multiple inputs/outputs. + +While two of the *shiny* modules within the "iris_explorer" app pertain to the iris dataset specifically ("iris_intro" and "iris_cluster"), the remaining *shiny* modules ("subset_rows", "single_column_plot", and "save") may be incorporated into other apps. 
+ +```{r fig2, echo = F, out.width = "100%", fig.cap = "Top: The 'iris\\_explorer' app guides a user through an analysis of the iris dataset in a tab-based sequence. Bottom: A blueprint of the 'iris\\_explorer' app shows the 5 tabs, each containing a single module identified by name within blue ovals. Some of the shiny modules require inputs and generate outputs as identified in gray polygons.", fig.pos = "h"} +knitr::include_graphics('images/figure2.png') +``` + +\newpage + +Developers who utilize the same *shiny* modules within different apps will naturally be faced with several questions: + +1. Which *shiny* modules have been written? Are they well documented with unit testing? +2. What are the module's inputs (arguments) and outputs (returns)? +3. Where are the *shiny* modules stored? +4. How can *shiny* modules be combined into a cohesive, well-documented app? +5. How can production-ready apps be deployed for end-users? + +Users of an app created with the *shinymgr* framework may wish to know: + +6. Can analysis outputs be saved as a fully reproducible workflow? +7. Can outputs be ingested into a *Rmarkdown* or *Quarto* template for rapid reporting? + +## Introducing *shinymgr* + +The R package, \CRANpkg{shinymgr}, was developed to meet these challenges [@shinymgr_citation]. The *shinymgr* package includes a general framework that allows developers to create *shiny* modules, stitch them together as individual "apps" that are embedded within the master *shiny* application, and then deploy them on a *shiny* server or incorporate them into R packages. *shinymgr* was motivated from our first-hand experience in our work building tools that assist scientists in remote wildlife monitoring with the R package *AMMonitor* [@balantic2020ammonitor]. 
Dependencies of *shinymgr* include the packages \CRANpkg{DBI} [@dbi], \CRANpkg{reactable} [@reactable], \CRANpkg{RSQLite} [@RSQLite], \CRANpkg{renv} [@renv], \CRANpkg{shiny} [@shiny], \CRANpkg{shinyjs} [@shinyjs], and \CRANpkg{shinydashboard} [@shinydashboard]. + +From the developer's perspective, an "app" consists of an ordered set of tabs, each of which contains specified *shiny* modules. *shiny* modules are the basic element in the *shinymgr* framework; they can be used and re-used across different tabs and different apps. Information about each module and app is stored in a SQLite database [@sqlite2020hipp]. The *shinymgr* app builder "stitches" *shiny* modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. When apps are production-ready, developers can deploy a stand-alone *shiny* application independent of *shinymgr* on a server or within an R package. From the end-user's perspective, an "app" created with the *shinymgr* framework consists of an ordered series of *shiny* tabs, establishing an analysis. Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Furthermore, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting. We are unaware of existing packages that unify the elements of modularization, documentation, reproducibility, and reporting in a single framework. + +We introduce *shinymgr* in sections 2-4 below. In section [2](#appdev) we describe how developers can create apps using the *shinymgr* framework. In section [3](#appdeploy) we describe how developers can deploy a *shinymgr* project on a local machine, server, or within an R package. Section [4](#appUsing) describes the end-user experience, where end-users execute an "app" and store results for reproducibility and reporting. The package tutorials and cheat sheet are described in section [5](#tuts). 
The *shinymgr* package comes with a series of \CRANpkg{learnr} [@learnr] tutorials described at the end of the paper. + +# Developing *shinymgr* apps {#appdev} + +## Setting up *shinymgr* + +The canonical home of *shinymgr* is https://code.usgs.gov/vtcfwru/shinymgr/ where *shinymgr* users may post merge requests and bug fix requests. *shinymgr* may also be downloaded from CRAN. + +```{r, eval = FALSE, echo =TRUE} +install.packages("shinymgr") +``` + +The development version can be downloaded with: + +```{r, eval = FALSE, echo =TRUE} +remotes::install_gitlab( + repo = "vtcfwru/shinymgr", + auth_token = Sys.getenv("GITLAB_PAT"), + host = "code.usgs.gov", + build_vignettes = FALSE) +``` + +Once installed, a new *shinymgr* project can be created within a parent directory: + +```{r, warning=FALSE, eval = FALSE, echo = TRUE} +# set the directory path that will house the shinymgr project +parentPath <- getwd() + +# set up raw directories and fresh database +shinymgr_setup( + parentPath = parentPath, + demo = TRUE) +``` + +The `shinymgr_setup()` function produces the following directory structure within the primary "shinymgr" directory. This structure consists of 3 files that make up the "master" app (global.R, server.R, and ui.R), and 9 directories. If the argument demo is set to FALSE, these directories will be largely empty, except for the "modules_mgr" and "database" directories, which will contain *shiny* modules for rendering *shinymgr*'s UI and an empty SQLite database, respectively. If the argument demo is set to TRUE, each directory will include several demo files as shown, including a pre-populated database. Here, we highlight a subset of the demo files related to the "iris_explorer" app to guide developers through the key elements of *shinymgr* (additional demo files come with package but are omitted here for clarity). 
+ +```{r, comment = NA, echo = FALSE} +fs::dir_tree( + path = "shinymgr", + recurse = TRUE) +``` + + +The directory structure produced by `shinymgr_setup()` includes the following: + +- The **analyses** directory provides the developer an example of a previously run analysis that was created using the *shinymgr* framework (an RDS file). An analysis file name includes the app name (e.g. "iris_explorer"), the name of the person who ran the analysis (e.g. "Gandalf"), and the date and time of the analysis (e.g., "iris_explorer_Gandalf_2023_06_05_16_30.RDS"). + +- The **data** directory stores RData files that can be used by various *shinymgr* apps (e.g., "iris.RData"). + +- The **database** directory stores the *shinymgr* SQLite database, named "shinymgr.sqlite." The database is used by the developer to track all *shiny* modules, their arguments (inputs), returns (outputs), and how they are combined into *shinymgr* apps. + +- The **modules** directory stores stand-alone *shiny* modules. These files are largely written by the developer with the help of the `mod_init()` function, and are registered in the database with the `mod_register()` function. Four of the example *shiny* modules listed are used in the "iris_explorer" app. + +- The **modules_app** directory stores *shiny* modules that are *shinymgr* "apps" – the stitching together of *shiny* modules into a tab-based layout that provides an analysis workflow (Figure \@ref(fig:fig2) shows the +"iris_explorer" app layout). Files within the "modules_app" directory are not written by hand - instead, they are created with the *shinymgr* "app builder." + +- The **modules_mgr** directory stores *shiny* modules that build the overall *shinymgr* framework. + +- The **reports** directory provides an example of an *RMarkdown* (Rmd) template (e.g., "iris_explorer_report.Rmd"), allowing for rapid reporting by an end-user. 
+ +- The **tests** directory stores both \CRANpkg{testthat} [@testthat] and \CRANpkg{shinytest} [@shinytest] code testing scripts. + +- The **www** directory stores images that may be used by a *shiny* app. + +- In addition to these directories, three files are created for launching the master *shinymgr* *shiny* application: + + 1. **ui.R** - This file contains code to set the user interface for the master *shinymgr* app. + 2. **server.R** - The master server file. + 3. **global.R** - The global.R file is sourced into the server.R file at start-up. It sources all of the *shiny* modules within the *shinymgr* framework so they are available when *shinymgr* is launched. + +## The *shinymgr* developer's portal + +Once set-up is complete, the `launch_shinymgr()` function will launch the *shinymgr* "Developer's Portal" UI, allowing developers to create and test new *shinymgr* apps. + +```{r, eval = FALSE, echo = TRUE} +# launch shinymgr +launch_shinymgr(shinyMgrPath = paste0(parentPath, "/shinymgr")) +``` + +The portal is recognizable by the *shinymgr* logo in the upper left corner (Figure \@ref(fig:fig3)). The portal consists of three main tabs in the left menu. The "Developer Tools" tab is used to create apps, view the *shinymgr* database, and register reports, while the "Analysis (beta)" and "Reports (beta)" tabs allow developers to evaluate apps from the user's perspective. + +```{r fig3, echo = F, out.width = "100%", fig.cap = "The shinymgr Developer Portal consists of a sidebar panel where developers can create new shiny modules and new apps, and test-drive analyses and reports from the user's perspective. 
The main panel shows the 'Build App' tab within the 'Developer Tools' section."} +knitr::include_graphics('images/figure3.png') +``` + +The "Developer Tools" section includes 4 tabs for app development: The "Build App" tab allows the developer to create new *shinymgr* apps from existing modules using the *shinymgr* app builder; the "Database" tab displays the *shinymgr* database tables, the "Queries" tab contains a set of standard database queries, and the "Add Reports" tab allows the developer to link a report (Rmd or qmd) to a given *shinymgr* app (Figure \@ref(fig:fig3)), as described below. + +## The *shinymgr* database + +The *shinymgr* SQLite database ("shinymgr.sqlite") is a single file created by the `shinymgr_setup()` function. The database tracks all *shiny* modules, their arguments (inputs), returns (outputs), their package dependencies and version numbers, how they are combined into an "app," and any reports that are associated with apps. The database tables are populated via dedicated *shinymgr* functions. + +The *shinymgr* database consists of 11 tables in total (Figure \@ref(fig:fig4)). These tables are connected to each other as a typical relational database, with primary keys establishing unique records in each table, and foreign keys that reference primary keys in other tables (see Appendix A for a full database schema and the "database" *learnr* tutorial for additional information). + +The "apps," "appReports," "reports," "appTabs," and "tabs" tables largely store information on what a user would see when they run an analysis. The table "apps" stores information about apps such as "iris_explorer." Apps consist of tabs, which are listed in the "tabs" table. Tabs are linked to apps via the "appTabs" table. The table "reports" lists any Rmd or qmd files that serve as a report template, and the table "appReports" links a specific report with a specific app. 
+ +The remaining 6 tables in Figure \@ref(fig:fig4) are "modules," "modFunctionArguments," "modFunctionReturns," "modPackages," "tabModules," and "appStitching." These tables largely store information about *shiny* modules that a developer creates, i.e., what *shiny* modules have been written, what are their arguments and returns, and what packages they use. The "tabModules" table identifies which tabs call which *shiny* modules (with a single tab capable of calling multiple *shiny* modules), and the "appStitching" table specifies how *shiny* modules are "stitched" together, i.e., which module returns are passed in as arguments to downstream *shiny* modules. + +```{r fig4, echo = FALSE, out.width = "100%", fig.cap = "The 11 tables of the shinymgr SQLite database. Lines indicate how the tables are related to each other.", fig.pos = "b"} +knitr::include_graphics('images/figure4.jpg') +``` + +Four of the 11 database tables focus on modules, highlighting that *shiny* modules are basic building blocks of any *shinymgr* app. Developers create new *shiny* modules with the `mod_init()` function, which copies a *shinymgr* module template (an R file template) that includes a header with key-value that describe the module, including the module name, display name, description, citation, notes, and module arguments and returns (if any). For example, the header of the iris_cluster module is: + +````{verbatim, echo = TRUE} +#!! ModName = iris_cluster +#!! ModDisplayName = Iris K-Means Clustering +#!! ModDescription = Clusters iris data based on 2 attributes +#!! ModCitation = Baggins, Bilbo. (2023). iris_cluster. [Source code]. +#!! ModNotes = Demo module for the shinymgr package. +#!! ModActive = 1 +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame +```` + +The module code is written beneath the header (see Appendix B for an example). 
Function calls within the module code should be written with `package::function()` notation, making explicit any R package dependencies. Once the module is completed, unit tests can be written and stored in the *shinymgr* project's "tests" directory. The final module file is saved to the "modules" directory and registered into the database with the `mod_register()` function. The `mod_register()` function populates the modules, "modFunctionArguments", and "modFunctionReturns" SQLite database tables. Further, it uses the *renv* package to identify any package dependencies and inserts them into the modPackages table. Readers are referred to the "modules", "tests", and "shinymgr_modules" *learnr* tutorials that come with the *shinymgr* package for more details. + +Once modules are registered in the database, the developer can incorporate them into new apps. As *shiny* modules and apps in the database represent files that contain their scripts, deleting a module or an app from the database will delete all downstream database entries as well as (optionally) the actual files themselves. Deletion of a module will fail if it is being used in other apps. Module updates can be versioned by creating a new module and then referencing its precursor in the "modules" database table. + +## The *shinymgr* app builder + +Once developers create and register their own stand-alone *shiny* modules, apps are generated with *shinymgr*'s app builder (Figure \@ref(fig:fig5)). + +```{r fig5, echo = FALSE, out.width = "100%", fig.cap= "The shinymgr Developer Portal layout, showing the app builder in the Developer Tools.", fig.pos = "h"} + +knitr::include_graphics('images/figure5.png') +``` + +Developers are guided through a process where they design their app from *shiny* modules they have registered. The builder then populates the *shinymgr* database with instructions on how to construct the app and writes the app's script based on those instructions. 
The newly created script is saved to the "modules_app" directory. Through this structured process, apps produced by the builder are well-documented and generate highly reproducible analyses. Readers are encouraged to peruse the tutorial, "apps", for more information. + +The `qry_app_flow()` function will query the database to return a list of the *shiny* modules and tabs included in a specified app, such as "iris_explorer": + +```{r, echo = TRUE} +# look at the appTabs table in the database +qry_app_flow("iris_explorer", shinyMgrPath = paste0(getwd(),"/shinymgr")) +``` + +As shown in Figure \@ref(fig:fig2), this app has 5 tabs, and each tab features a single module. The "Save" tab is the final tab in all *shinymgr* apps and is not listed in the query result. + +Developers can "beta test" apps prior to deployment by selecting the Analysis (beta) tab in the Developer's Portal (Figure \@ref(fig:fig3)). They can also create *RMarkdown* or *Quarto* report templates that accept the outputs from an analysis and incorporate them into a report. Report metadata are logged in the "reports" table of the database, and then linked with a specific app in the "appReports" table. An end-user will run an analysis and render a report, a process described more fully in the "Using *shinymgr* Apps" section below. + +To summarize this section, developers use the `shinymgr_setup()` function to create the directory structure and underlying database needed to build and run *shiny* apps with *shinymgr*. Developers use the `mod_init()` and `mod_register()` functions to create modules and make them available for inclusion in new apps built with the *shinymgr* app builder. A developer can create as many *shinymgr* projects as needed. 
In each case, the *shinymgr* project is simply a fixed directory structure with three R files (ui.R, server.R, and global.R), and a series of subdirectories that contain the apps and *shiny* modules created by the developer, along with a database for tracking everything. + +# Deploying *shinymgr* projects {#appdeploy} + +Once development is completed, developers can deploy their *shinymgr* project on a server or within an R package by copying portions of the *shinymgr* project to a new location while retaining the original project for future development. Once deployed, a *shinymgr* project no longer requires the *shinymgr* package or database to be run. Thus, the files and directories to be copied for deployment include only: + +```{r, eval = TRUE, echo = FALSE} +fs::dir_tree( + path = "shinymgr", + recurse = FALSE, + regexp = '(modules)|(global)|(data$)|(reports)|(server)|(ui)|(www)' +) +``` + +The master app files, ui.R, global.R, and server.R, are needed to run the *shinymgr* framework. + +When deploying a *shinymgr* project within an R package, objects within the data folder should be copied into the package's "data" folder. The remaining files should be copied into a directory within the package's "inst" folder that will house the master *shiny* application. Deployment on a server such as shinyapps.io will require similar adjustments. + +After files are copied to the correct location, a few key adjustments are needed. First, the "modules_app" directory should contain only those apps (and dependent modules and reports) that can be used by end-users; unused apps, modules, and reports can be deleted. Second, the new.analysis.R script within the modules_mgr folder will require minor updates to remove dependencies on the *shinymgr* database. Third, the ui.R and server.R scripts should be updated to no longer showcase *shinymgr* and the Developer's Portal; rather, it should be customized by the developer to create their own purpose-driven apps. 
For example, Figure \@ref(fig:fig6) shows a hypothetical deployment of the master app titled "Deployed Project" that is based on the *shinymgr* framework. Notice the absence of the Developer Tools tab and the absence of references to *shinymgr*. The "deployment" *learnr* tutorial provides more in-depth discussion. + +```{r fig6, echo = FALSE, out.width = "100%", fig.cap= "An example of a deployed shinymgr app. The deployed version excludes the Developers Tools tab and is an example of what the end user sees when using a deployed app."} + +knitr::include_graphics('images/figure6.png') + +``` +To summarize this section, deploying the *shinymgr* framework involves copying key elements of the *shinymgr* developer project into package or server directories, updated as needed for use by end-users. Readers are referred to the “deployment” tutorial for further information. + +# Using *shinymgr* apps {#appUsing} + +Apps built with *shinymgr* can appeal to various types of end-users. When deployed as part of an R package, end-users would be anyone who uses that package. Apps may also be distributed as stand-alone scripts, or hosted on a server, as described above. Developers may also use *shinymgr* to produce apps for their own use (i.e., the developer *is* the end-user). Regardless of who the intended end-user is, this section discusses that user's experience after the master app is deployed. + +Whoever the intended audience for the app, this section discusses how an app can be used *after* it has been deployed. + +## Reproducible analyses + +The final tab in any *shinymgr* app provides the opportunity to save the analysis itself. Reproducibility is a core tenet of *shinymgr.* Therefore, a robust set of metadata are saved as an RDS file to allow a user to understand and replicate their results. An example of a completed analysis is the file, "iris_explorer_Gandalf_2023_06_05_16_30.RDS," which stores a user's analytic steps for a run of the "iris explorer" app. 
The code below reads in this example file, and shows the structure (a list with 23 elements): + +```{r, eval = TRUE, echo = TRUE} +rds_filepath <- paste0(getwd(),"/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS") +old_analysis <- readRDS(rds_filepath) +str(old_analysis, max.level = 2, nchar.max = 20, vec.len = 15) +``` + +The list stores a great deal of information: + +* **analysisName** is the name of the analysis and is equivalent to the filename of the RDS file (without the extension) +* **app** is the name of the app that produced the saved analysis results. +* **username** was entered in the "Save" tab when the analysis was performed. +* **mod#-value** indicate the values of each *shiny* module's arguments (inputs), if any exist, at the time the analysis was saved. +* **returns** includes values of all outputs (returns) of each module. +* **notes** were entered in the "Save" tab when the analysis was performed. +* **timestamp** is the date/time when the analysis was saved. +* **metadata** includes robust information about each module, including the app description and the description of each module as it was originally stored in the *shinymgr* database tables. The metadata list element also includes an *renv* "lockfile": a list that describes the R version and R package dependencies (including *shinymgr*) used by the app itself. The lockfile captures the state of the app's package dependencies at the time of its creation; in the case of *shinymgr*, it contains the dependencies used by the developer who created the app. Each lockfile record includes the name and version of the package and their installation source. +* **\*\_code** attributes with this format contain the source code for the app. + +The code list element allows an end user to revisit the full analysis with *shinymgr*'s `rerun_analysis()` function, supplying the file path to a saved *shinymgr* analysis (RDS file). 
+ +```{r, echo = TRUE, eval = FALSE} +rerun_analysis(analysis_path = rds_filepath) +``` + +The `rerun_analysis()` function will launch a *shiny* app with two tabs (Figure \@ref(fig:fig7)); it can only be run during an interactive R session, with no other *shiny* apps running. + +```{r fig7, echo = F, out.width = "100%", fig.cap = "A screenshot of the rerun\\_analysis() function, as called on the saved analysis from the iris\\_explorer app (RDS file). The active tab, called 'The App', allows a user to rerun a previously executed analysis. The 'Analysis Summary' tab displays the values of all module arguments and returns, captured when the analysis was saved, along with a detailed description of the app, its modules, the app's source code, and all package dependencies."} +knitr::include_graphics('images/figure7.png') +``` + +The first tab is called "The App", and will be visible when the `rerun_analysis()` function is called. It contains a header with the app's name, a subheading of "Analysis Rerun," and a fully functioning, identical copy of the *shiny* app used to generate the saved analysis. Below that, a disclaimer appears, indicating the app was produced from a saved analysis. A summary of the analysis is presented on the second tab that displays the values used to produce the given analysis output. + +If the `rerun_analysis()` function fails, it could be due to a change in R and package versions currently installed on the end-user's machine. To that end, the lockfile that is included in the metadata section of the RDS file can be used to restore the necessary R packages and R version with the `restore_analysis()` function. This function will attempt to create a self-contained *renv* R project that includes all of the packages and the R version used by the developer when the app was created. The analysis RDS is added to this new project, where the `rerun_analysis()` function can be attempted again.
Readers are referred to the "analyses" tutorial for further information. + +## Rapid reporting + +Another important feature of *shinymgr* is the ability to share results of an analysis with others in a friendly, readable format with *RMarkdown* or *Quarto*. Apps produce an RDS file, which may be passed into an Rmd or qmd file as a parameterized input. For example, the demo database includes a report template called "iris_explorer_report.Rmd." This file, with code shown below, allows users to navigate to the RDS file produced by the "iris explorer" app and render the rapid report. + +```{r, comment = '', echo=FALSE, class.output='r'} +rmd_filepath <- paste0(getwd(),"/shinymgr/reports/iris_explorer/iris_explorer_report.Rmd") +cat(paste(readLines(rmd_filepath), collapse = '\n')) +``` +Reports may be run within the deployed version of *shinymgr* (e.g., left menu of Figure \@ref(fig:fig6)), or may be run directly in R by opening the Rmd file and navigating to the RDS as a file input. Users who run a report can download it to their local machine as an HTML, PDF, or Word file, where they can further customize the output. + +To summarize this section, users of *shinymgr* "apps" created with the *shinymgr* framework are presented with a series of *shiny* tabs that establish an analysis workflow. Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Further, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting. + +# Tutorials and cheatsheet {#tuts} + +A series of *learnr* tutorials come with the package. Below is a list of current tutorials, intended to be worked through in order: + +```{r, warning = FALSE} +learnr::available_tutorials( + package = "shinymgr") %>% + dplyr::arrange(title) + +``` + +The "intro" tutorial gives a general overview. Tutorials 2-5 are aimed at developers who are new to *shiny*, while tutorials 6-12 focus on the *shinymgr* package.
+ +Launch a tutorial with the *learnr* `run_tutorial()` function, providing the name of the tutorial to launch. The tutorial should launch in a browser, which has the benefit of being able to print the tutorial to PDF upon completion: + +```{r, eval = FALSE, echo = TRUE} +learnr::run_tutorial( + name = "modules", + package = "shinymgr") +``` + +Additionally, the package cheatsheet can be found with: + +```{r, eval = FALSE, echo = TRUE} +browseURL(paste0(find.package("shinymgr"), "/extdata/shinymgr_cheatsheet.pdf")) +``` + +Contributions are welcome from the community. Questions can be asked on the +issues page at https://code.usgs.gov/vtcfwru/shinymgr/issues. + + +# Acknowledgments + +We thank Cathleen Balantic and Jim Hines for feedback on the overall package and package tutorials. *shinymgr* was prototyped by Therese Donovan at a *shiny* workshop taught by Chris Dorich and Matthew Ross at Colorado State University in 2020 (pre-pandemic). We thank the instructors for feedback and initial coding assistance. Any use of trade, firm, or product names is for descriptive purposes only and does not imply endorsement by the U.S. Government. The Vermont Cooperative Fish and Wildlife Research Unit is jointly supported by the U.S. Geological Survey, University of Vermont, Vermont Fish and Wildlife Department, and Wildlife Management Institute. + +# Bibliography + +
    + +\newpage + +# Appendix A + +Entity relationship diagram for the *shinymgr* database, which tracks all components of apps and modules (Figure \@ref(fig:fig8)). The database consists of 11 tables. Primary keys are referenced with a "pk" prefix, while foreign keys are referenced with an "fk" prefix. A full description of the database is contained in the "database" *learnr* tutorial that comes with the *shinymgr* package. + +```{r fig8, echo = F, out.width = "85%", fig.cap = "Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules. The database consists of 11 tables. Primary keys are referenced with a 'pk' prefix, while foreign keys are referenced with an 'fk' prefix. A full description of the database is contained in the 'database' learnr tutorial that comes with the shinymgr package.", fig.pos = 'h'} +knitr::include_graphics('images/figure8.png') +``` + +\newpage + +# Appendix B + +Modules in *shinymgr* are written by developers for their own purposes. The `shinymgr::mod_init()` function creates a template for module development. The header is a series of key-value pairs that the developer fills out (typically after the module code is written and tested). The "iris_cluster" module is presented below as an example. The module consists of two paired functions: here, `iris_cluster_ui(id)` and `iris_cluster_server()`. The UI is a function with an argument called id, which is turned into the module's "namespace" with the `NS()` function. A namespace is simply the module's identifier and ensures that function and object names within a given module do not conflict with function and object names in other modules. The IDs for each input and output in the UI must be wrapped in a `ns()` function call to make explicit that these inputs are assigned to the module's namespace. All UI elements are wrapped in a `tagList()` function, where a `tagList` allows one to combine multiple UI elements into a single R object.
Readers should consult the "modules," "tests," and "shinymgr_modules" tutorials for additional information. + +```{r, comment = '', echo=FALSE, class.output='r'} +rmd_filepath <- paste0(getwd(),"/shinymgr/modules/iris_cluster.R") +cat(paste(readLines(rmd_filepath), collapse = '\n')) +``` + +\newpage + + + + + + diff --git a/_articles/RJ-2024-009/RJ-2024-009.html b/_articles/RJ-2024-009/RJ-2024-009.html new file mode 100644 index 0000000000..c500338d20 --- /dev/null +++ b/_articles/RJ-2024-009/RJ-2024-009.html @@ -0,0 +1,3285 @@ + + + + + + + + + + + + + + + + + + + + + + shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules into Reproducible Workflows + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules into Reproducible Workflows

    + + + +

    The R package shinymgr provides a unifying framework that allows Shiny developers to create, manage, and deploy a master Shiny application comprised of one or more “apps”, where an “app” is a tab-based workflow that guides end-users through a step-by-step analysis. Each tab in a given “app” consists of one or more Shiny modules. The shinymgr app builder allows developers to “stitch” Shiny modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. Apps developed using shinymgr can be incorporated into R packages or deployed on a server, where they are accessible to end-users. Users of shinymgr apps can save analyses as an RDS file that fully reproduces the analytic steps and can be ingested into an RMarkdown or Quarto report for rapid reporting. In short, developers use the shinymgr framework to write Shiny modules and seamlessly combine them into Shiny apps, and end-users of these apps can execute reproducible analyses that can be incorporated into reports for rapid dissemination. A comprehensive overview of the package is provided by 12 learnr tutorials.

    +
    + + + +
    +
    + +
    +

    1 Introduction

    +

    The shiny R package allows users to build interactive web apps straight from R, without advanced knowledge of HTML or JavaScript (Chang et al. 2022). A shiny web app can permit an expedient analysis pipeline or workflow. Ideally, the pipeline can produce outputs that are fully reproducible (Gentleman and Lang 2007; Peng 2011; Alston and Rick 2021). Moreover, the pipeline can permit rapid reporting to convey the results of an analysis workflow to a target audience (Stoudt et al. 2021) (Figure 1).

    +
    + +
    +
    +
    +Stages of a reproducible workflow, a process that moves an inquiry from raw data to insightful contribution. +

    +Figure 1: Stages of a reproducible workflow, a process that moves an inquiry from raw data to insightful contribution. +

    +
    +
    +

    shiny applications range from simple to complex, each with an intended purpose developed for an intended user audience. Several R packages provide a development framework for building multi-faceted master applications, including shinipsum for prototyping (Fay and Rochette 2020), golem (Fay et al. 2021), and rhino (Żyła et al. 2023).

    +

    From the developer’s perspective, complex shiny applications can result in many lines of code, creating challenges for collaborating, debugging, streamlining, and maintaining the overall product. shiny modules are a solution to this problem. As stated by Winston Chang (Modularizing shiny app code 2020), “A shiny module is a piece of a shiny app. It can’t be directly run, as a shiny app can. Instead, it is included as part of a larger app . . . Once created, a shiny module can be easily reused – whether across different apps, or multiple times in a single app.” shiny modules, and modularization in general, are a core element of agile software development practices (Larman 2004). Several authors have contributed R packages for distributing pre-written shiny modules for general use, including the datamods (Perrier et al. 2022), shiny.reglog (Kosinski 2022), periscope (Brett and Neuhaus 2022), shinyauthr (Campbell 2021), and jsmodule (Kim and Lee 2022) packages.

    +

    However, as the number of available modules increases, there is a pressing need for documenting available shiny modules and easily incorporating them into new workflows. For example, consider a toy modular-based app that guides a user through an analysis of the famous “Iris Dataset,” which contains 150 records of 3 species of iris, including measurements of the length and width of the flowers’ sepals and petals (Fisher 1936). The app, called “Iris Explorer,” consists of 5 tabs to be worked through in sequence (Figure 2, top).

    +

    Tab 1 displays instructions for use, while tab 2 performs a k-means clustering of the data, where k is specified by the user. The resulting clusters are displayed with two variables of the user’s choosing as depicted in Figure 2. In tab 3, the user will choose a value n, indicating the number of rows by which to randomly subset the data, and in tab 4 the user selects a single variable to be plotted as a bar chart. Finally, in tab 5 the user can save their outputs as an RDS file. This contrived example includes some key elements of a typical workflow in that the five tabs introduce a dataset, guide the user through light data wrangling, produce analysis outputs, and offer the ability to save the results.

    +

    The app’s blueprint (Figure 2, bottom) identifies the shiny modules in each tab, showing how outputs from one module can serve as inputs to the next. Note that while this example shows a single module in each tab with differing inputs/outputs, in the general case tabs can contain an arbitrary number of shiny modules (including multiple instances of the same module) and each module can have multiple inputs/outputs.

    +

    While two of the shiny modules within the “iris_explorer” app pertain to the iris dataset specifically (“iris_intro” and “iris_cluster”), the remaining shiny modules (“subset_rows”, “single_column_plot”, and “save”) may be incorporated into other apps.

    +
    +
    +Top:  The 'iris\_explorer' app guides a user through an analysis of the iris dataset in a tab-based sequence.  Bottom:  A blueprint of the 'iris\_explorer' app shows the 5 tabs, each containing a single module identified by name within blue ovals. Some of the shiny modules require inputs and generate outputs as identified in gray polygons. +

    +Figure 2: Top: The ‘iris_explorer’ app guides a user through an analysis of the iris dataset in a tab-based sequence. Bottom: A blueprint of the ‘iris_explorer’ app shows the 5 tabs, each containing a single module identified by name within blue ovals. Some of the shiny modules require inputs and generate outputs as identified in gray polygons. +

    +
    +
    +
    +

    Developers who utilize the same shiny modules within different apps will naturally be faced with several questions:

    +
      +
    1. Which shiny modules have been written? Are they well documented with unit testing?
    2. +
    3. What are the module’s inputs (arguments) and outputs (returns)?
    4. +
    5. Where are the shiny modules stored?
    6. +
    7. How can shiny modules be combined into a cohesive, well-documented app?
    8. +
    9. How can production-ready apps be deployed for end-users?
    10. +
    +

    Users of an app created with the shinymgr framework may wish to know:

    +
      +
    1. Can analysis outputs be saved as a fully reproducible workflow?
    2. +
    3. Can outputs be ingested into an RMarkdown or Quarto template for rapid reporting?
    4. +
    +

    1.1 Introducing shinymgr

    +

    The R package, shinymgr, was developed to meet these challenges (Clarfeld et al. 2024). The shinymgr package includes a general framework that allows developers to create shiny modules, stitch them together as individual “apps” that are embedded within the master shiny application, and then deploy them on a shiny server or incorporate them into R packages. shinymgr was motivated from our first-hand experience in our work building tools that assist scientists in remote wildlife monitoring with the R package AMMonitor (Balantic and Donovan 2020). Dependencies of shinymgr include the packages DBI (R Special Interest Group on Databases (R-SIG-DB) et al. 2022), reactable (Lin 2022), RSQLite (Müller et al. 2022), renv (Ushey 2023), shiny (Chang et al. 2022), shinyjs (Attali 2021), and shinydashboard (Chang and Borges Ribeiro 2021).

    +

    From the developer’s perspective, an “app” consists of an ordered set of tabs, each of which contains specified shiny modules. shiny modules are the basic element in the shinymgr framework; they can be used and re-used across different tabs and different apps. Information about each module and app is stored in a SQLite database (Hipp 2020). The shinymgr app builder “stitches” shiny modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. When apps are production-ready, developers can deploy a stand-alone shiny application independent of shinymgr on a server or within an R package. From the end-user’s perspective, an “app” created with the shinymgr framework consists of an ordered series of shiny tabs, establishing an analysis. Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Furthermore, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting. We are unaware of existing packages that unify the elements of modularization, documentation, reproducibility, and reporting in a single framework.

    +

    We introduce shinymgr in sections 2-4 below. In section 2 we describe how developers can create apps using the shinymgr framework. In section 3 we describe how developers can deploy a shinymgr project on a local machine, server, or within an R package. Section 4 describes the end-user experience, where end-users execute an “app” and store results for reproducibility and reporting. The package tutorials and cheat sheet are described in section 5. The shinymgr package comes with a series of learnr (Schloerke et al. 2020) tutorials described at the end of the paper.

    +

    2 Developing shinymgr apps

    +

    2.1 Setting up shinymgr

    +

    The canonical home of shinymgr is https://code.usgs.gov/vtcfwru/shinymgr/ where shinymgr users may post merge requests and bug fix requests. shinymgr may also be downloaded from CRAN.

    +
    +
    +
    install.packages("shinymgr")
    +
    +
    +

    The development version can be downloaded with:

    +
    +
    +
    remotes::install_gitlab(
    +  repo = "vtcfwru/shinymgr",
    +  auth_token = Sys.getenv("GITLAB_PAT"),
    +  host = "code.usgs.gov",
    +  build_vignettes = FALSE)
    +
    +
    +

    Once installed, a new shinymgr project can be created within a parent directory:

    +
    +
    +
    # set the directory path that will house the shinymgr project
    +parentPath <- getwd()
    +
    +# set up raw directories and fresh database
    +shinymgr_setup(
    +  parentPath = parentPath, 
    +  demo = TRUE)
    +
    +
    +

    The shinymgr_setup() function produces the following directory structure within the primary “shinymgr” directory. This structure consists of 3 files that make up the “master” app (global.R, server.R, and ui.R), and 9 directories. If the argument demo is set to FALSE, these directories will be largely empty, except for the “modules_mgr” and “database” directories, which will contain shiny modules for rendering shinymgr’s UI and an empty SQLite database, respectively. If the argument demo is set to TRUE, each directory will include several demo files as shown, including a pre-populated database. Here, we highlight a subset of the demo files related to the “iris_explorer” app to guide developers through the key elements of shinymgr (additional demo files come with package but are omitted here for clarity).

    +
    +
    shinymgr
    +├── analyses
    +│   └── iris_explorer_Gandalf_2023_06_05_16_30.RDS
    +├── data
    +│   └── iris.RData
    +├── database
    +│   └── shinymgr.sqlite
    +├── global.R
    +├── modules
    +│   ├── iris_cluster.R
    +│   ├── iris_intro.R
    +│   ├── single_column_plot.R
    +│   └── subset_rows.R
    +├── modules_app
    +│   └── iris_explorer.R
    +├── modules_mgr
    +│   ├── add_app.R
    +│   ├── add_mod.R
    +│   ├── add_report.R
    +│   ├── add_tab.R
    +│   ├── app_builder.R
    +│   ├── my_db.R
    +│   ├── new_analysis.R
    +│   ├── new_report.R
    +│   ├── queries.R
    +│   ├── save_analysis.R
    +│   ├── stitch_script.R
    +│   └── table.R
    +├── reports
    +│   └── iris_explorer
    +│       └── iris_explorer_report.Rmd
    +├── server.R
    +├── tests
    +│   ├── shinytest
    +│   │   ├── test-iris_explorer-expected
    +│   │   │   ├── 001.json
    +│   │   │   ├── 001.png
    +│   │   │   ├── 002.json
    +│   │   │   └── 002.png
    +│   │   └── test-iris_explorer.R
    +│   ├── shinytest.R
    +│   ├── testthat
    +│   │   ├── test-iris_cluster.R
    +│   │   └── test-subset_rows.R
    +│   └── testthat.R
    +├── ui.R
    +└── www
    +    ├── dark_mode.css
    +    └── shinymgr-hexsticker.png
    +
    +

    The directory structure produced by shinymgr_setup() includes the following:

    +
      +
    • The analyses directory provides the developer an example of a previously run analysis that was created using the shinymgr framework (an RDS file). An analysis file name includes the app name (e.g. “iris_explorer”), the name of the person who ran the analysis (e.g. “Gandalf”), and the date and time of the analysis (e.g., “iris_explorer_Gandalf_2023_06_05_16_30.RDS”).

    • +
    • The data directory stores RData files that can be used by various shinymgr apps (e.g., “iris.RData”).

    • +
    • The database directory stores the shinymgr SQLite database, named “shinymgr.sqlite.” The database is used by the developer to track all shiny modules, their arguments (inputs), returns (outputs), and how they are combined into shinymgr apps.

    • +
    • The modules directory stores stand-alone shiny modules. These files are largely written by the developer with the help of the mod_init() function, and are registered in the database with the mod_register() function. Four of the example shiny modules listed are used in the “iris_explorer” app.

    • +
    • The modules_app directory stores shiny modules that are shinymgr “apps” – the stitching together of shiny modules into a tab-based layout that provides an analysis workflow (Figure 2 shows the +“iris_explorer” app layout). Files within the “modules_app” directory are not written by hand - instead, they are created with the shinymgr “app builder.”

    • +
    • The modules_mgr directory stores shiny modules that build the overall shinymgr framework.

    • +
    • The reports directory provides an example of an RMarkdown (Rmd) template (e.g., “iris_explorer_report.Rmd”), allowing for rapid reporting by an end-user.

    • +
    • The tests directory stores both testthat (Wickham 2011) and shinytest (Chang et al. 2021) code testing scripts.

    • +
    • The www directory stores images that may be used by a shiny app.

    • +
    • In addition to these directories, three files are created for launching the master shinymgr shiny application:

      +
        +
      1. ui.R - This file contains code to set the user interface for the master shinymgr app.
        +
      2. +
      3. server.R - The master server file.
        +
      4. +
      5. global.R - The global.R file is sourced into the server.R file at start-up. It sources all of the shiny modules within the shinymgr framework so they are available when shinymgr is launched.
      6. +
    • +
    +

    2.2 The shinymgr developer’s portal

    +

    Once set-up is complete, the launch_shinymgr() function will launch the shinymgr “Developer’s Portal” UI, allowing developers to create and test new shinymgr apps.

    +
    +
    +
    # launch shinymgr
    +launch_shinymgr(shinyMgrPath = paste0(parentPath, "/shinymgr"))
    +
    +
    +

    The portal is recognizable by the shinymgr logo in the upper left corner (Figure 3). The portal consists of three main tabs in the left menu. The “Developer Tools” tab is used to create apps, view the shinymgr database, and register reports, while the “Analysis (beta)” and “Reports (beta)” tabs allow developers to evaluate apps from the user’s perspective.

    +
    +
    +The shinymgr Developer Portal consists of a sidebar panel where developers can create new shiny modules and new apps, and test-drive analyses and reports from the user's perspective. The main panel shows the 'Build App' tab within the 'Developer Tools' section. +

    +Figure 3: The shinymgr Developer Portal consists of a sidebar panel where developers can create new shiny modules and new apps, and test-drive analyses and reports from the user’s perspective. The main panel shows the ‘Build App’ tab within the ‘Developer Tools’ section. +

    +
    +
    +

    The “Developer Tools” section includes 4 tabs for app development: The “Build App” tab allows the developer to create new shinymgr apps from existing modules using the shinymgr app builder; the “Database” tab displays the shinymgr database tables, the “Queries” tab contains a set of standard database queries, and the “Add Reports” tab allows the developer to link a report (Rmd or qmd) to a given shinymgr app (Figure 3), as described below.

    +

    2.3 The shinymgr database

    +

    The shinymgr SQLite database (“shinymgr.sqlite”) is a single file created by the shinymgr_setup() function. The database tracks all shiny modules, their arguments (inputs), returns (outputs), their package dependencies and version numbers, how they are combined into an “app,” and any reports that are associated with apps. The database tables are populated via dedicated shinymgr functions.

    +

    The shinymgr database consists of 11 tables in total (Figure 4). These tables are connected to each other as a typical relational database, with primary keys establishing unique records in each table, and foreign keys that reference primary keys in other tables (see Appendix A for a full database schema and the “database” learnr tutorial for additional information).

    +

    The “apps,” “appReports,” “reports,” “appTabs,” and “tabs” tables largely store information on what a user would see when they run an analysis. The table “apps” stores information about apps such as “iris_explorer.” Apps consist of tabs, which are listed in the “tabs” table. Tabs are linked to apps via the “appTabs” table. The table “reports” lists any Rmd or qmd files that serve as a report template, and the table “appReports” links a specific report with a specific app.

    +

    The remaining 6 tables in Figure 4 are “modules,” “modFunctionArguments,” “modFunctionReturns,” “modPackages,” “tabModules,” and “appStitching.” These tables largely store information about shiny modules that a developer creates, i.e., what shiny modules have been written, what are their arguments and returns, and what packages they use. The “tabModules” table identifies which tabs call which shiny modules (with a single tab capable of calling multiple shiny modules), and the “appStitching” table specifies how shiny modules are “stitched” together, i.e., which module returns are passed in as arguments to downstream shiny modules.

    +
    +
    +The 11 tables of the shinymgr SQLite database. Lines indicate how the tables are related to each other. +

    +Figure 4: The 11 tables of the shinymgr SQLite database. Lines indicate how the tables are related to each other. +

    +
    +
    +

    Four of the 11 database tables focus on modules, highlighting that shiny modules are basic building blocks of any shinymgr app. Developers create new shiny modules with the mod_init() function, which copies a shinymgr module template (an R file template) that includes a header with key-value pairs that describe the module, including the module name, display name, description, citation, notes, and module arguments and returns (if any). For example, the header of the iris_cluster module is:

    +
    +
    #!! ModName = iris_cluster
    +#!! ModDisplayName = Iris K-Means Clustering
    +#!! ModDescription = Clusters iris data based on 2 attributes
    +#!! ModCitation = Baggins, Bilbo.  (2023). iris_cluster. [Source code].
    +#!! ModNotes = Demo module for the shinymgr package.
    +#!! ModActive = 1
    +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame
    +
    +

    The module code is written beneath the header (see Appendix B for an example). Function calls within the module code should be written with package::function() notation, making explicit any R package dependencies. Once the module is completed, unit tests can be written and stored in the shinymgr project’s “tests” directory. The final module file is saved to the “modules” directory and registered into the database with the mod_register() function. The mod_register() function populates the “modules”, “modFunctionArguments”, and “modFunctionReturns” SQLite database tables. Further, it uses the renv package to identify any package dependencies and inserts them into the “modPackages” table. Readers are referred to the “modules”, “tests”, and “shinymgr_modules” learnr tutorials that come with the shinymgr package for more details.

    +

    Once modules are registered in the database, the developer can incorporate them into new apps. As shiny modules and apps in the database represent files that contain their scripts, deleting a module or an app from the database will delete all downstream database entries as well as (optionally) the actual files themselves. Deletion of a module will fail if it is being used in other apps. Module updates can be versioned by creating a new module and then referencing its precursor in the “modules” database table.

    +

    2.4 The shinymgr app builder

    +

    Once developers create and register their own stand-alone shiny modules, apps are generated with shinymgr’s app builder (Figure 5).

    +
    +
    +The shinymgr Developer Portal layout, showing the app builder in the Developer Tools. +

    +Figure 5: The shinymgr Developer Portal layout, showing the app builder in the Developer Tools. +

    +
    +
    +

    Developers are guided through a process where they design their app from shiny modules they have registered. The builder then populates the shinymgr database with instructions on how to construct the app and writes the app’s script based on those instructions. The newly created script is saved to the “modules_app” directory. Through this structured process, apps produced by the builder are well-documented and generate highly reproducible analyses. Readers are encouraged to peruse the tutorial, “apps”, for more information.

    +

    The qry_app_flow() function will query the database to return a list of the shiny modules and tabs included in a specified app, such as “iris_explorer”:

    +
    +
    +
    # look at the appTabs table in the database
    +qry_app_flow("iris_explorer", shinyMgrPath = paste0(getwd(),"/shinymgr"))
    +
    +
          fkAppName      fkTabName tabOrder       fkModuleName modOrder
    +1 iris_explorer       IE_intro        1         iris_intro        1
    +2 iris_explorer   IE_iris_data        2       iris_cluster        1
    +3 iris_explorer IE_subset_rows        3        subset_rows        1
    +4 iris_explorer   IE_plot_data        4 single_column_plot        1
    +
    +

    As shown in Figure 2, this app has 5 tabs, and each tab features a single module. The “Save” tab is the final tab in all shinymgr apps and is not listed in the query result.

    +

    Developers can “beta test” apps prior to deployment by selecting the Analysis (beta) tab in the Developer’s Portal (Figure 3). They can also create RMarkdown or Quarto report templates that accept the outputs from an analysis and incorporate them into a report. Report metadata are logged in the “reports” table of the database, and then linked with a specific app in the “appReports” table. An end-user will run an analysis and render a report, a process described more fully in the “Using shinymgr Apps” section below.

    +

    To summarize this section, developers use the shinymgr_setup() function to create the directory structure and underlying database needed to build and run shiny apps with shinymgr. Developers use the mod_init() and mod_register() functions to create modules and make them available for inclusion in new apps built with the shinymgr app builder. A developer can create as many shinymgr projects as needed. In each case, the shinymgr project is simply a fixed directory structure with three R files (ui.R, server.R, and global.R), and a series of subdirectories that contain the apps and shiny modules created by the developer, along with a database for tracking everything.

    +

    3 Deploying shinymgr projects

    +

    Once development is completed, developers can deploy their shinymgr project on a server or within an R package by copying portions of the shinymgr project to a new location while retaining the original project for future development. Once deployed, a shinymgr project no longer requires the shinymgr package or database to be run. Thus, the files and directories to be copied for deployment include only:

    +
    +
    shinymgr
    +├── data
    +├── global.R
    +├── modules
    +├── modules_app
    +├── modules_mgr
    +├── reports
    +├── server.R
    +├── ui.R
    +└── www
    +
    +

    The master app files, ui.R, global.R, and server.R, are needed to run the shinymgr framework.

    +

    When deploying a shinymgr project within an R package, objects within the data folder should be copied into the package’s “data” folder. The remaining files should be copied into a directory within the package’s “inst” folder that will house the master shiny application. Deployment on a server such as shinyapps.io will require similar adjustments.

    +

    After files are copied to the correct location, a few key adjustments are needed. First, the “modules_app” directory should contain only those apps (and dependent modules and reports) that can be used by end-users; unused apps, modules, and reports can be deleted. Second, the new.analysis.R script within the modules_mgr folder will require minor updates to remove dependencies on the shinymgr database. Third, the ui.R and server.R scripts should be updated to no longer showcase shinymgr and the Developer’s Portal; rather, they should be customized by the developer to create their own purpose-driven apps. For example, Figure 6 shows a hypothetical deployment of the master app titled “Deployed Project” that is based on the shinymgr framework. Notice the absence of the Developer Tools tab and the absence of references to shinymgr. The “deployment” learnr tutorial provides more in-depth discussion.

    +
    +
    +An example of a deployed shinymgr app. The deployed version excludes the Developers Tools tab and is an example of what the end user sees when using a deployed app. +

    +Figure 6: An example of a deployed shinymgr app. The deployed version excludes the Developers Tools tab and is an example of what the end user sees when using a deployed app. +

    +
    +
    +

    To summarize this section, deploying the shinymgr framework involves copying key elements of the shinymgr developer project into package or server directories, updated as needed for use by end-users. Readers are referred to the “deployment” tutorial for further information.

    +

    4 Using shinymgr apps

    +

    Apps built with shinymgr can appeal to various types of end-users. When deployed as part of an R package, end-users would be anyone who uses that package. Apps may also be distributed as stand-alone scripts, or hosted on a server, as described above. Developers may also use shinymgr to produce apps for their own use (i.e., the developer is the end-user). Regardless of who the intended end-user is, this section discusses that user’s experience after the master app is deployed.

    +

    Whatever the intended audience for the app, this section discusses how an app can be used after it has been deployed.

    +

    4.1 Reproducible analyses

    +

    The final tab in any shinymgr app provides the opportunity to save the analysis itself. Reproducibility is a core tenet of shinymgr. Therefore, a robust set of metadata is saved as an RDS file to allow a user to understand and replicate their results. An example of a completed analysis is the file, “iris_explorer_Gandalf_2023_06_05_16_30.RDS,” which stores a user’s analytic steps for a run of the “iris explorer” app. The code below reads in this example file, and shows the structure (a list with 23 elements):

    +
    +
    +
    rds_filepath <- paste0(getwd(),"/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS")
    +old_analysis <- readRDS(rds_filepath)
    +str(old_analysis, max.level = 2, nchar.max = 20, vec.len = 15)
    +
    +
    List of 23
    + $ analysisName                          : chr "iri"| __truncated__
    + $ app                                   : chr "iris_explorer"
    + $ username                              : chr "Gandalf"
    + $ mod2-clusters                         : int 3
    + $ mod2-xcol                             : chr "Sepal.Length"
    + $ mod2-ycol                             : chr "Petal.Length"
    + $ mod3-full_table__reactable__pageSize  : int 10
    + $ mod3-resample                         : 'shinyActionButtonValue' int 1
    + $ mod3-full_table__reactable__pages     : int 15
    + $ mod3-subset_table__reactable__page    : int 1
    + $ mod3-full_table__reactable__page      : int 1
    + $ mod3-sample_num                       : int 20
    + $ mod3-subset_table__reactable__pages   : int 2
    + $ mod3-subset_table__reactable__pageSize: int 10
    + $ returns                               :List of 3
    +  ..$ data1:List of 1
    +  ..$ data2:List of 1
    +  ..$ data3:List of 2
    + $ notes                                 : chr "Thi"| __truncated__
    + $ timestamp                             : POSIXct[1:1], format: "202"| __truncated__
    + $ metadata                              :List of 6
    +  ..$ appDescription: chr "Clu"| __truncated__
    +  ..$ mod1          :List of 7
    +  ..$ mod2          :List of 7
    +  ..$ mod3          :List of 7
    +  ..$ mod4          :List of 7
    +  ..$ lockfile      :List of 2
    + $ app_code                              : chr "# T"| __truncated__
    + $ iris_intro_code                       : chr "#!!"| __truncated__
    + $ iris_cluster_code                     : chr "#!!"| __truncated__
    + $ subset_rows_code                      : chr "#!!"| __truncated__
    + $ single_column_plot_code               : chr "#!!"| __truncated__
    +
    +

    The list stores a great deal of information:

    +
      +
    • analysisName is the name of the analysis and is equivalent to the filename of the RDS file (without the extension)
    • +
    • app is the name of the app that produced the saved analysis results.
    • +
    • username was entered in the “Save” tab when the analysis was performed.
    • +
    • mod#-value indicate the values of each shiny module’s arguments (inputs), if any exist, at the time the analysis was saved.
    • +
    • returns includes values of all outputs (returns) of each module.
    • +
    • notes were entered in the “Save” tab when the analysis was performed.
    • +
    • timestamp is the date/time when the analysis was saved.
    • +
    • metadata includes robust information about each module, including the app description and the description of each module as it was originally stored in the shinymgr database tables. The metadata list element also includes an renv “lockfile”: a list that describes the R version and R package dependencies (including shinymgr) used by the app itself. The lockfile captures the state of the app’s package dependencies at the time of its creation; in the case of shinymgr, it contains the dependencies used by the developer who created the app. Each lockfile record includes the name and version of the package and their installation source.
    • +
    • *_code attributes with this format contain the source code for the app.
    • +
    +

    The code list element allows an end user to revisit the full analysis with shinymgr’s rerun_analysis() function, supplying the file path to a saved shinymgr analysis (RDS file).

    +
    +
    +
    rerun_analysis(analysis_path = rds_filepath)
    +
    +
    +

    The rerun_analysis() function will launch a shiny app with two tabs (Figure 7); it can only be run during an interactive R session, with no other shiny apps running.

    +
    +
    +A screenshot of the rerun\_analysis() function, as called on the saved analysis from the iris\_explorer app (RDS file). The active tab, called 'The App', allows a user to rerun a previously executed analysis. The 'Analysis Summary' tab displays the values of all module arguments and returns, captured when the analysis was saved, along with a detailed description of the app, its modules, the app's source code, and all package dependencies. +

    +Figure 7: A screenshot of the rerun_analysis() function, as called on the saved analysis from the iris_explorer app (RDS file). The active tab, called ‘The App’, allows a user to rerun a previously executed analysis. The ‘Analysis Summary’ tab displays the values of all module arguments and returns, captured when the analysis was saved, along with a detailed description of the app, its modules, the app’s source code, and all package dependencies. +

    +
    +
    +

    The first tab is called “The App”, and will be visible when the rerun_analysis() function is called. It contains a header with the app’s name, a subheading of “Analysis Rerun,” and a fully functioning, identical copy of the shiny app used to generate the saved analysis. Below that, a disclaimer appears, indicating the app was produced from a saved analysis. A summary of the analysis is presented on the second tab that displays the values used to produce the given analysis output.

    +

    If the rerun_analysis() function fails, it could be due to a change in R and package versions currently installed on the end-user’s machine. To that end, the lockfile that is included in the metadata section of the RDS file can be used to restore the necessary R packages and R version with the restore_analysis() function. This function will attempt to create a self-contained renv R project that includes all of the packages and the R version used by the developer when the app was created. The analysis RDS is added to this new project, where the rerun_analysis() function can be attempted again. Readers are referred to the “analyses” tutorial for further information.

    +

    4.2 Rapid reporting

    +

    Another important feature of shinymgr is the ability to share results of an analysis with others in a friendly, readable format with RMarkdown or Quarto. Apps produce an RDS file, which may be passed into an Rmd or qmd file as a parameterized input. For example, the demo database includes a report template called “iris_explorer_report.Rmd.” This file, with code shown below, allows users to navigate to the RDS file produced by the “iris explorer” app and render the rapid report.

    +
    +
    ---
    +title: 'Annual Report for Iris Explorer'
    +output: html_document
    +params:
    +  user: 
    +    label: "User"
    +    value: "Bilbo"
    +    placeholder: "Enter user name"
    +  year:
    +    label: "Year"
    +    value: 2017
    +    input: slider
    +    min: 2010
    +    max: 2018
    +    step: 1
    +    sep: ""
    +  file: 
    +   input: file
    +   label: "Choose RDS"
    +   value: ""
    +   multiple: FALSE
    +   buttonLabel: "Browse to analysis output..."
    +---
    +
    +```{r setup, include=FALSE}
    +knitr::opts_chunk$set(echo = FALSE)
    +library(knitr)
    +ps <- readRDS(params$file)
    +```
    +
    +This report summarizes an analysis of iris data by 
    +`r params$user` conducted  in `r params$year`. Iris 
    +data was clustered into `r ps$'mod2-clusters'` groups 
    +based on `r ps$'mod2-xcol'` and `r ps$'mod2-ycol'`. 
    +A random sample of  `r ps$'mod3-sample_num'` records 
    +were collected, with sample sizes shown in the pie
    +chart below:
    +
    +```{r}
    +pie_data <- table(ps$returns$data2$subset_data$cluster)
    +pie(
    +  x = pie_data,
    +  labels = as.character(pie_data), 
    +  col = rainbow(length(pie_data)),
    +  main = "Number of random samples by cluster"
    +)
    +legend(
    +  x = "topright", 
    +  legend = names(pie_data), 
    +  fill = rainbow(length(pie_data))
    +)
    +
    +```
    +
    +Some things to note about this analysis are:  `r ps$notes`
    +
    +Respectfully submitted,
    +
    +Gandalf
    +
    +

    Reports may be run within the deployed version of shinymgr (e.g., left menu of Figure 6), or may be run directly in R by opening the Rmd file and navigating to the RDS as a file input. Users who run a report can download it to their local machine as a HTML, PDF, or Word file, where they can further customize the output.

    +

    To summarize this section, users of shinymgr “apps” created with the shinymgr framework are presented with a series of shiny tabs that establish an analysis workflow. Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Further, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting.

    +

    5 Tutorials and cheatsheet

    +

    A set of 12 learnr tutorials is distributed with the package. Below is a list of current tutorials, intended to be worked through in order:

    +
    +
    Available tutorials:
    +* shinymgr
    +  - intro            : "shinymgr-01: Introduction"
    +  - shiny            : "shinymgr-02: Shiny"
    +  - modules          : "shinymgr-03: Modules"
    +  - app_modules      : "shinymgr-04: App modules"
    +  - tests            : "shinymgr-05: Tests"
    +  - shinymgr         : "shinymgr-06: shinymgr"
    +  - database         : "shinymgr-07: Database"
    +  - shinymgr_modules : "shinymgr-08: shinymgr_modules "
    +  - apps             : "shinymgr-09: Apps"
    +  - analyses         : "shinymgr-10: Analyses"
    +  - reports          : "shinymgr-11: Reports"
    +  - deployment       : "shinymgr-12: Deployment" 
    +
    +

    The “intro” tutorial gives a general overview. Tutorials 2-5 are aimed at developers who are new to shiny, while tutorials 6-12 focus on the shinymgr package.

    +

    Launch a tutorial with the learnr run_tutorial() function, providing the name of the tutorial to launch. The tutorial should launch in a browser, which has the benefit of being able to print the tutorial to PDF upon completion:

    +
    +
    +
    learnr::run_tutorial(
    +  name = "modules", 
    +  package = "shinymgr")
    +
    +
    +

    Additionally, the package cheatsheet can be found with:

    +
    +
    +
    browseURL(paste0(find.package("shinymgr"), "/extdata/shinymgr_cheatsheet.pdf"))
    +
    +
    +

    Contributions are welcome from the community. Questions can be asked on the issues page at https://code.usgs.gov/vtcfwru/shinymgr/issues.

    +

    6 Acknowledgments

    +

    We thank Cathleen Balantic and Jim Hines for feedback on the overall package and package tutorials. shinymgr was prototyped by Therese Donovan at a shiny workshop taught by Chris Dorich and Matthew Ross at Colorado State University in 2020 (pre-pandemic). We thank the instructors for feedback and initial coding assistance. Any use of trade, firm, or product names is for descriptive purposes only and does not imply endorsement by the U.S. Government. The Vermont Cooperative Fish and Wildlife Research Unit is jointly supported by the U.S. Geological Survey, University of Vermont, Vermont Fish and Wildlife Department, and Wildlife Management Institute.

    +

    7 Bibliography

    +
    +
    +J. M. Alston and J. A. Rick. A beginner’s guide to conducting reproducible research. The Bulletin of the Ecological Society of America, 102(2): e01801, 2021. URL https://esajournals.onlinelibrary.wiley.com/doi/abs/10.1002/bes2.1801. +
    +
    +D. Attali. Shinyjs: Easily improve the user experience of your shiny apps in seconds. 2021. URL https://CRAN.R-project.org/package=shinyjs. R package version 2.1.0. +
    +
    +C. Balantic and T. Donovan. AMMonitor: Remote monitoring of biodiversity in an adaptive framework with r. Methods in Ecology and Evolution, 11(7): 869–877, 2020. DOI https://doi.org/10.1111/2041-210X.13397. +
    +
    +C. Brett and I. Neuhaus. Periscope: Enterprise streamlined ’shiny’ application framework. 2022. URL https://CRAN.R-project.org/package=periscope. R package version 1.0.1. +
    +
    +P. Campbell. Shinyauthr: ’Shiny’ authentication modules. 2021. URL https://CRAN.R-project.org/package=shinyauthr. R package version 1.0.0. +
    +
    +W. Chang and B. Borges Ribeiro. Shinydashboard: Create dashboards with ’shiny’. 2021. URL https://CRAN.R-project.org/package=shinydashboard. R package version 0.7.2. +
    +
    +W. Chang, J. Cheng, J. Allaire, C. Sievert, B. Schloerke, Y. Xie, J. Allen, J. McPherson, A. Dipert and B. Borges. Shiny: Web application framework for r. 2022. URL https://CRAN.R-project.org/package=shiny. R package version 1.7.3. +
    +
    +W. Chang, G. Csárdi and H. Wickham. Shinytest: Test shiny apps. 2021. URL https://CRAN.R-project.org/package=shinytest. R package version 1.5.1. +
    +
    +L. Clarfeld, C. Tang and T. Donovan. Shinymgr: A framework for building, managing, and stitching shiny modules into reproducible workflows. U.S. Geological Survey software release. Reston, VA., 2024. DOI 10.5066/P9UXPOBN. R package version 1.1.0. +
    +
    +C. Fay and S. Rochette. Shinipsum: Lorem-ipsum helper function for ’shiny’ prototyping. 2020. URL https://cran.r-project.org/web/packages/shinipsum/index.html. R package version 0.1.0. +
    +
    +C. Fay, S. Rochette, V. Guyader and C. Girard. Engineering production-grade shiny apps. Chapman; Hall/CRC, 2021. DOI https://doi.org/10.1201/9781003029878. +
    +
    +R. A. Fisher. The use of multiple measurements in taxonomic problems. Annals of eugenics, 7(2): 179–188, 1936. DOI https://doi.org/10.1111/j.1469-1809.1936.tb02137.x. +
    +
    +R. Gentleman and D. T. Lang. Statistical analyses and reproducible research. Journal of Computational and Graphical Statistics, 16(1): 1–23, 2007. URL https://doi.org/10.1198/106186007X178663. +
    +
    +R. D. Hipp. SQLite. 2020. URL https://www.sqlite.org/index.html. +
    +
    +J. Kim and H. Lee. Jsmodule: ’RStudio’ addins and ’shiny’ modules for medical research. 2022. URL https://CRAN.R-project.org/package=jsmodule. R package version 1.3.0. +
    +
    +M. Kosinski. Shiny.reglog: Optional login and registration module system for ShinyApps. 2022. URL https://statismike.github.io/shiny.reglog/. R package version 0.5.2. +
    +
    +C. Larman. Agile and iterative development: A manager’s guide. Addison-Wesley Professional, 2004. +
    +
    +G. Lin. Reactable: Interactive data tables based on ’react table’. 2022. URL https://CRAN.R-project.org/package=reactable. R package version 0.3.0. +
    +
    +Modularizing shiny app code. 2020. URL https://shiny.posit.co/r/articles/improve/modules/. Accessed: 2010-09-30. +
    +
    +K. Müller, H. Wickham, D. A. James and S. Falcon. RSQLite: SQLite interface for r. 2022. URL https://CRAN.R-project.org/package=RSQLite. R package version 2.2.14. +
    +
    +R. D. Peng. Reproducible research in computational science. Science, 334(6060): 1226–1227, 2011. URL https://www.science.org/doi/abs/10.1126/science.1213847. +
    +
    +V. Perrier, F. Meyer and Z. S. Abeer. Datamods: Modules to import and manipulate data in ’shiny’. 2022. URL https://CRAN.R-project.org/package=datamods. R package version 1.3.3. +
    +
    +R Special Interest Group on Databases (R-SIG-DB), H. Wickham and K. Müller. DBI: R database interface. 2022. URL https://CRAN.R-project.org/package=DBI. R package version 1.1.3. +
    +
    +B. Schloerke, J. Allaire and B. Borges. Learnr: Interactive tutorials for r. 2020. URL https://CRAN.R-project.org/package=learnr. R package version 0.10.1. +
    +
    +S. Stoudt, V. N. Vásquez and C. C. Martinez. Principles for data analysis workflows. PLOS Computational Biology, 17(3): e1008770, 2021. DOI https://doi.org/10.1371/journal.pcbi.1008770. +
    +
    +K. Ushey. Renv: Project environments. 2023. URL https://rstudio.github.io/renv/. R package version 0.17.3. +
    +
    +H. Wickham. Testthat: Get started with testing. The R Journal, 3: 5–10, 2011. URL https://journal.r-project.org/archive/2011-1/RJournal_2011-1_Wickham.pdf. +
    +
    +K. Żyła, J. Nowicki, L. Siemiński, M. Rogala, R. Vibal and T. Makowski. Rhino: A framework for enterprise shiny applications. 2023. DOI https://doi.org/10.32614/CRAN.package.rhino. https://appsilon.github.io/rhino/, https://github.com/Appsilon/rhino. +
    +
    +
    +

    8 Appendix A

    +

    Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules (Figure 8). The database consists of 11 tables. Primary keys are referenced with a “pk” prefix, while foreign keys are referenced with an “fk” prefix. A full description of the database is contained in the “database” learnr tutorial that comes with the shinymgr package.

    +
    +
    +Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules. The database consists of 11 tables. Primary keys are referenced with a 'pk' prefix, while foreign keys are referenced with an 'fk' prefix. A full description of the database is contained in the 'database' learnr tutorial that comes with the shinymgr package. +

    +Figure 8: Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules. The database consists of 11 tables. Primary keys are referenced with a ‘pk’ prefix, while foreign keys are referenced with an ‘fk’ prefix. A full description of the database is contained in the ‘database’ learnr tutorial that comes with the shinymgr package. +

    +
    +
    +
    +

    9 Appendix B

    +

    Modules in shinymgr are written by developers for their own purposes. The shinymgr::mod_init() function creates a template for module development. The header is a series of key-value pairs that the developer fills out (typically after the module code is written and tested). The “iris_cluster” module is presented below as an example. The module consists of two paired functions: here, iris_cluster_ui(id) and iris_cluster_server(). The UI is a function with an argument called id, which is turned into the module’s “namespace” with the NS() function. A namespace is simply the module’s identifier and ensures that function and object names within a given module do not conflict with function and object names in other modules. The IDs for each input and output in the UI must be wrapped in a ns() function call to make explicit that these inputs are assigned to the module’s namespace. All UI elements are wrapped in a tagList() function, where a tagList allows one to combine multiple UI elements into a single R object. Readers should consult the “modules,” “tests,” and “shinymgr_modules” tutorials for additional information.

    +
    +
    #!! ModName = iris_cluster
    +#!! ModDisplayName = Iris K-Means Clustering
    +#!! ModDescription = Clusters iris data based on 2 attributes
    +#!! ModCitation = Baggins, Bilbo.  (2022). iris_cluster. [Source code].
    +#!! ModNotes = 
    +#!! ModActive = 1
    +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame
    +
    +iris_cluster_ui <- function(id){
    +  # create the module's namespace 
    +  ns <- NS(id)
    +  
    +  tagList(
    +    sidebarLayout(
    +      sidebarPanel(
    +        # add the dropdown for the X variable
    +        selectInput(
    +          ns("xcol"),
    +          label = "X Variable", 
    +          choices = c(
    +            "Sepal.Length", 
    +            "Sepal.Width", 
    +            "Petal.Length", 
    +            "Petal.Width"
    +          ),
    +          selected = "Sepal.Length"
    +        ),
    +        
    +        # add the dropdown for the Y variable
    +        selectInput(
    +          ns("ycol"), 
    +          label = "Y Variable", 
    +          choices = c(
    +            "Sepal.Length", 
    +            "Sepal.Width", 
    +            "Petal.Length", 
    +            "Petal.Width"
    +          ),
    +          selected = "Sepal.Width"
    +        ),
    +        # add input box for the cluster number
    +        
    +        numericInput(
    +          ns("clusters"), 
    +          label = "Cluster count", 
    +          value = 3, 
    +          min = 1, 
    +          max = 9
    +        )
    +      ), # end of sidebarPanel
    +      
    +      mainPanel(
    +        # create outputs
    +        plotOutput(
    +          ns("plot1")
    +        )
    +      ) # end of mainPanel
    +    ) # end of sidebarLayout
    +  ) # end of tagList
    +} # end of UI function
    +
    +iris_cluster_server <- function(id) { 
    +  
    +  moduleServer(id, function(input, output, session) {
    +    
    +    # combine variables into new data frame
    +    selectedData <- reactive({
    +      iris[, c(input$xcol, input$ycol)]
    +    })
    +    
    +    # run kmeans algorithm 
    +    clusters <- reactive({
    +      kmeans(
    +        x = selectedData(), 
    +        centers = input$clusters
    +      )
    +    })
    +    
    +    output$plot1 <- renderPlot({
    +      par(mar = c(5.1, 4.1, 0, 1))
    +      plot(
    +        selectedData(),
    +        col = clusters()$cluster,
    +        pch = 20, 
    +        cex = 3
    +      )
    +    })
    +    
    +    return(
    +      reactiveValues(
    +        returndf = reactive({
    +          cbind(
    +            selectedData(), 
    +            cluster = clusters()$cluster
    +          )
    +        })
    +      )
    +    )
    +    
    +  }) # end of moduleServer function
    +  
    +} # end of irisCluster function
    +
    +
    +
    +

    9.1 CRAN packages used

    +

    shiny, shinipsum, golem, rhino, datamods, shiny.reglog, periscope, shinyauthr, jsmodule, shinymgr, DBI, reactable, RSQLite, renv, shinyjs, shinydashboard, learnr, testthat, shinytest

    +

    9.2 CRAN Task Views implied by cited packages

    +

    Databases, ReproducibleResearch, WebTechnologies

    + + +
    + +
    +
    + + + + + +
    +

    References

    +
    +

    Reuse

    +

    Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

    +

    Citation

    +

    For attribution, please cite this work as

    +
    Clarfeld, et al., "shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules into Reproducible Workflows", The R Journal, 2025
    +

    BibTeX citation

    +
    @article{RJ-2024-009,
    +  author = {Clarfeld, Laurence A. and Tang, Caroline and Donovan, Therese},
    +  title = {shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules into Reproducible Workflows},
    +  journal = {The R Journal},
    +  year = {2025},
    +  note = {https://doi.org/10.32614/RJ-2024-009},
    +  doi = {10.32614/RJ-2024-009},
    +  volume = {16},
    +  issue = {1},
    +  issn = {2073-4859},
    +  pages = {157-174}
    +}
    +
    + + + + + + + diff --git a/_articles/RJ-2024-009/RJ-2024-009.pdf b/_articles/RJ-2024-009/RJ-2024-009.pdf new file mode 100644 index 0000000000..e4f17e65ff Binary files /dev/null and b/_articles/RJ-2024-009/RJ-2024-009.pdf differ diff --git a/_articles/RJ-2024-009/RJ-2024-009.tex b/_articles/RJ-2024-009/RJ-2024-009.tex new file mode 100644 index 0000000000..bb710b02a4 --- /dev/null +++ b/_articles/RJ-2024-009/RJ-2024-009.tex @@ -0,0 +1,665 @@ +% !TeX root = RJwrapper.tex +\title{shinymgr: A Framework for Building, Managing, and Stitching Shiny Modules into Reproducible Workflows} + + +\author{by Laurence A. Clarfeld, Caroline Tang, and Therese Donovan} + +\maketitle + +\abstract{% +The R package shinymgr provides a unifying framework that allows Shiny developers to create, manage, and deploy a master Shiny application comprised of one or more ``apps'', where an ``app'' is a tab-based workflow that guides end-users through a step-by-step analysis. Each tab in a given ``app'' consists of one or more Shiny modules. The shinymgr app builder allows developers to ``stitch'' Shiny modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. Apps developed using shinymgr can be incorporated into R packages or deployed on a server, where they are accessible to end-users. Users of shinymgr apps can save analyses as an RDS file that fully reproduces the analytic steps and can be ingested into an RMarkdown or Quarto report for rapid reporting. In short, developers use the shinymgr framework to write Shiny modules and seamlessly combine them into Shiny apps, and end-users of these apps can execute reproducible analyses that can be incorporated into reports for rapid dissemination. A comprehensive overview of the package is provided by 12 learnr tutorials. 
+} + +\section{Introduction}\label{intro} + +The \CRANpkg{shiny} R package allows users to build interactive web apps straight from R, without advanced knowledge of HTML or JavaScript \citep{shiny}. A \emph{shiny} web app can permit an expedient analysis pipeline or workflow. Ideally, the pipeline can produce outputs that are fully reproducible \citep{Peng, Gentleman, Alston}. Moreover, the pipeline can permit rapid reporting to convey the results of an analysis workflow to a target audience \citep{stoudt2021principles} (Figure \ref{fig:fig1}). + +\begin{figure} +\includegraphics[width=1\linewidth]{images/figure1} \caption{Stages of a reproducible workflow, a process that moves an inquiry from raw data to insightful contribution.}\label{fig:fig1} +\end{figure} + +\emph{shiny} applications range from simple to complex, each with an intended purpose developed for an intended user audience. Several R packages provide a development framework for building multi-faceted master applications, including \CRANpkg{shinipsum} for prototyping \citep{shinipsum}, \CRANpkg{golem} \citep{golum}, and \CRANpkg{rhino} \citep{rhino}. + +From the developer's perspective, complex \emph{shiny} applications can result in many lines of code, creating challenges for collaborating, debugging, streamlining, and maintaining the overall product. \emph{shiny} modules are a solution to this problem. As stated by Winston Chang \citep{shinyblog}, ``A \emph{shiny} module is a piece of a \emph{shiny} app. It can't be directly run, as a \emph{shiny} app can. Instead, it is included as part of a larger app . . . Once created, a \emph{shiny} module can be easily reused -- whether across different apps, or multiple times in a single app.'' \emph{shiny} modules, and modularization in general, are a core element of agile software development practices \citep{larman2004agile}. 
Several authors have contributed R packages for distributing pre-written \emph{shiny} modules for general use, including the \CRANpkg{datamods} \citep{datamods}, \CRANpkg{shiny.reglog} \citep{reglog}, \CRANpkg{periscope} \citep{periscope}, \CRANpkg{shinyauthr} \citep{shinyauthr}, and \CRANpkg{jsmodule} \citep{jsmodule} packages. + +However, as the number of available modules increases, there is a pressing need for documenting available \emph{shiny} modules and easily incorporating them into new workflows. For example, consider a toy modular-based app that guides a user through an analysis of the famous ``Iris Dataset,'' which contains 150 records of 3 species of iris, including measurements of the length and width of the flowers' sepals and petals \citep{fisher1936use}. The app, called ``Iris Explorer,'' consists of 5 tabs to be worked through in sequence (Figure \ref{fig:fig2}, top). + +Tab 1 displays instructions for use, while tab 2 performs a \emph{k}-means clustering of the data, where \emph{k} is specified by the user. The resulting clusters are displayed with two variables of the user's choosing as depicted in Figure \ref{fig:fig2}. In tab 3, the user will choose a value \emph{n}, indicating the number of rows by which to randomly subset the data, and in tab 4 the user selects a single variable to be plotted as a bar chart. Finally, in tab 5 the user can save their outputs as an RDS file. This contrived example includes some key elements of a typical workflow in that the five tabs introduce a dataset, guide the user through light data wrangling, produce analysis outputs, and offer the ability to save the results. + +The app's blueprint (Figure \ref{fig:fig2}, bottom) identifies the \emph{shiny} modules in each tab, showing how outputs from one module can serve as inputs to the next. 
Note that while this example shows a single module in each tab with differing inputs/outputs, in the general case tabs can contain an arbitrary number of \emph{shiny} modules (including multiple instances of the same module) and each module can have multiple inputs/outputs. + +While two of the \emph{shiny} modules within the ``iris\_explorer'' app pertain to the iris dataset specifically (``iris\_intro'' and ``iris\_cluster''), the remaining \emph{shiny} modules (``subset\_rows'', ``single\_column\_plot'', and ``save'') may be incorporated into other apps. + +\begin{figure}[h] +\includegraphics[width=1\linewidth]{images/figure2} \caption{Top: The "iris\_explorer" app guides a user through an analysis of the iris dataset in a tab-based sequence. Bottom: A blueprint of the "iris\_explorer" app shows the 5 tabs, each containing a single module identified by name within blue ovals. Some of the shiny modules require inputs and generate outputs as identified in gray polygons.}\label{fig:fig2} +\end{figure} + +\newpage + +Developers who utilize the same \emph{shiny} modules within different apps will naturally be faced with several questions: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + Which \emph{shiny} modules have been written? Are they well documented with unit testing? +\item + What are the module's inputs (arguments) and outputs (returns)? +\item + Where are the \emph{shiny} modules stored? +\item + How can \emph{shiny} modules be combined into a cohesive, well-documented app? +\item + How can production-ready apps be deployed for end-users? +\end{enumerate} + +Users of an app created with the \emph{shinymgr} framework may wish to know: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\setcounter{enumi}{5} +\tightlist +\item + Can analysis outputs be saved as a fully reproducible workflow? +\item + Can outputs be ingested into a \emph{Rmarkdown} or \emph{Quarto} template for rapid reporting? 
+\end{enumerate} + +\subsection{\texorpdfstring{Introducing \emph{shinymgr}}{Introducing shinymgr}}\label{introducing-shinymgr} + +The R package, \CRANpkg{shinymgr}, was developed to meet these challenges \citep{shinymgr_citation}. The \emph{shinymgr} package includes a general framework that allows developers to create \emph{shiny} modules, stitch them together as individual ``apps'' that are embedded within the master \emph{shiny} application, and then deploy them on a \emph{shiny} server or incorporate them into R packages. \emph{shinymgr} was motivated from our first-hand experience in our work building tools that assist scientists in remote wildlife monitoring with the R package \emph{AMMonitor} \citep{balantic2020ammonitor}. Dependencies of \emph{shinymgr} include the packages \CRANpkg{DBI} \citep{dbi}, \CRANpkg{reactable} \citep{reactable}, \CRANpkg{RSQLite} \citep{RSQLite}, \CRANpkg{renv} \citep{renv}, \CRANpkg{shiny} \citep{shiny}, \CRANpkg{shinyjs} \citep{shinyjs}, and \CRANpkg{shinydashboard} \citep{shinydashboard}. + +From the developer's perspective, an ``app'' consists of an ordered set of tabs, each of which contains specified \emph{shiny} modules. \emph{shiny} modules are the basic element in the \emph{shinymgr} framework; they can be used and re-used across different tabs and different apps. Information about each module and app is stored in a SQLite database \citep{sqlite2020hipp}. The \emph{shinymgr} app builder ``stitches'' \emph{shiny} modules together so that outputs from one module serve as inputs to the next, creating an analysis pipeline that is easy to implement and maintain. When apps are production-ready, developers can deploy a stand-alone \emph{shiny} application independent of \emph{shinymgr} on a server or within an R package. From the end-user's perspective, an ``app'' created with the \emph{shinymgr} framework consists of an ordered series of \emph{shiny} tabs, establishing an analysis. 
Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Furthermore, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting. We are unaware of existing packages that unify the elements of modularization, documentation, reproducibility, and reporting in a single framework. + +We introduce \emph{shinymgr} in sections 2-4 below. In section \hyperref[appdev]{2} we describe how developers can create apps using the \emph{shinymgr} framework. In section \hyperref[appdeploy]{3} we describe how developers can deploy a \emph{shinymgr} project on a local machine, server, or within an R package. Section \hyperref[appUsing]{4} describes the end-user experience, where end-users execute an ``app'' and store results for reproducibility and reporting. The package tutorials and cheat sheet are described in section \hyperref[tuts]{5}. The \emph{shinymgr} package comes with a series of \CRANpkg{learnr} \citep{learnr} tutorials described at the end of the paper. + +\section{\texorpdfstring{Developing \emph{shinymgr} apps}{Developing shinymgr apps}}\label{appdev} + +\subsection{\texorpdfstring{Setting up \emph{shinymgr}}{Setting up shinymgr}}\label{setting-up-shinymgr} + +The canonical home of \emph{shinymgr} is \url{https://code.usgs.gov/vtcfwru/shinymgr/} where \emph{shinymgr} users may post merge requests and bug fix requests. \emph{shinymgr} may also be downloaded from CRAN. 
+ +\begin{verbatim} +install.packages("shinymgr") +\end{verbatim} + +The development version can be downloaded with: + +\begin{verbatim} +remotes::install_gitlab( + repo = "vtcfwru/shinymgr", + auth_token = Sys.getenv("GITLAB_PAT"), + host = "code.usgs.gov", + build_vignettes = FALSE) +\end{verbatim} + +Once installed, a new \emph{shinymgr} project can be created within a parent directory: + +\begin{verbatim} +# set the directory path that will house the shinymgr project +parentPath <- getwd() + +# set up raw directories and fresh database +shinymgr_setup( + parentPath = parentPath, + demo = TRUE) +\end{verbatim} + +The \texttt{shinymgr\_setup()} function produces the following directory structure within the primary ``shinymgr'' directory. This structure consists of 3 files that make up the ``master'' app (global.R, server.R, and ui.R), and 9 directories. If the argument demo is set to FALSE, these directories will be largely empty, except for the ``modules\_mgr'' and ``database'' directories, which will contain \emph{shiny} modules for rendering \emph{shinymgr}'s UI and an empty SQLite database, respectively. If the argument demo is set to TRUE, each directory will include several demo files as shown, including a pre-populated database. Here, we highlight a subset of the demo files related to the ``iris\_explorer'' app to guide developers through the key elements of \emph{shinymgr} (additional demo files come with the package but are omitted here for clarity). 
+ +\begin{verbatim} +shinymgr ++-- analyses +| \-- iris_explorer_Gandalf_2023_06_05_16_30.RDS ++-- data +| \-- iris.RData ++-- database +| \-- shinymgr.sqlite ++-- global.R ++-- modules +| +-- iris_cluster.R +| +-- iris_intro.R +| +-- single_column_plot.R +| \-- subset_rows.R ++-- modules_app +| \-- iris_explorer.R ++-- modules_mgr +| +-- add_app.R +| +-- add_mod.R +| +-- add_report.R +| +-- add_tab.R +| +-- app_builder.R +| +-- my_db.R +| +-- new_analysis.R +| +-- new_report.R +| +-- queries.R +| +-- save_analysis.R +| +-- stitch_script.R +| \-- table.R ++-- reports +| \-- iris_explorer +| \-- iris_explorer_report.Rmd ++-- server.R ++-- tests +| +-- shinytest +| | +-- test-iris_explorer-expected +| | | +-- 001.json +| | | +-- 001.png +| | | +-- 002.json +| | | \-- 002.png +| | \-- test-iris_explorer.R +| +-- shinytest.R +| +-- testthat +| | +-- test-iris_cluster.R +| | \-- test-subset_rows.R +| \-- testthat.R ++-- ui.R +\-- www + +-- dark_mode.css + \-- shinymgr-hexsticker.png +\end{verbatim} + +The directory structure produced by \texttt{shinymgr\_setup()} includes the following: + +\begin{itemize} +\item + The \textbf{analyses} directory provides the developer an example of a previously run analysis that was created using the \emph{shinymgr} framework (an RDS file). An analysis file name includes the app name (e.g.~``iris\_explorer''), the name of the person who ran the analysis (e.g.~``Gandalf''), and the date and time of the analysis (e.g., ``iris\_explorer\_Gandalf\_2023\_06\_05\_16\_30.RDS''). +\item + The \textbf{data} directory stores RData files that can be used by various \emph{shinymgr} apps (e.g., ``iris.RData''). +\item + The \textbf{database} directory stores the \emph{shinymgr} SQLite database, named ``shinymgr.sqlite.'' The database is used by the developer to track all \emph{shiny} modules, their arguments (inputs), returns (outputs), and how they are combined into \emph{shinymgr} apps. 
+\item + The \textbf{modules} directory stores stand-alone \emph{shiny} modules. These files are largely written by the developer with the help of the \texttt{mod\_init()} function, and are registered in the database with the \texttt{mod\_register()} function. Four of the example \emph{shiny} modules listed are used in the ``iris\_explorer'' app. +\item + The \textbf{modules\_app} directory stores \emph{shiny} modules that are \emph{shinymgr} ``apps'' -- the stitching together of \emph{shiny} modules into a tab-based layout that provides an analysis workflow (Figure \ref{fig:fig2} shows the + ``iris\_explorer'' app layout). Files within the ``modules\_app'' directory are not written by hand - instead, they are created with the \emph{shinymgr} ``app builder.'' +\item + The \textbf{modules\_mgr} directory stores \emph{shiny} modules that build the overall \emph{shinymgr} framework. +\item + The \textbf{reports} directory provides an example of an \emph{RMarkdown} (Rmd) template (e.g., ``iris\_explorer\_report.Rmd''), allowing for rapid reporting by an end-user. +\item + The \textbf{tests} directory stores both \CRANpkg{testthat} \citep{testthat} and \CRANpkg{shinytest} \citep{shinytest} code testing scripts. +\item + The \textbf{www} directory stores images that may be used by a \emph{shiny} app. +\item + In addition to these directories, three files are created for launching the master \emph{shinymgr} \emph{shiny} application: + + \begin{enumerate} + \def\labelenumi{\arabic{enumi}.} + \tightlist + \item + \textbf{ui.R} - This file contains code to set the user interface for the master \emph{shinymgr} app.\\ + \item + \textbf{server.R} - The master server file.\\ + \item + \textbf{global.R} - The global.R file is sourced into the server.R file at start-up. It sources all of the \emph{shiny} modules within the \emph{shinymgr} framework so they are available when \emph{shinymgr} is launched. 
+ \end{enumerate} +\end{itemize} + +\subsection{\texorpdfstring{The \emph{shinymgr} developer's portal}{The shinymgr developer's portal}}\label{the-shinymgr-developers-portal} + +Once set-up is complete, the \texttt{launch\_shinymgr()} function will launch the \emph{shinymgr} ``Developer's Portal'' UI, allowing developers to create and test new \emph{shinymgr} apps. + +\begin{verbatim} +# launch shinymgr +launch_shinymgr(shinyMgrPath = paste0(parentPath, "/shinymgr")) +\end{verbatim} + +The portal is recognizable by the \emph{shinymgr} logo in the upper left corner (Figure \ref{fig:fig3}). The portal consists of three main tabs in the left menu. The ``Developer Tools'' tab is used to create apps, view the \emph{shinymgr} database, and register reports, while the ``Analysis (beta)'' and ``Reports (beta)'' tabs allow developers to evaluate apps from the user's perspective. + +\begin{figure} +\includegraphics[width=1\linewidth]{images/figure3} \caption{The shinymgr Developer Portal consists of a sidebar panel where developers can create new shiny modules and new apps, and test-drive analyses and reports from the user's perspective. The main panel shows the "Build App" tab within the "Developer Tools" section.}\label{fig:fig3} +\end{figure} + +The ``Developer Tools'' section includes 4 tabs for app development: The ``Build App'' tab allows the developer to create new \emph{shinymgr} apps from existing modules using the \emph{shinymgr} app builder; the ``Database'' tab displays the \emph{shinymgr} database tables, the ``Queries'' tab contains a set of standard database queries, and the ``Add Reports'' tab allows the developer to link a report (Rmd or qmd) to a given \emph{shinymgr} app (Figure \ref{fig:fig3}), as described below. + +\subsection{\texorpdfstring{The \emph{shinymgr} database}{The shinymgr database}}\label{the-shinymgr-database} + +The \emph{shinymgr} SQLite database (``shinymgr.sqlite'') is a single file created by the \texttt{shinymgr\_setup()} function. 
The database tracks all \emph{shiny} modules, their arguments (inputs), returns (outputs), their package dependencies and version numbers, how they are combined into an ``app,'' and any reports that are associated with apps. The database tables are populated via dedicated \emph{shinymgr} functions. + +The \emph{shinymgr} database consists of 11 tables in total (Figure \ref{fig:fig4}). These tables are connected to each other as a typical relational database, with primary keys establishing unique records in each table, and foreign keys that reference primary keys in other tables (see Appendix A for a full database schema and the ``database'' \emph{learnr} tutorial for additional information). + +The ``apps,'' ``appReports,'' ``reports,'' ``appTabs,'' and ``tabs'' tables largely store information on what a user would see when they run an analysis. The table ``apps'' stores information about apps such as ``iris\_explorer.'' Apps consist of tabs, which are listed in the ``tabs'' table. Tabs are linked to apps via the ``appTabs'' table. The table ``reports'' lists any Rmd or qmd files that serve as a report template, and the table ``appReports'' links a specific report with a specific app. + +\begin{figure}[b] +\includegraphics[width=1\linewidth]{images/figure4} \caption{The 11 tables of the shinymgr SQLite database. Lines indicate how the tables are related to each other.}\label{fig:fig4} +\end{figure} + +Four of the 11 database tables focus on modules, highlighting that \emph{shiny} modules are basic building blocks of any \emph{shinymgr} app. Developers create new \emph{shiny} modules with the \texttt{mod\_init()} function, which copies a \emph{shinymgr} module template (an R file template) that includes a header with key-value pairs that describe the module, including the module name, display name, description, citation, notes, and module arguments and returns (if any). For example, the header of the iris\_cluster module is: + +\begin{verbatim} +#!! 
ModName = iris_cluster +#!! ModDisplayName = Iris K-Means Clustering +#!! ModDescription = Clusters iris data based on 2 attributes +#!! ModCitation = Baggins, Bilbo. (2023). iris_cluster. [Source code]. +#!! ModNotes = Demo module for the shinymgr package. +#!! ModActive = 1 +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame +\end{verbatim} + +The module code is written beneath the header (see Appendix B for an example). Function calls within the module code should be written with \texttt{package::function()} notation, making explicit any R package dependencies. Once the module is completed, unit tests can be written and stored in the \emph{shinymgr} project's ``tests'' directory. The final module file is saved to the ``modules'' directory and registered into the database with the \texttt{mod\_register()} function. The \texttt{mod\_register()} function populates the modules, ``modFunctionArguments'', and ``modFunctionReturns'' SQLite database tables. Further, it uses the \emph{renv} package to identify any package dependencies and inserts them into the modPackages table. Readers are referred to the ``modules'', ``tests'', and ``shinymgr\_modules'' \emph{learnr} tutorials that come with the \emph{shinymgr} package for more details. + +Once modules are registered in the database, the developer can incorporate them into new apps. As \emph{shiny} modules and apps in the database represent files that contain their scripts, deleting a module or an app from the database will delete all downstream database entries as well as (optionally) the actual files themselves. Deletion of a module will fail if it is being used in other apps. Module updates can be versioned by creating a new module and then referencing its precursor in the ``modules'' database table. 
+ +\subsection{\texorpdfstring{The \emph{shinymgr} app builder}{The shinymgr app builder}}\label{the-shinymgr-app-builder} + +Once developers create and register their own stand-alone \emph{shiny} modules, apps are generated with \emph{shinymgr}'s app builder (Figure \ref{fig:fig5}). + +\begin{figure}[h] +\includegraphics[width=1\linewidth]{images/figure5} \caption{The shinymgr Developer Portal layout, showing the app builder in the Developer Tools.}\label{fig:fig5} +\end{figure} + +Developers are guided through a process where they design their app from \emph{shiny} modules they have registered. The builder then populates the \emph{shinymgr} database with instructions on how to construct the app and writes the app's script based on those instructions. The newly created script is saved to the ``modules\_app'' directory. Through this structured process, apps produced by the builder are well-documented and generate highly reproducible analyses. Readers are encouraged to peruse the tutorial, ``apps'', for more information. + +The \texttt{qry\_app\_flow()} function will query the database to return a list of the \emph{shiny} modules and tabs included in a specified app, such as ``iris\_explorer'': + +\begin{verbatim} +# look at the appTabs table in the database +qry_app_flow("iris_explorer", shinyMgrPath = paste0(getwd(),"/shinymgr")) +\end{verbatim} + +\begin{verbatim} + fkAppName fkTabName tabOrder fkModuleName modOrder +1 iris_explorer IE_intro 1 iris_intro 1 +2 iris_explorer IE_iris_data 2 iris_cluster 1 +3 iris_explorer IE_subset_rows 3 subset_rows 1 +4 iris_explorer IE_plot_data 4 single_column_plot 1 +\end{verbatim} + +As shown in Figure \ref{fig:fig2}, this app has 5 tabs, and each tab features a single module. The ``Save'' tab is the final tab in all \emph{shinymgr} apps and is not listed in the query result. + +Developers can ``beta test'' apps prior to deployment by selecting the Analysis (beta) tab in the Developer's Portal (Figure \ref{fig:fig3}). 
They can also create \emph{RMarkdown} or \emph{Quarto} report templates that accept the outputs from an analysis and incorporate them into a report. Report metadata are logged in the ``reports'' table of the database, and then linked with a specific app in the ``appReports'' table. An end-user will run an analysis and render a report, a process described more fully in the ``Using \emph{shinymgr} Apps'' section below. + +To summarize this section, developers use the \texttt{shinymgr\_setup()} function to create the directory structure and underlying database needed to build and run \emph{shiny} apps with \emph{shinymgr}. Developers use the \texttt{mod\_init()} and \texttt{mod\_register()} functions to create modules and make them available for inclusion in new apps built with the \emph{shinymgr} app builder. A developer can create as many \emph{shinymgr} projects as needed. In each case, the \emph{shinymgr} project is simply a fixed directory structure with three R files (ui.R, server.R, and global.R), and a series of subdirectories that contain the apps and \emph{shiny} modules created by the developer, along with a database for tracking everything. + +\section{\texorpdfstring{Deploying \emph{shinymgr} projects}{Deploying shinymgr projects}}\label{appdeploy} + +Once development is completed, developers can deploy their \emph{shinymgr} project on a server or within an R package by copying portions of the \emph{shinymgr} project to a new location while retaining the original project for future development. Once deployed, a \emph{shinymgr} project no longer requires the \emph{shinymgr} package or database to be run. Thus, the files and directories to be copied for deployment include only: + +\begin{verbatim} +shinymgr ++-- data ++-- global.R ++-- modules ++-- modules_app ++-- modules_mgr ++-- reports ++-- server.R ++-- ui.R +\-- www +\end{verbatim} + +The master app files, ui.R, global.R, and server.R, are needed to run the \emph{shinymgr} framework. 
+ +When deploying a \emph{shinymgr} project within an R package, objects within the data folder should be copied into the package's ``data'' folder. The remaining files should be copied into a directory within the package's ``inst'' folder that will house the master \emph{shiny} application. Deployment on a server such as shinyapps.io will require similar adjustments. + +After files are copied to the correct location, a few key adjustments are needed. First, the ``modules\_app'' directory should contain only those apps (and dependent modules and reports) that can be used by end-users; unused apps, modules, and reports can be deleted. Second, the new.analysis.R script within the modules\_mgr folder will require minor updates to remove dependencies on the \emph{shinymgr} database. Third, the ui.R and server.R scripts should be updated to no longer showcase \emph{shinymgr} and the Developer's Portal; rather, it should be customized by the developer to create their own purpose-driven apps. For example, Figure \ref{fig:fig6} shows a hypothetical deployment of the master app titled ``Deployed Project'' that is based on the \emph{shinymgr} framework. Notice the absence of the Developer Tools tab and the absence of references to \emph{shinymgr}. The ``deployment'' \emph{learnr} tutorial provides more in-depth discussion. + +\begin{figure} +\includegraphics[width=1\linewidth]{images/figure6} \caption{An example of a deployed shinymgr app. The deployed version excludes the Developers Tools tab and is an example of what the end user sees when using a deployed app.}\label{fig:fig6} +\end{figure} + +To summarize this section, deploying the \emph{shinymgr} framework involves copying key elements of the \emph{shinymgr} developer project into package or server directories, updated as needed for use by end-users. Readers are referred to the ``deployment'' tutorial for further information. 
+ +\section{\texorpdfstring{Using \emph{shinymgr} apps}{Using shinymgr apps}}\label{appUsing} + +Apps built with \emph{shinymgr} can appeal to various types of end-users. When deployed as part of an R package, end-users would be anyone who uses that package. Apps may also be distributed as stand-alone scripts, or hosted on a server, as described above. Developers may also use \emph{shinymgr} to produce apps for their own use (i.e., the developer \emph{is} the end-user). Regardless of who the intended end-user is, this section discusses that user's experience after the master app is deployed. + +Whatever the intended audience for the app, this section discusses how an app can be used \emph{after} it has been deployed. + +\subsection{Reproducible analyses}\label{reproducible-analyses} + +The final tab in any \emph{shinymgr} app provides the opportunity to save the analysis itself. Reproducibility is a core tenet of \emph{shinymgr.} Therefore, a robust set of metadata are saved as an RDS file to allow a user to understand and replicate their results. An example of a completed analysis is the file, ``iris\_explorer\_Gandalf\_2023\_06\_05\_16\_30.RDS,'' which stores a user's analytic steps for a run of the ``iris explorer'' app. 
The code below reads in this example file, and shows the structure (a list with 23 elements): + +\begin{verbatim} +rds_filepath <- paste0(getwd(),"/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS") +old_analysis <- readRDS(rds_filepath) +str(old_analysis, max.level = 2, nchar.max = 20, vec.len = 15) +\end{verbatim} + +\begin{verbatim} +List of 23 + $ analysisName : chr "iri"| __truncated__ + $ app : chr "iris_explorer" + $ username : chr "Gandalf" + $ mod2-clusters : int 3 + $ mod2-xcol : chr "Sepal.Length" + $ mod2-ycol : chr "Petal.Length" + $ mod3-full_table__reactable__pageSize : int 10 + $ mod3-resample : 'shinyActionButtonValue' int 1 + $ mod3-full_table__reactable__pages : int 15 + $ mod3-subset_table__reactable__page : int 1 + $ mod3-full_table__reactable__page : int 1 + $ mod3-sample_num : int 20 + $ mod3-subset_table__reactable__pages : int 2 + $ mod3-subset_table__reactable__pageSize: int 10 + $ returns :List of 3 + ..$ data1:List of 1 + ..$ data2:List of 1 + ..$ data3:List of 2 + $ notes : chr "Thi"| __truncated__ + $ timestamp : POSIXct[1:1], format: "202"| __truncated__ + $ metadata :List of 6 + ..$ appDescription: chr "Clu"| __truncated__ + ..$ mod1 :List of 7 + ..$ mod2 :List of 7 + ..$ mod3 :List of 7 + ..$ mod4 :List of 7 + ..$ lockfile :List of 2 + $ app_code : chr "# T"| __truncated__ + $ iris_intro_code : chr "#!!"| __truncated__ + $ iris_cluster_code : chr "#!!"| __truncated__ + $ subset_rows_code : chr "#!!"| __truncated__ + $ single_column_plot_code : chr "#!!"| __truncated__ +\end{verbatim} + +The list stores a great deal of information: + +\begin{itemize} +\tightlist +\item + \textbf{analysisName} is the name of the analysis and is equivalent to the filename of the RDS file (without the extension) +\item + \textbf{app} is the name of the app that produced the saved analysis results. +\item + \textbf{username} was entered in the ``Save'' tab when the analysis was performed. 
+\item + \textbf{mod\#-value} indicate the values of each \emph{shiny} module's arguments (inputs), if any exist, at the time the analysis was saved. +\item + \textbf{returns} includes values of all outputs (returns) of each module. +\item + \textbf{notes} were entered in the ``Save'' tab when the analysis was performed. +\item + \textbf{timestamp} is the date/time when the analysis was saved. +\item + \textbf{metadata} includes robust information about each module, including the app description and the description of each module as it was originally stored in the \emph{shinymgr} database tables. The metadata list element also includes an \emph{renv} ``lockfile'': a list that describes the R version and R package dependencies (including \emph{shinymgr}) used by the app itself. The lockfile captures the state of the app's package dependencies at the time of its creation; in the case of \emph{shinymgr}, it contains the dependencies used by the developer who created the app. Each lockfile record includes the name and version of the package and their installation source. +\item + \textbf{*\_code} attributes with this format contain the source code for the app. +\end{itemize} + +The code list element allows an end user to revisit the full analysis with \emph{shinymgr}'s \texttt{rerun\_analysis()} function, supplying the file path to a saved \emph{shinymgr} analysis (RDS file). + +\begin{verbatim} +rerun_analysis(analysis_path = rds_filepath) +\end{verbatim} + +The \texttt{rerun\_analysis()} function will launch a \emph{shiny} app with two tabs (Figure \ref{fig:fig7}); it can only be run during an interactive R session, with no other \emph{shiny} apps running. + +\begin{figure} +\includegraphics[width=1\linewidth]{images/figure7} \caption{A screenshot of the rerun\_analysis() function, as called on the saved analysis from the iris\_explorer app (RDS file). The active tab, called "The App", allows a user to rerun a previously executed analysis. 
The "Analysis Summary" tab displays the values of all module arguments and returns, captured when the analysis was saved, along with a detailed description of the app, its modules, the app's source code, and all package dependencies.}\label{fig:fig7} +\end{figure} + +The first tab is called ``The App'', and will be visible when the \texttt{rerun\_analysis()} function is called. It contains a header with the app's name, a subheading of ``Analysis Rerun,'' and a fully functioning, identical copy of the \emph{shiny} app used to generate the saved analysis. Below that, a disclaimer appears, indicating the app was produced from a saved analysis. A summary of the analysis is presented on the second tab that displays the values used to produce the given analysis output. + +If the \texttt{rerun\_analysis()} function fails, it could be due to a change in R and package versions currently installed on the end-user's machine. To that end, the lockfile that is included in the metadata section of the RDS file can be used to restore the necessary R packages and R version with the \texttt{restore\_analysis()} function. This function will attempt to create a self-contained \emph{renv} R project that includes all of the packages and the R version used by the developer when the app was created. The analysis RDS is added to this new project, where the \texttt{rerun\_analysis()} function can be attempted again. Readers are referred to the ``analyses'' tutorial for further information. + +\subsection{Rapid reporting}\label{rapid-reporting} + +Another important feature of \emph{shinymgr} is the ability to share results of an analysis with others in a friendly, readable format with \emph{RMarkdown} or \emph{Quarto}. Apps produce an RDS file, which may be passed into an Rmd or qmd file as a parameterized input. 
For example, the demo database includes a report template called ``iris\_explorer\_report.Rmd.'' This file, with code shown below, allows users to navigate to the RDS file produced by the ``iris explorer'' app and render the rapid report. + +\begin{verbatim} +--- +title: 'Annual Report for Iris Explorer' +output: html_document +params: + user: + label: "User" + value: "Bilbo" + placeholder: "Enter user name" + year: + label: "Year" + value: 2017 + input: slider + min: 2010 + max: 2018 + step: 1 + sep: "" + file: + input: file + label: "Choose RDS" + value: "" + multiple: FALSE + buttonLabel: "Browse to analysis output..." +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(knitr) +ps <- readRDS(params$file) +``` + +This report summarizes an analysis of iris data by +`r params$user` conducted in `r params$year`. Iris +data was clustered into `r ps$'mod2-clusters'` groups +based on `r ps$'mod2-xcol'` and `r ps$'mod2-ycol'`. +A random sample of `r ps$'mod3-sample_num'` records +were collected, with sample sizes shown in the pie +chart below: + +```{r} +pie_data <- table(ps$returns$data2$subset_data$cluster) +pie( + x = pie_data, + labels = as.character(pie_data), + col = rainbow(length(pie_data)), + main = "Number of random samples by cluster" +) +legend( + x = "topright", + legend = names(pie_data), + fill = rainbow(length(pie_data)) +) + +``` + +Some things to note about this analysis are: `r ps$notes` + +Respectfully submitted, + +Gandalf +\end{verbatim} + +Reports may be run within the deployed version of \emph{shinymgr} (e.g., left menu of Figure \ref{fig:fig6}), or may be run directly in R by opening the Rmd file and navigating to the RDS as a file input. Users who run a report can download it to their local machine as a HTML, PDF, or Word file, where they can further customize the output. 
+ +To summarize this section, users of \emph{shinymgr} ``apps'' created with the \emph{shinymgr} framework are presented with a series of \emph{shiny} tabs that establish an analysis workflow. Users can save their inputs and outputs as an RDS file to ensure full reproducibility. Further, the RDS file may be loaded into an R Markdown (Rmd) or Quarto (qmd) template for rapid reporting. + +\section{Tutorials and cheatsheet}\label{tuts} + +A series of \emph{learnr} tutorials are included with the package. Below is a list of current tutorials, intended to be worked through in order: + +\begin{verbatim} +Available tutorials: +* shinymgr + - intro : "shinymgr-01: Introduction" + - shiny : "shinymgr-02: Shiny" + - modules : "shinymgr-03: Modules" + - app_modules : "shinymgr-04: App modules" + - tests : "shinymgr-05: Tests" + - shinymgr : "shinymgr-06: shinymgr" + - database : "shinymgr-07: Database" + - shinymgr_modules : "shinymgr-08: shinymgr_modules " + - apps : "shinymgr-09: Apps" + - analyses : "shinymgr-10: Analyses" + - reports : "shinymgr-11: Reports" + - deployment : "shinymgr-12: Deployment" +\end{verbatim} + +The ``intro'' tutorial gives a general overview. Tutorials 2 -- 5 are aimed at developers who are new to \emph{shiny}, while tutorials 6 -- 12 focus on the \emph{shinymgr} package. + +Launch a tutorial with the \emph{learnr} \texttt{run\_tutorial()} function, providing the name of the module to launch. The tutorial should launch in a browser, which has the benefit of being able to print the tutorial to PDF upon completion: + +\begin{verbatim} +learnr::run_tutorial( + name = "modules", + package = "shinymgr") +\end{verbatim} + +Additionally, the package cheatsheet can be found with: + +\begin{verbatim} +browseURL(paste0(find.package("shinymgr"), "/extdata/shinymgr_cheatsheet.pdf")) +\end{verbatim} + +Contributions are welcome from the community. Questions can be asked on the +issues page at \url{https://code.usgs.gov/vtcfwru/shinymgr/issues}. 
+ +\section{Acknowledgments}\label{acknowledgments} + +We thank Cathleen Balantic and Jim Hines for feedback on the overall package and package tutorials. \emph{shinymgr} was prototyped by Therese Donovan at a \emph{shiny} workshop taught by Chris Dorich and Matthew Ross at Colorado State University in 2020 (pre-pandemic). We thank the instructors for feedback and initial coding assistance. Any use of trade, firm, or product names is for descriptive purposes only and does not imply endorsement by the U.S. Government. The Vermont Cooperative Fish and Wildlife Research Unit is jointly supported by the U.S. Geological Survey, University of Vermont, Vermont Fish and Wildlife Department, and Wildlife Management Institute. + +\section{Bibliography}\label{bibliography} + +\newpage + +\section{Appendix A}\label{appendix-a} + +Entity relationship diagram for the \emph{shinymgr} database, which tracks all components of apps and modules (Figure \ref{fig:fig8}). The database consists of 11 tables. Primary keys are referenced with a ``pk'' prefix, while foreign keys are referenced with an ``fk'' prefix. A full description of the database is contained in the ``database'' \emph{learnr} tutorial that comes with the \emph{shinymgr} package. + +\begin{figure}[h] +\includegraphics[width=0.85\linewidth]{images/figure8} \caption{Entity relationship diagram for the shinymgr database, which tracks all components of apps and modules. The database consists of 11 tables. Primary keys are referenced with a "pk" prefix, while foreign keys are referenced with an "fk" prefix. A full description of the database is contained in the "database" learnr tutorial that comes with the shinymgr package.}\label{fig:fig8} +\end{figure} + +\newpage + +\section{Appendix B}\label{appendix-b} + +Modules in \emph{shinymgr} are written by developers for their own purposes. The \texttt{shinymgr::mod\_init()} function creates a template for module development. 
The header is a series of key-value pairs that the developer fills out (typically after the module code is written and tested). The ``iris\_cluster'' module is presented below as an example. The module consists of two paired functions: here, \texttt{iris\_cluster\_ui(id)} and \texttt{iris\_cluster\_server()}. The UI is a function with an argument called id, which is turned into the module's ``namespace'' with the \texttt{NS()} function. A namespace is simply the module's identifier and ensures that function and object names within a given module do not conflict with function and object names in other modules. The IDs for each input and output in the UI must be wrapped in a \texttt{ns()} function call to make explicit that these inputs are assigned to the module's namespace. All UI elements are wrapped in a \texttt{tagList()} function, where a \texttt{tagList} allows one to combine multiple UI elements into a single R object. Readers should consult the ``modules,'' ``tests,'' and ``shinymgr\_modules'' tutorials for additional information. + +\begin{verbatim} +#!! ModName = iris_cluster +#!! ModDisplayName = Iris K-Means Clustering +#!! ModDescription = Clusters iris data based on 2 attributes +#!! ModCitation = Baggins, Bilbo. (2022). iris_cluster. [Source code]. +#!! ModNotes = +#!! ModActive = 1 +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! 
data.frame + +iris_cluster_ui <- function(id){ + # create the module's namespace + ns <- NS(id) + + tagList( + sidebarLayout( + sidebarPanel( + # add the dropdown for the X variable + selectInput( + ns("xcol"), + label = "X Variable", + choices = c( + "Sepal.Length", + "Sepal.Width", + "Petal.Length", + "Petal.Width" + ), + selected = "Sepal.Length" + ), + + # add the dropdown for the Y variable + selectInput( + ns("ycol"), + label = "Y Variable", + choices = c( + "Sepal.Length", + "Sepal.Width", + "Petal.Length", + "Petal.Width" + ), + selected = "Sepal.Width" + ), + # add input box for the cluster number + + numericInput( + ns("clusters"), + label = "Cluster count", + value = 3, + min = 1, + max = 9 + ) + ), # end of sidebarPanel + + mainPanel( + # create outputs + plotOutput( + ns("plot1") + ) + ) # end of mainPanel + ) # end of sidebarLayout + ) # end of tagList +} # end of UI function + +iris_cluster_server <- function(id) { + + moduleServer(id, function(input, output, session) { + + # combine variables into new data frame + selectedData <- reactive({ + iris[, c(input$xcol, input$ycol)] + }) + + # run kmeans algorithm + clusters <- reactive({ + kmeans( + x = selectedData(), + centers = input$clusters + ) + }) + + output$plot1 <- renderPlot({ + par(mar = c(5.1, 4.1, 0, 1)) + plot( + selectedData(), + col = clusters()$cluster, + pch = 20, + cex = 3 + ) + }) + + return( + reactiveValues( + returndf = reactive({ + cbind( + selectedData(), + cluster = clusters()$cluster + ) + }) + ) + ) + + }) # end of moduleServer function + +} # end of irisCluster function +\end{verbatim} + +\newpage + +\bibliography{shinymgrFinal.bib} + +\address{% +Laurence A. 
Clarfeld\\ +Vermont Cooperative Fish and Wildlife Research Unit\\% +302 Aiken Center, University of Vermont\\ Burlington, VT 05405 USA\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0002-3927-9411}{0000-0002-3927-9411}}\\% +\href{mailto:laurence.clarfeld@uvm.edu}{\nolinkurl{laurence.clarfeld@uvm.edu}}% +} + +\address{% +Caroline Tang\\ +Queen's University\\% +Biology Department\\ 116 Barrie St, Kingston, ON K7L 3N6\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0001-7966-5854}{0000-0001-7966-5854}}\\% +\href{mailto:17ct24@queensu.ca}{\nolinkurl{17ct24@queensu.ca}}% +} + +\address{% +Therese Donovan\\ +U.S. Geological Survey, Vermont Cooperative Fish and Wildlife Research Unit\\% +302 Aiken Center, University of Vermont\\ Burlington, VT 05405 USA\\ +% +% +\textit{ORCiD: \href{https://orcid.org/0000-0001-8124-9251}{0000-0001-8124-9251}}\\% +\href{mailto:tdonovan@uvm.edu}{\nolinkurl{tdonovan@uvm.edu}}% +} diff --git a/_articles/RJ-2024-009/RJournal.sty b/_articles/RJ-2024-009/RJournal.sty new file mode 100644 index 0000000000..c39644cd3f --- /dev/null +++ b/_articles/RJ-2024-009/RJournal.sty @@ -0,0 +1,344 @@ +% Package `RJournal' to use with LaTeX2e +% Copyright (C) 2010 by the R Foundation +% Copyright (C) 2013 by the R Journal +% +% Originally written by Kurt Hornik and Friedrich Leisch with subsequent +% edits by the editorial board +% +% CAUTION: +% Do not modify this style file. Any changes to this file will be reset when your +% article is submitted. +% If you must modify the style or add LaTeX packages to the article, these +% should be specified in RJwrapper.tex + +\NeedsTeXFormat{LaTeX2e}[1995/12/01] +\ProvidesPackage{RJournal}[2022/06/27 v0.14 RJournal package] + +\RequirePackage{tikz} + +% Overall page layout, fonts etc ----------------------------------------------- + +% Issues of of \emph{The R Journal} are created from the standard \LaTeX{} +% document class \pkg{report}. 
+ +\RequirePackage{geometry} +\geometry{a4paper, + textwidth=14cm, top=1cm, bottom=1cm, + includehead,includefoot,centering, + footskip=1.5cm} +\raggedbottom + +\RequirePackage{fancyhdr} +\fancyhead{} +\fancyheadoffset{2cm} +\fancyhead[L]{\textsc{\RJ@sectionhead}} +\fancyhead[R]{\thepage} +\fancyfoot{} +\fancyfoot[L]{The R Journal Vol. \RJ@volume/\RJ@number, \RJ@month~\RJ@year} +\fancyfoot[R]{ISSN 2073-4859} +\pagestyle{fancy} + +% We use the following fonts (all with T1 encoding): +% +% rm & palatino +% tt & inconsolata +% sf & helvetica +% math & palatino + +\RequirePackage{microtype} + +\RequirePackage[scaled=0.92]{helvet} +\RequirePackage{palatino,mathpazo} +\RequirePackage[scaled=1.02]{inconsolata} +\RequirePackage[T1]{fontenc} + +\RequirePackage[hyphens]{url} +\RequirePackage[pagebackref]{hyperref} +\renewcommand{\backref}[1]{[p#1]} + +% Dark blue colour for all links +\RequirePackage{color} +\definecolor{link}{rgb}{0.45,0.51,0.67} +\hypersetup{ + colorlinks,% + citecolor=link,% + filecolor=link,% + linkcolor=link,% + urlcolor=link +} + +% Give the text a little room to breath +\setlength{\parskip}{3pt} +\RequirePackage{setspace} +\setstretch{1.05} + +% Issue and article metadata --------------------------------------------------- + +% Basic front matter information about the issue: volume, number, and +% date. + +\newcommand{\volume}[1]{\def\RJ@volume{#1}} +\newcommand{\volnumber}[1]{\def\RJ@number{#1}} +\renewcommand{\month}[1]{\def\RJ@month{#1}} +\renewcommand{\year}[1]{\def\RJ@year{#1}} + + +% Individual articles correspond to +% chapters, and are contained in |article| environments. This makes it +% easy to have figures counted within articles and hence hyperlinked +% correctly. + +% An article has an author, a title, and optionally a subtitle. We use +% the obvious commands for specifying these. Articles will be put in certain +% journal sections, named by \sectionhead. 
+ +\newcommand {\sectionhead} [1]{\def\RJ@sectionhead{#1}} +\renewcommand{\author} [1]{\def\RJ@author{#1}} +\renewcommand{\title} [1]{\def\RJ@title{#1}} +\newcommand {\subtitle} [1]{\def\RJ@subtitle{#1}} + +% Control appearance of titles: make slightly smaller than usual, and +% suppress section numbering. See http://tex.stackexchange.com/questions/69749 +% for why we don't use \setcounter{secnumdepth}{-1} + +\usepackage[medium]{titlesec} +\usepackage{titletoc} +\titleformat{\section} {\normalfont\large\bfseries}{\arabic{section}}{1em}{} +\titleformat{\subsection}{\normalfont\normalsize\bfseries}{\arabic{section}.\arabic{subsection}}{0.5em}{} +\titlecontents{chapter} [0em]{}{}{}{\titlerule*[1em]{.}\contentspage} + +% Article layout --------------------------------------------------------------- + +% Environment |article| clears the article header information at its beginning. +% We use |\FloatBarrier| from the placeins package to keep floats within +% the article. +\RequirePackage{placeins} +\newenvironment{article}{\author{}\title{}\subtitle{}\FloatBarrier}{\FloatBarrier} + +% Refereed articles should have an abstract, so we redefine |\abstract| to +% give the desired style + +\renewcommand{\abstract}[1]{% +\setstretch{1}% +\noindent% +\small% +\textbf{Abstract} #1 +} + +% The real work is done by a redefined version of |\maketitle|. Note +% that even though we do not want chapters (articles) numbered, we +% need to increment the chapter counter, so that figures get correct +% labelling. + +\renewcommand{\maketitle}{% +\noindent + \chapter{\RJ@title}\refstepcounter{chapter} + \ifx\empty\RJ@subtitle + \else + \noindent\textbf{\RJ@subtitle} + \par\nobreak\addvspace{\baselineskip} + \fi + \ifx\empty\RJ@author + \else + \noindent\textit{\RJ@author} + \par\nobreak\addvspace{\baselineskip} + \fi + \@afterindentfalse\@nobreaktrue\@afterheading +} + +% Now for some ugly redefinitions. We do not want articles to start a +% new page. 
(Actually, we do, but this is handled via explicit +% \newpage +% +% The name@of@eq is a hack to get hyperlinks to equations to work +% within each article, even though there may be multiple eq.(1) +% \begin{macrocode} +\renewcommand\chapter{\secdef\RJ@chapter\@schapter} +\providecommand{\nohyphens}{% + \hyphenpenalty=10000\exhyphenpenalty=10000\relax} +\newcommand{\RJ@chapter}{% + \edef\name@of@eq{equation.\@arabic{\c@chapter}}% + \renewcommand{\@seccntformat}[1]{}% + \@startsection{chapter}{0}{0mm}{% + -2\baselineskip \@plus -\baselineskip \@minus -.2ex}{\p@}{% + \phantomsection\normalfont\huge\bfseries\raggedright}} + +% Book reviews should appear as sections in the text and in the pdf bookmarks, +% however we wish them to appear as chapters in the TOC. Thus we define an +% alternative to |\maketitle| for reviews. +\newcommand{\review}[1]{ + \pdfbookmark[1]{#1}{#1} + \section*{#1} + \addtocontents{toc}{\protect\contentsline{chapter}{#1}{\thepage}{#1.1}} +} + +% We want bibliographies as starred sections within articles. +% +\RequirePackage[sectionbib,round]{natbib} +\bibliographystyle{abbrvnat} +\renewcommand{\bibsection}{\section*{References}} + +% Equations, figures and tables are counted within articles, but we do +% not show the article number. For equations it becomes a bit messy to avoid +% having hyperref getting it wrong. + +% \numberwithin{equation}{chapter} +\renewcommand{\theequation}{\@arabic\c@equation} +\renewcommand{\thefigure}{\@arabic\c@figure} +\renewcommand{\thetable}{\@arabic\c@table} + +% Issue layout ----------------------------------------------------------------- + +% Need to provide our own version of |\tableofcontents|. We use the +% tikz package to get the rounded rectangle. Notice that |\section*| +% is really the same as |\chapter*|. 
+\renewcommand{\contentsname}{Contents} +\renewcommand\tableofcontents{% + \vspace{1cm} + \section*{\contentsname} + { \@starttoc{toc} } +} + +\renewcommand{\titlepage}{% + \thispagestyle{empty} + \hypersetup{ + pdftitle={The R Journal Volume \RJ@volume/\RJ@number, \RJ@month \RJ@year},% + pdfauthor={R Foundation for Statistical Computing},% + } + \noindent + \begin{center} + \fontsize{50pt}{50pt}\selectfont + The \raisebox{-8pt}{\includegraphics[height=77pt]{Rlogo-5}}\hspace{10pt} + Journal + + \end{center} + {\large \hfill Volume \RJ@volume/\RJ@number, \RJ@month{} \RJ@year \quad} + + \rule{\textwidth}{1pt} + \begin{center} + {\Large A peer-reviewed, open-access publication of the \\ + R Foundation for Statistical Computing} + \end{center} + + % And finally, put in the TOC box. Note the way |tocdepth| is adjusted + % before and after producing the TOC: thus, we can ensure that only + % articles show up in the printed TOC, but that in the PDF version, + % bookmarks are created for sections and subsections as well (provided + % that the non-starred forms are used). + \setcounter{tocdepth}{0} + \tableofcontents + \setcounter{tocdepth}{2} + \clearpage +} + +% Text formatting -------------------------------------------------------------- + +\newcommand{\R}{R} +\newcommand{\address}[1]{\addvspace{\baselineskip}\noindent\emph{#1}} +\newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} + +% Simple font selection is not good enough. For example, |\texttt{--}| +% gives `\texttt{--}', i.e., an endash in typewriter font. Hence, we +% need to turn off ligatures, which currently only happens for commands +% |\code| and |\samp| and the ones derived from them. Hyphenation is +% another issue; it should really be turned off inside |\samp|. And +% most importantly, \LaTeX{} special characters are a nightmare. E.g., +% one needs |\~{}| to produce a tilde in a file name marked by |\file|. 
+% Perhaps a few years ago, most users would have agreed that this may be +% unfortunate but should not be changed to ensure consistency. But with +% the advent of the WWW and the need for getting `|~|' and `|#|' into +% URLs, commands which only treat the escape and grouping characters +% specially have gained acceptance + +\DeclareRobustCommand\code{\bgroup\@noligs\@codex} +\def\@codex#1{\texorpdfstring% +{{\normalfont\ttfamily\hyphenchar\font=-1 #1}}% +{#1}\egroup} +\newcommand{\kbd}[1]{{\normalfont\texttt{#1}}} +\newcommand{\key}[1]{{\normalfont\texttt{\uppercase{#1}}}} +\DeclareRobustCommand\samp{`\bgroup\@noligs\@sampx} +\def\@sampx#1{{\normalfont\texttt{#1}}\egroup'} +\newcommand{\var}[1]{{\normalfont\textsl{#1}}} +\let\env=\code +\newcommand{\file}[1]{{`\normalfont\textsf{#1}'}} +\let\command=\code +\let\option=\samp +\newcommand{\dfn}[1]{{\normalfont\textsl{#1}}} +% \acronym is effectively disabled since not used consistently +\newcommand{\acronym}[1]{#1} +\newcommand{\strong}[1]{\texorpdfstring% +{{\normalfont\fontseries{b}\selectfont #1}}% +{#1}} +\let\pkg=\strong +\newcommand{\CRANpkg}[1]{\href{https://CRAN.R-project.org/package=#1}{\pkg{#1}}}% +\let\cpkg=\CRANpkg +\newcommand{\ctv}[1]{\href{https://CRAN.R-project.org/view=#1}{\emph{#1}}} +\newcommand{\BIOpkg}[1]{\href{https://www.bioconductor.org/packages/release/bioc/html/#1.html}{\pkg{#1}}} + +% Example environments --------------------------------------------------------- +\RequirePackage{fancyvrb} +\RequirePackage{alltt} + +\DefineVerbatimEnvironment{example}{Verbatim}{} +\renewenvironment{example*}{\begin{alltt}}{\end{alltt}} + +% Support for output from Sweave, and generic session style code +% These used to have fontshape=sl for Sinput/Scode/Sin, but pslatex +% won't use a condensed font in that case. 
+ +% Update (2015-05-28 by DS): remove fontsize=\small to match example environment + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{} +\DefineVerbatimEnvironment{Scode}{Verbatim}{} +\DefineVerbatimEnvironment{Sin}{Verbatim}{} +\DefineVerbatimEnvironment{Sout}{Verbatim}{} +\newenvironment{Schunk}{}{} + +% Mathematics ------------------------------------------------------------------ + +% The implementation of |\operatorname| is similar to the mechanism +% \LaTeXe{} uses for functions like sin and cos, and simpler than the +% one of \AmSLaTeX{}. We use |\providecommand| for the definition in +% order to keep the one of the \pkg{amstex} if this package has +% already been loaded. +% \begin{macrocode} +\providecommand{\operatorname}[1]{% + \mathop{\operator@font#1}\nolimits} +\RequirePackage{amsfonts} + +\renewcommand{\P}{% + \mathop{\operator@font I\hspace{-1.5pt}P\hspace{.13pt}}} +\newcommand{\E}{% + \mathop{\operator@font I\hspace{-1.5pt}E\hspace{.13pt}}} +\newcommand{\VAR}{\operatorname{var}} +\newcommand{\COV}{\operatorname{cov}} +\newcommand{\COR}{\operatorname{cor}} + +% Figures ---------------------------------------------------------------------- + +\RequirePackage[font=small,labelfont=bf]{caption} + +% Wide environments for figures and tables ------------------------------------- +\RequirePackage{environ} + +% An easy way to make a figure span the full width of the page +\NewEnviron{widefigure}[1][]{ +\begin{figure}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{figure} +} + +\NewEnviron{widetable}[1][]{ +\begin{table}[#1] +\advance\leftskip-2cm +\begin{minipage}{\dimexpr\textwidth+4cm\relax}% + \captionsetup{margin=2cm} + \BODY +\end{minipage}% +\end{table} +} diff --git a/_articles/RJ-2024-009/RJwrapper.tex b/_articles/RJ-2024-009/RJwrapper.tex new file mode 100644 index 0000000000..d129ec41b0 --- /dev/null +++ 
b/_articles/RJ-2024-009/RJwrapper.tex @@ -0,0 +1,70 @@ +\documentclass[a4paper]{report} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{RJournal} +\usepackage{amsmath,amssymb,array} +\usepackage{booktabs} + + +% tightlist command for lists without linebreak +\providecommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} + +\usepackage{longtable} + +% Always define CSL refs as bib entries are contained in separate doc +% Pandoc citation processing +%From Pandoc 3.1.8 +% definitions for citeproc citations +\NewDocumentCommand\citeproctext{}{} +\NewDocumentCommand\citeproc{mm}{% + \begingroup\def\citeproctext{#2}\cite{#1}\endgroup} +\makeatletter + % allow citations to break across lines + \let\@cite@ofmt\@firstofone + % avoid brackets around text for \cite: + \def\@biblabel#1{} + \def\@cite#1#2{{#1\if@tempswa , #2\fi}} +\makeatother +\newlength{\cslhangindent} +\setlength{\cslhangindent}{1.5em} +\newlength{\csllabelwidth} +\setlength{\csllabelwidth}{3em} +\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing + {\begin{list}{}{% + \setlength{\itemindent}{0pt} + \setlength{\leftmargin}{0pt} + \setlength{\parsep}{0pt} + % turn on hanging indent if param 1 is 1 + \ifodd #1 + \setlength{\leftmargin}{\cslhangindent} + \setlength{\itemindent}{-1\cslhangindent} + \fi + % set entry spacing + \setlength{\itemsep}{#2\baselineskip}}} + {\end{list}} +\usepackage{calc} +\newcommand{\CSLBlock}[1]{#1\hfill\break} +\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} +\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} +\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} + + + +\begin{document} + + +%% do not edit, for illustration only +\sectionhead{Contributed research article} +\volume{16} +\volnumber{1} +\year{2024} +\month{March} +\setcounter{page}{157} + +\begin{article} + \input{RJ-2024-009} +\end{article} + + +\end{document} diff --git 
a/_articles/RJ-2024-009/images/figure1.png b/_articles/RJ-2024-009/images/figure1.png new file mode 100644 index 0000000000..38b809a4af Binary files /dev/null and b/_articles/RJ-2024-009/images/figure1.png differ diff --git a/_articles/RJ-2024-009/images/figure2.png b/_articles/RJ-2024-009/images/figure2.png new file mode 100644 index 0000000000..6acec7ed61 Binary files /dev/null and b/_articles/RJ-2024-009/images/figure2.png differ diff --git a/_articles/RJ-2024-009/images/figure3.png b/_articles/RJ-2024-009/images/figure3.png new file mode 100644 index 0000000000..af27c1a70a Binary files /dev/null and b/_articles/RJ-2024-009/images/figure3.png differ diff --git a/_articles/RJ-2024-009/images/figure4.jpg b/_articles/RJ-2024-009/images/figure4.jpg new file mode 100644 index 0000000000..b382990d20 Binary files /dev/null and b/_articles/RJ-2024-009/images/figure4.jpg differ diff --git a/_articles/RJ-2024-009/images/figure5.png b/_articles/RJ-2024-009/images/figure5.png new file mode 100644 index 0000000000..a7dfd06a5d Binary files /dev/null and b/_articles/RJ-2024-009/images/figure5.png differ diff --git a/_articles/RJ-2024-009/images/figure6.png b/_articles/RJ-2024-009/images/figure6.png new file mode 100644 index 0000000000..f54235d609 Binary files /dev/null and b/_articles/RJ-2024-009/images/figure6.png differ diff --git a/_articles/RJ-2024-009/images/figure7.png b/_articles/RJ-2024-009/images/figure7.png new file mode 100644 index 0000000000..1f1ad406a9 Binary files /dev/null and b/_articles/RJ-2024-009/images/figure7.png differ diff --git a/_articles/RJ-2024-009/images/figure8.png b/_articles/RJ-2024-009/images/figure8.png new file mode 100644 index 0000000000..62befdd874 Binary files /dev/null and b/_articles/RJ-2024-009/images/figure8.png differ diff --git a/_articles/RJ-2024-009/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS b/_articles/RJ-2024-009/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS new file mode 100644 index 
0000000000..c3aca6502c Binary files /dev/null and b/_articles/RJ-2024-009/shinymgr/analyses/iris_explorer_Gandalf_2023_06_05_16_30.RDS differ diff --git a/_articles/RJ-2024-009/shinymgr/data/iris.RData b/_articles/RJ-2024-009/shinymgr/data/iris.RData new file mode 100644 index 0000000000..5e9124d475 Binary files /dev/null and b/_articles/RJ-2024-009/shinymgr/data/iris.RData differ diff --git a/_articles/RJ-2024-009/shinymgr/database/shinymgr.sqlite b/_articles/RJ-2024-009/shinymgr/database/shinymgr.sqlite new file mode 100644 index 0000000000..65e14933cc Binary files /dev/null and b/_articles/RJ-2024-009/shinymgr/database/shinymgr.sqlite differ diff --git a/_articles/RJ-2024-009/shinymgr/global.R b/_articles/RJ-2024-009/shinymgr/global.R new file mode 100644 index 0000000000..fc8a761b47 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/global.R @@ -0,0 +1,45 @@ +# clean out objects from environment +rm(list = ls()) + +# set global variables +shinyMgrPath <- getwd() + +# load required shiny framework packages +library(shinymgr) + +# load required module packages (parse headers) +app_mods <- list.files( + path = paste0(shinyMgrPath,"/modules"), + full.names = TRUE +) +modPackages <- vector() +for (modPath in app_mods) { + modPackages <- c(modPackages, shinymgr::mod_header_parser(modPath)[[4]]$packageName) +} + +modPackages <- unique(modPackages) + +# load the packages +for (package in modPackages) { + suppressPackageStartupMessages(library(package, character.only = TRUE)) +} + +# source in all manager (framework) modules +mgr_mods <- list.files( + path = paste0(shinyMgrPath,"/modules_mgr"), + full.names = TRUE +) + +sapply(mgr_mods, FUN = source) + +# source in all user modules + +sapply(app_mods, FUN = source) + +# source in all manager (framework) modules +app_mods <- list.files( + path = paste0(shinyMgrPath, "/modules_app"), + full.names = TRUE +) + +sapply(app_mods, FUN = source) diff --git a/_articles/RJ-2024-009/shinymgr/modules/iris_cluster.R 
b/_articles/RJ-2024-009/shinymgr/modules/iris_cluster.R new file mode 100644 index 0000000000..7a3b758d32 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules/iris_cluster.R @@ -0,0 +1,102 @@ +#!! ModName = iris_cluster +#!! ModDisplayName = Iris K-Means Clustering +#!! ModDescription = Clusters iris data based on 2 attributes +#!! ModCitation = Baggins, Bilbo. (2022). iris_cluster. [Source code]. +#!! ModNotes = +#!! ModActive = 1 +#!! FunctionReturn = returndf !! selected attributes and their assigned clusters !! data.frame + +iris_cluster_ui <- function(id){ + # create the module's namespace + ns <- NS(id) + + tagList( + sidebarLayout( + sidebarPanel( + # add the dropdown for the X variable + selectInput( + ns("xcol"), + label = "X Variable", + choices = c( + "Sepal.Length", + "Sepal.Width", + "Petal.Length", + "Petal.Width" + ), + selected = "Sepal.Length" + ), + + # add the dropdown for the Y variable + selectInput( + ns("ycol"), + label = "Y Variable", + choices = c( + "Sepal.Length", + "Sepal.Width", + "Petal.Length", + "Petal.Width" + ), + selected = "Sepal.Width" + ), + # add input box for the cluster number + + numericInput( + ns("clusters"), + label = "Cluster count", + value = 3, + min = 1, + max = 9 + ) + ), # end of sidebarPanel + + mainPanel( + # create outputs + plotOutput( + ns("plot1") + ) + ) # end of mainPanel + ) # end of sidebarLayout + ) # end of tagList +} # end of UI function + +iris_cluster_server <- function(id) { + + moduleServer(id, function(input, output, session) { + + # combine variables into new data frame + selectedData <- reactive({ + iris[, c(input$xcol, input$ycol)] + }) + + # run kmeans algorithm + clusters <- reactive({ + kmeans( + x = selectedData(), + centers = input$clusters + ) + }) + + output$plot1 <- renderPlot({ + par(mar = c(5.1, 4.1, 0, 1)) + plot( + selectedData(), + col = clusters()$cluster, + pch = 20, + cex = 3 + ) + }) + + return( + reactiveValues( + returndf = reactive({ + cbind( + selectedData(), + 
cluster = clusters()$cluster + ) + }) + ) + ) + + }) # end of moduleServer function + +} # end of irisCluster function diff --git a/_articles/RJ-2024-009/shinymgr/modules/iris_intro.R b/_articles/RJ-2024-009/shinymgr/modules/iris_intro.R new file mode 100644 index 0000000000..be4f78724b --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules/iris_intro.R @@ -0,0 +1,28 @@ +#!! ModName = iris_intro +#!! ModDisplayName = Iris Explorer introduction Page +#!! ModDescription = This module is simply a page of text with instructions for the iris explorer module. +#!! ModCitation = Baggins, Bilbo. (2022). iris_intro. [Source code]. +#!! ModActive = 1 + + +# the ui function +iris_intro_ui <- function(id) { + ns <- NS(id) + tagList( + wellPanel( + textOutput(ns("instructions")) + ) + ) +} + + +# the server function +iris_intro_server <- function(id) { + moduleServer(id, function(input, output, session) { + output$instructions <- renderText({ + "These are instructions for the iris_explorer app. This app clusters data by user-specified columns, + then takes a random subset of the data. The inputs and returns of each module can be downloaded as + an .RDS file." + }) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules/single_column_plot.R b/_articles/RJ-2024-009/shinymgr/modules/single_column_plot.R new file mode 100644 index 0000000000..710b70b61c --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules/single_column_plot.R @@ -0,0 +1,61 @@ +#!! ModName = single_column_plot +#!! ModDisplayName = Plot Single Column +#!! ModDescription = Uses qplot to plot a column in a dataset +#!! ModCitation = Baggins, Bilbo. (2022). single_column_plot. [Source code]. +#!! ModNotes = +#!! ModActive = 1 +#!! FunctionArg = dataset !! dataframe to be explored !! data.frame +#!! FunctionReturn = selectedCol !! name of column selected !! string +#!! FunctionReturn = g !! plot of column distribution !! ggproto +#!! Package = ggplot2 !! 3.3.5 !! 
+ + +# the ui function +single_column_plot_ui <- function(id) { + ns <- NS(id) + tagList( + fluidRow( + column( + width = 4, + uiOutput(outputId = ns("selectCol")) + ), + column( + width = 8, + plotOutput(outputId = ns("fig")) + ) + ) + ) +} + + +# the server function +single_column_plot_server <- function(id, dataset) { + moduleServer(id, function(input, output, session) { + #create dropdown of + ns <- session$ns + output$selectCol <- renderUI({ + selectInput( + inputId = ns("selectedCol"), + label = "Select a column", + choices = names(dataset()) + ) + }) + + + g <- reactive({ + req(input$selectedCol) + columnValues <- dataset()[[input$selectedCol]] + qplot(x = columnValues, main = paste0(input$selectedCol, " distribution")) + + theme_classic() + }) + + output$fig <- renderPlot(g()) + #return column and plot + return( + reactiveValues( + selectedCol = reactive(input$selectedCol), + g = reactive(g()) + ) + ) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules/subset_rows.R b/_articles/RJ-2024-009/shinymgr/modules/subset_rows.R new file mode 100644 index 0000000000..6eba74beb1 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules/subset_rows.R @@ -0,0 +1,69 @@ +#!! ModName = subset_rows +#!! ModDisplayName = Subset Rows (random) +#!! ModDescription = Randomly selects rows to create a subset of a dataframe +#!! ModCitation = Baggins, Bilbo. (2022). subset_rows. [Source code]. +#!! ModNotes = +#!! ModActive = 1 +#!! FunctionArg = dataset !! dataframe to be subset !! data.frame +#!! FunctionReturn = subset_data !! subset of original data !! data.frame +#!! Package = reactable !! 0.3.0 !! 
+ +subset_rows_ui <- function(id) { + ns <- NS(id) + + tagList( + sidebarLayout( + #make the sidebar with the numeric input, resample button, and whole dataframe + sidebarPanel( + numericInput( + inputId = ns("sample_num"), + label = "Number of rows to sample", + value = 10, + min = 1 + ), + actionButton( + inputId = ns("resample"), + label = "Re-sample" + ), + br(), #extra line break to leave some space between the button and the table + reactableOutput(ns("full_table")) + ), #end of sidebar panel + + #have the main panel display the subset dataframe + mainPanel( + h2("These rows were randomly chosen:"), + reactableOutput(ns("subset_table")) + ) #end of main panel + ) #end of sidebar layout + ) +} #end of ui function + +subset_rows_server <- function(id, dataset) { + moduleServer(id, function(input, output, session) { + + #create the reactable object that will display the full table + output$full_table <- renderReactable({ + reactable(data = dataset(), rownames = TRUE) + }) + + #create a vector of random indices based on the number selected that also listens for the button click + index <- reactive({ + input$resample + sample(1:nrow(dataset()), size = input$sample_num, replace = FALSE) + }) + + #create the reactable object that will display the subset table + output$subset_table <- renderReactable({ + reactable(dataset()[index(),], rownames = TRUE) + }) + + return( + reactiveValues( + subset_data = reactive({ + dataset()[index(),] + }) + ) + ) + + }) #end moduleServer function +} #end server function diff --git a/_articles/RJ-2024-009/shinymgr/modules_app/iris_explorer.R b/_articles/RJ-2024-009/shinymgr/modules_app/iris_explorer.R new file mode 100644 index 0000000000..2b378edea1 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_app/iris_explorer.R @@ -0,0 +1,329 @@ +# This script was automatically generated by the shinymgr R package's App Builder on 2022-08-03 13:45:22. 
+# For more information, visit: https://code.usgs.gov/vtcfwru/shinymgr +jscode <- " +shinyjs.disableTab = function(name) { +var tab = $('.nav li a[data-value=' + name + ']'); +tab.bind('click.tab', function(e) { +e.preventDefault(); +return false; +}); +tab.addClass('disabled'); +} + +shinyjs.enableTab = function(name) { +var tab = $('.nav li a[data-value=' + name + ']'); +tab.unbind('click.tab'); +tab.removeClass('disabled'); +} +" + +css <- " +.nav li a.disabled { +background-color: #bbb !important; +border-color: #ccc !important; +cursor: not-allowed !important; +}" + + +iris_explorer_ui <- function(id) { + ns <- NS(id) + tagList( + fluidPage( + useShinyjs(), + extendShinyjs(text = jscode, functions = c('disableTab','enableTab')), + inlineCSS(css), + actionButton( + ns("start"), + "Start New Analysis", + onclick = "var $btn=$(this); setTimeout(function(){$btn.remove();},0);" + ), + uiOutput(ns('test')) + ) + ) +} +iris_explorer_server <- function(id, userID, shinyMgrPath) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + observeEvent(input$start, { + + disable('start') + + output$test <- renderUI({ + tagList( + tabsetPanel( + id = ns("mainTabSet"), + tabPanel( + "Intro", + value = "tab1", + iris_intro_ui(ns("mod1")), + fluidRow( + actionButton(ns('next_tab_1'), label = "Next") + ) + ), + tabPanel( + "Cluster Iris Data", + value = "tab2", + tags$br(), + wellPanel( + style = "background: skyblue", + "Cluster data from the iris dataset by specifying the attributes and number of clusters." + ), + iris_cluster_ui(ns("mod2")), + fluidRow( + actionButton(ns('previous_tab_2'), label = "Previous"), + actionButton(ns('next_tab_2'), label = "Next") + ) + ), + tabPanel( + "Subset Rows", + value = "tab3", + tags$br(), + wellPanel( + style = "background: skyblue", + "Specify the number of rows to randomly select from the clustered iris data." 
+ ), + subset_rows_ui(ns("mod3")), + fluidRow( + actionButton(ns('previous_tab_3'), label = "Previous"), + actionButton(ns('next_tab_3'), label = "Next") + ) + ), + tabPanel( + "Plot Column", + value = "tab4", + tags$br(), + wellPanel( + style = "background: skyblue", + "Select a column to view its distribution." + ), + single_column_plot_ui(ns("mod4")), + fluidRow( + actionButton(ns('previous_tab_4'), label = "Previous"), + actionButton(ns('next_tab_4'), label = "Next") + ) + ), + tabPanel( + "Save", + value = "tab5", + save_analysis_ui(ns("mod5")), + tags$br(), + tags$br(), + fluidRow( + actionButton(ns("previous_tab_5"), label = "Previous") + ) + ) + ) + ) + }) + delay(50, { + js$disableTab("tab2") + js$disableTab("tab3") + js$disableTab("tab4") + js$disableTab("tab5") + }) + }) + + iris_intro_server("mod1") + data1 <- iris_cluster_server("mod2") + data2 <- subset_rows_server("mod3", dataset = data1$returndf) + data3 <- single_column_plot_server("mod4", dataset = data2$subset_data) + save_analysis_server("mod5", + appName = "iris_explorer", + moduleInput = input, + returns = list( + data1 = list( + returndf = data1$returndf() + ), + data2 = list( + subset_data = data2$subset_data() + ), + data3 = list( + selectedCol = data3$selectedCol(), + g = data3$g() + ) + ), + metadata = list( + appDescription = "Cluster iris data, randomly subset some rows, and plot the distribution of a column", + mod1 = list( + dataset = "no returns", + modName = "iris_intro", + modDisplayName = "Iris Explorer introduction Page", + modDescription = "This module is simply a page of text with instructions for the iris explorer module.", + modArguments = "This module has no additional arguments", + modReturns = "This module has no returns", + modPackages = "This module has no package dependencies" + ), + mod2 = list( + dataset = "data1", + modName = "iris_cluster", + modDisplayName = "Iris K-Means Clustering", + modDescription = "Clusters iris data based on 2 attributes", + modArguments = 
"This module has no additional arguments", + modReturns = data.frame( + name = c("returndf"), + class = c("data.frame"), + description = c("selected attributes and their assigned clusters") + ), + modPackages = "This module has no package dependencies" + ), + mod3 = list( + dataset = "data2", + modName = "subset_rows", + modDisplayName = "Subset Rows (random)", + modDescription = "Randomly selects rows to create a subset of a dataframe", + modArguments = data.frame( + name = c("dataset"), + class = c("data.frame"), + description = c("dataframe to be subset") + ), + modReturns = data.frame( + name = c("subset_data"), + class = c("data.frame"), + description = c("subset of original data") + ), + modPackages = data.frame( + name = c("reactable"), + version = c("0.3.0") + ) + ), + mod4 = list( + dataset = "data3", + modName = "single_column_plot", + modDisplayName = "Plot Single Column", + modDescription = "Uses qplot to plot a column in a dataset", + modArguments = data.frame( + name = c("dataset"), + class = c("data.frame"), + description = c("dataframe to be explored") + ), + modReturns = data.frame( + name = c("selectedCol","g"), + class = c("string","ggproto"), + description = c("name of column selected","plot of column distribution") + ), + modPackages = data.frame( + name = c("ggplot2"), + version = c("3.3.5") + ) + ) + ) + ) + observeEvent(input$next_tab_1, { + js$enableTab('tab2') + js$disableTab('tab1') + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab2' + ) + }) + observeEvent(input$next_tab_2, { + js$enableTab('tab3') + js$disableTab('tab2') + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab3' + ) + }) + observeEvent(input$previous_tab_2, { + delay(50, { + js$enableTab('tab1') + js$disableTab('tab2') + }) + removeTab('mainTabSet','tab2',session) + insertTab( + inputId = 'mainTabSet', + tab = tabPanel( + title = "Cluster Iris Data", + value = "tab2", + iris_cluster_ui(ns("mod2")), + fluidRow( + actionButton(ns('previous_tab_2'), 
label = "Previous"), + actionButton(ns('next_tab_2'), label = "Next") + ) + ), + target = 'tab1', + position = 'after' + ) + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab1' + ) + }) + observeEvent(input$next_tab_3, { + js$enableTab('tab4') + js$disableTab('tab3') + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab4' + ) + }) + observeEvent(input$previous_tab_3, { + delay(50, { + js$enableTab('tab2') + js$disableTab('tab3') + }) + removeTab('mainTabSet','tab3',session) + insertTab( + inputId = 'mainTabSet', + tab = tabPanel( + title = "Subset Rows", + value = "tab3", + subset_rows_ui(ns("mod3")), + fluidRow( + actionButton(ns('previous_tab_3'), label = "Previous"), + actionButton(ns('next_tab_3'), label = "Next") + ) + ), + target = 'tab2', + position = 'after' + ) + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab2' + ) + }) + observeEvent(input$next_tab_4, { + js$enableTab('tab5') + js$disableTab('tab4') + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab5' + ) + }) + observeEvent(input$previous_tab_4, { + delay(50, { + js$enableTab('tab3') + js$disableTab('tab4') + }) + removeTab('mainTabSet','tab4',session) + insertTab( + inputId = 'mainTabSet', + tab = tabPanel( + title = "Plot Column", + value = "tab4", + single_column_plot_ui(ns("mod4")), + fluidRow( + actionButton(ns('previous_tab_4'), label = "Previous"), + actionButton(ns('next_tab_4'), label = "Next") + ) + ), + target = 'tab3', + position = 'after' + ) + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab3' + ) + }) + observeEvent(input$previous_tab_5, { + delay(50, { + js$enableTab('tab4') + js$disableTab('tab5') + }) + updateTabsetPanel( + session, 'mainTabSet', + selected = 'tab4' + ) + }) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/add_app.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_app.R new file mode 100644 index 0000000000..f7db04a188 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_app.R @@ 
-0,0 +1,50 @@ +add_app_metadata_UI <- function(id) { + ns <- NS(id) + tagList( + textInput( + ns('app_name'), + 'pkAppName' + ), + textInput( + ns('app_display_name'), + 'Display Name' + ), + textInput( + ns('app_descrip'), + 'Description' + ), + textInput( + ns('app_videoURL'), + 'App Video URL' + ), + textInput( + ns('app_CSS'), + 'Bootswatch theme name or CSS file name' + ), + textAreaInput( + ns('app_notes'), + 'Notes', + rows = 6, + cols = 4 + ), + checkboxInput( + ns('app_active'), + label = 'App Active', + value = TRUE + ) + ) +} + +add_app_metadata_Server <- function(id) { + moduleServer(id, function(input, output, session) { + reactiveValues( + app_name = reactive(input$app_name), + app_display_name = reactive(input$app_display_name), + app_descrip = reactive(input$app_descrip), + app_videoURL = reactive(input$app_videoURL), + app_CSS = reactive(input$app_CSS), + app_notes = reactive(input$app_notes), + app_active = reactive(input$app_active) + ) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/add_mod.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_mod.R new file mode 100644 index 0000000000..0887409d52 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_mod.R @@ -0,0 +1,31 @@ +add_mod_ui <- function(id) { + ns <- NS(id) + tagList( + useShinyjs(), + uiOutput(ns('select_mod')), + actionButton(ns('confirm_mod'), 'Confirm Module') + ) +} + +add_mod_server <- function(id, available_mods, mod_table) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + + output$select_mod <- renderUI({ + selectInput(ns('select_mod'), 'Select module', choices = available_mods()$mod_name) + }) + + observeEvent(input$confirm_mod, { + shinyjs::disable('select_mod') + shinyjs::disable('confirm_mod') + }) + + return_val <- eventReactive(input$confirm_mod, { + reactiveValues( + mod_name = reactive(input$select_mod) + ) + }) + + return_val + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/add_report.R 
b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_report.R new file mode 100644 index 0000000000..438c858273 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_report.R @@ -0,0 +1,212 @@ +add_report_ui <- function(id) { + ns <- NS(id) + tagList( + wellPanel( + id = "reportHeader", + style = "background: lightblue", + textOutput(outputId = ns("reportInstructions")) + ), + fileInput( + inputId = ns("pkReportName"), + label = "Select the file containing the report template:", + multiple = FALSE + ), + textInput( + inputId = ns("displayName"), + label = "Report Display Name:" + ), + textAreaInput( + inputId = ns("description"), + label = "Report Description:", + placeholder = "A description of the report" + ), + uiOutput(outputId = ns("appSelect")), + textAreaInput( + inputId = ns("notes"), + label = "Notes:", + placeholder = "Notes about connecting the report to this app" + ), + fluidRow( + column( + width = 2, + actionButton( + inputId = ns("submit"), + label = "Add Report" + ) + ), + column( + width = 2, + actionButton( + inputId = ns("cancel"), + label = "Cancel" + ) + ) + ) + ) + +} #end ui function + +add_report_server <- function(id, shinyMgrPath) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + + #make selector for the apps + output$appSelect <- renderUI({ + selectInput( + inputId = ns("selectedApp"), + label = "Which app is this report template for?", + choices = c("", sort(qry_row( + tableName = "apps", + colConditions = "pkAppName", + shinyMgrPath = shinyMgrPath + )$pkAppName)) + ) + }) + + #make text instructions + output$reportInstructions <- renderText({ + "Register a .Rmd template in the database and connect it with an app. + This tab records these templates in the reports and appReports table, but does + not store the files themselves. However, the files are copied to a subdirectory + of the reports folder named after the app." + }) + + #cancel button erases everything in the inputs (and maybe the file too?) 
+ observeEvent(input$cancel, { + updateSelectInput( + session = session, + inputId = "selectedApp", + selected = "" + ) + + updateTextInput( + session = session, + inputId = "displayName", + value = "" + ) + + updateTextAreaInput( + session = session, + inputId = "description", + value = "" + ) + + }) #end cancel button press + + #submit button press + observeEvent(input$submit, { + #check that report file name is unique + existingReports <- qry_row( + tableName = "reports", + colConditions = "pkReportName", + shinyMgrPath = shinyMgrPath + )[,] + + reportName <- gsub(pattern = "(\\w+)\\..*", replacement = "\\1", x = input$pkReportName$name,) + print(reportName) + + if (reportName %in% existingReports) { + showModal( + modalDialog( + title = "A report with this filename already exists in the database.", + "Please delete the record or upload a different file.", + easyClose = FALSE + ) + ) + } else { + #create dataframe to add to reports table + newReport <- data.frame( + pkReportName = reportName, + displayName = input$displayName, + reportDescription = input$description + ) + + #create dataframe to add to appReports table + newAppReport <- data.frame( + fkAppName = input$selectedApp, + fkReportName = reportName, + notes = input$notes + ) + + #trycatch to append each table and clear inputs if successful + tryCatch( + expr = { + #create folder for the app if it doesn't already exist + if (!dir.exists(paste0(shinyMgrPath, "/reports/", input$selectedApp))) { + dir.create(paste0(shinyMgrPath, "/reports/", input$selectedApp)) + } + + #write the file in the folder + reportPath <- paste0(shinyMgrPath, "/reports/", input$selectedApp, "/", reportName, ".Rmd") + #create file if it doesn't already exist + if (!file.exists(reportPath)) { + file.create(paste0(shinyMgrPath, "/reports/", input$selectedApp, "/", reportName, ".Rmd")) + } + templateCon <- file(input$pkReportName$datapath) + template <- readLines(templateCon) + close(templateCon) + + fileCon <- 
file(paste0(shinyMgrPath, "/reports/", input$selectedApp, "/", reportName, ".Rmd")) + writeLines(template, fileCon) + close(fileCon) + + #append reports table + reportResult <- qry_insert( + tableName = "reports", + rowValues = newReport, + shinyMgrPath = shinyMgrPath + ) + + #append appReports table + appReportResult <- qry_insert( + tableName = "appReports", + rowValues = newAppReport, + shinyMgrPath = shinyMgrPath + ) + + #clear inputs + updateSelectInput( + session = session, + inputId = "selectedApp", + selected = "" + ) + + updateTextInput( + session = session, + inputId = "displayName", + value = "" + ) + + updateTextAreaInput( + session = session, + inputId = "description", + value = "" + ) + + #show modal when done + showModal( + modalDialog( + title = "Done", + paste0(reportResult, "row was added to the reports table."), + br(), + paste0(appReportResult, "row was added to the appReports table."), + easyClose = TRUE + ) + ) + + }, + error = function(e) { + showModal( + modalDialog( + title = "Report could not be added:", + x, + easyClose = FALSE + ) + ) + } + ) #end trycatch + } #end else statement + }) #end submit button click + + }) #end moduleServer +} #end server function diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/add_tab.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_tab.R new file mode 100644 index 0000000000..2db084050b --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/add_tab.R @@ -0,0 +1,36 @@ +add_tab_metadata_UI <- function(id) { + ns <- NS(id) + tagList( + textInput( + ns('tab_name'), + 'pkTabName' + ), + textInput( + ns('tab_display_name'), + 'Display Name' + ), + textAreaInput( + ns('tab_inst'), + 'Instructions', + rows = 6, + cols = 4 + ), + textAreaInput( + ns('tab_notes'), + 'Tab Notes', + rows = 6, + cols = 4 + ) + ) +} + +add_tab_metadata_Server <- function(id) { + moduleServer(id, function(input, output, session) { + reactiveValues( + tab_name = reactive(input$tab_name), + tab_display_name = 
reactive(input$tab_display_name), + tab_inst = reactive(input$tab_inst), + tab_notes = reactive(input$tab_notes) + ) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/app_builder.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/app_builder.R new file mode 100644 index 0000000000..eb4a7e978a --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/app_builder.R @@ -0,0 +1,930 @@ +jscode <- " + shinyjs.disableTab = function(name) { + var tab = $('.nav li a[data-value=' + name + ']'); + tab.bind('click.tab', function(e) { + e.preventDefault(); + return false; + }); + tab.addClass('disabled'); +} + +shinyjs.enableTab = function(name) { + var tab = $('.nav li a[data-value=' + name + ']'); + tab.unbind('click.tab'); + tab.removeClass('disabled'); +}" + +css <- " +.nav li a.disabled { + background-color: #aaa !important; + color: #333 !important; + cursor: not-allowed !important; + border-color: #aaa !important; +}" + +app_builder_ui <- function(id) { + ns <- NS(id) + tagList( + useShinyjs(), + extendShinyjs(text = jscode, functions = c('disableTab','enableTab')), + inlineCSS(css), + wellPanel( + style = "background: lightblue", + textOutput(ns('instructions')) + ), + actionButton( + ns('start'), + 'Start New App', + onclick = "var $btn=$(this); setTimeout(function(){$btn.remove();},0);" + ), + uiOutput(ns('builderUI')) + ) +} + +app_builder_server <- function(id, shinyMgrPath) { + moduleServer(id, function(input, output, session) { + + ns <- session$ns + + needs_reset <- reactiveValues(reset = reactive(FALSE)) + + output$instructions <- renderText( + "This is where you use modules from the database to build an app. It is + essential that an App be fully planned before being built. For more on + planning your App, see the walkthrough in the wiki. Once your app is planned, + this builder allows you to write all your App's metadata, define the tabs, + mods, and how the mods will be stitched together. 
The builder than writes + a script for your App, and adds it to the database. Your App can then be + run using 'New Analysis'. Press \"Start New App\" to begin." + ) + + observeEvent(input$start, { + # INSERT MAIN UI + output$builderUI <- renderUI({ + + tagList( + sidebarLayout( + sidebarPanel( + width = 6, + h1("Your App"), + reactableOutput(ns("app_summary")), + h6("") + ), + mainPanel( + width = 6, + tabsetPanel( + id = ns("mainTabSet"), + tabPanel( + title = "App Metadata", + value = "tab1", + add_app_metadata_UI(ns("add_app")), + actionButton( + ns("next_mod_1"), + "Begin 1st Tab" + ) + ), + tabPanel( + title = "Begin Tab", + value = "tab2", + add_tab_metadata_UI(ns("add_tab")), + actionButton( + ns("next_mod_2"), + "Add Modules" + ), + HTML('  '), + actionButton(ns('reset_button1'), 'Reset the builder') + ), + tabPanel( + title = "Add Module", + value = "tab3", + uiOutput(ns("mod_resettable")), + tags$br(), + tags$br(), + fluidRow( + disabled(actionButton(ns("new_tab"), "New Tab")), + disabled(actionButton(ns("new_mod"), "New Mod")), + disabled(actionButton(ns("finish_app"), "Finish Framework")), + HTML('  '), + actionButton(ns('reset_button2'), 'Reset the builder') + ) + ), + tabPanel( + title = "Match Arguments/Returns", + value = "tab4", + h5(""), + actionButton(ns("complete_app"), "Lock in stitch"), + HTML('  '), + actionButton(ns('reset_button3'), 'Reset the builder') + ) + ) + ) + ) + ) + }) + + delay(50, { + js$disableTab("tab2") + js$disableTab("tab3") + js$disableTab("tab4") + }) + + appNames <- qry_row( + tableName = 'apps', + colConditions = 'pkAppName', + shinyMgrPath = shinyMgrPath + )[,] + tabNames <- qry_row( + tableName = 'tabs', + colConditions = 'pkTabName', + shinyMgrPath = shinyMgrPath + )[,] + mod_info <- qry_row( + tableName = 'modules', + colConditions = c('pkModuleName', 'modDisplayName', 'modDescription'), + shinyMgrPath = shinyMgrPath + ) + qryModReturns <- qry_row('modFunctionReturns', shinyMgrPath = shinyMgrPath) + qryModArguments 
<- qry_row('modFunctionArguments', shinyMgrPath = shinyMgrPath) + + # Set up table with mod argument info for display on "stitching" + mod_return_table <- qryModReturns[ + c("fkModuleName", "functionReturnName", "functionReturnClass", "description") + ] + names(mod_return_table) <- c('Module Name', 'Return Name', 'class', 'description') + + # Set up table with mod argument info for display on "stitching" + mod_arg_table <- qryModArguments[ + c("fkModuleName", "functionArgName", "functionArgClass", "description") + ] + names(mod_arg_table) <- c('Module Name', 'Argument Name', 'class', 'description') + + # Number of arguments that each module requires + conx <- DBI::dbConnect( + drv = RSQLite::SQLite(), + dbname = paste0(shinyMgrPath, "/database/shinymgr.sqlite") + ) + n_args <- rs <- DBI::dbGetQuery( + conx, + statement = + 'SELECT modules.pkModuleName AS [mod_name], + Count(modFunctionArguments.pkModArgID) AS [ct] + FROM modules LEFT JOIN modFunctionArguments + ON modules.pkModuleName = modFunctionArguments.fkModuleName + GROUP BY modules.pkModuleName;' + ) + DBI::dbDisconnect(conx) + + # Set up table with mod info to display + names(mod_info) <- c('Module Name', 'Display Name', 'Description') + for (i in 1:nrow(mod_info)) { + mod_info$arg_ct[i] <- sum(qryModArguments$fkModuleName == mod_info$`Module Name`[i]) + mod_info$return_ct[i] <- sum(qryModReturns$fkModuleName == mod_info$`Module Name`[i]) + } + + shinyjs::disable('new_tab') + shinyjs::disable('new_mod') + shinyjs::disable('finish_app') + + # Store App, Tab, and Mod names (and order) for display + app_meta <- reactiveVal( + data.frame( + elem_type = character(0), + num = character(0), + elem_name = character(0), + stringsAsFactors = FALSE + ) + ) + + # Store mod order + app_meta2 <- reactiveVal( + data.frame( + tab_num = integer(0), + tab_name = character(0), + mod_num = integer(0), + mod_name = character(0), + stringsAsFactors = FALSE + ) + ) + + # Which mods are available (based on how many returns have 
happened) + available_mods <- reactiveVal( + data.frame( + mod_name = n_args$mod_name[n_args$ct==0] + ) + ) + + # Store available arguments + available_args <- reactiveVal( + data.frame( + arg_name = character(0), + arg_tab_name = character(0), + arg_mod_order = character(0), + stringsAsFactors = FALSE + ) + ) + + # Initialize counters + mod_ct <- reactiveVal(0) + tab_ct <- reactiveVal(0) + ret_ct <- reactiveVal(0) # Number of returns encountered + + # Keep a running tally of tabs, for adding to the database + tabs_2_add <- reactiveVal( + data.frame( + pkTabName = character(0), + tabDisplayName = character(0), + tabInstructions = character(0), + tabNotes = character(0), + stringsAsFactors = FALSE + ) + ) + + # Keep a running tally of mods, for adding to the database + mods_2_add <- reactiveVal( + data.frame( + pkInstanceID = integer(0), + fkTabName = character(0), + fkModuleName = character(0), + modOrder = integer(0), + stringsAsFactors = FALSE + ) + ) + + output$mod_resettable <- renderUI({ + times <- input$new_mod + times_tab <- input$new_tab + add_mod_ui(ns('add_mod')) + }) + + data1 <- add_app_metadata_Server('add_app') + data2 <- add_tab_metadata_Server('add_tab') + data3 <- add_mod_server('add_mod', available_mods, mod_info) + + # BUTTON-CLICK (Begin 1st Tab) + observeEvent(input$next_mod_1, { + if (data1$app_name() %in% appNames | + data1$app_name() %in% mod_info$pkModuleName) { + showModal(modalDialog( + title = "AppName Already in Use", + "Select a different AppName" + )) + } else { + # ADD TO VERBOSE OUTPUT + # See for approach: https://stackoverflow.com/questions/54495321 + new_dat <- data.frame( + elem_type = 'App Name:', + num = as.character(''), + elem_name = data1$app_display_name(), + stringsAsFactors = FALSE + ) + app_meta( rbind(app_meta(), new_dat, stringsAsFactors = FALSE) ) + + # Move to next tab + js$disableTab("tab1") + js$enableTab("tab2") + updateTabsetPanel( + session, + 'mainTabSet', + selected = 'tab2' + ) + } + }) + + 
output$mod_info_text <- renderUI({ + HTML( + paste0( + tags$hr(), + tags$h3('Module Information'), + tags$em('Use the information below to help choose which mods to use in your App:'), + tags$br(), tags$br() + ) + ) + }) + + output$mod_info <- renderReactable({ + reactable( + data = mod_info, + filterable = TRUE, + searchable = TRUE + ) + }) + + # Add the mod, when the button is clicked + observeEvent(input$next_mod_2, { + if (data2$tab_name() %in% c(tabNames, tabs_2_add()$pkTabName)) { + showModal(modalDialog( + title = "TabName Already in Use", + "Select a different TabName" + )) + } else { + + js$disableTab("tab2") + js$enableTab("tab3") + + tab_ct(tab_ct() + 1) # Increment tab counter + + app_meta( # Update app metadata (for display) + rbind( + app_meta(), + data.frame( + elem_type = 'Tab Name:', + num = paste0('(', tab_ct(),')'), + elem_name = data2$tab_display_name(), + stringsAsFactors = FALSE + ), + stringsAsFactors = FALSE + ) + ) + + tabs_2_add( # keep a running list of new tabs to be added + rbind( + tabs_2_add(), + data.frame( + pkTabName = data2$tab_name(), + tabDisplayName = data2$tab_display_name(), + tabInstructions = data2$tab_inst(), + tabNotes = data2$tab_notes(), + stringsAsFactors = FALSE + ), + stringsAsFactors = FALSE + ) + ) + + # Add mod table to sidebar + insertUI( + selector = "h6", + where = "beforeEnd", + ui = tagList( + htmlOutput(ns('mod_info_text')), + reactableOutput(ns('mod_info')) + ) + ) + + updateTabsetPanel( + session, + 'mainTabSet', + selected = 'tab3' + ) + + } + }) + + output$app_summary <- renderReactable({ + reactable(data = app_meta()) + }) + + observeEvent(data3(), { + print(data3()$mod_name()) + + shinyjs::enable('new_tab') + shinyjs::enable('new_mod') + shinyjs::enable('finish_app') + + mod_ct(mod_ct() + 1) # Increment mod counter + + app_meta( + rbind( + app_meta(), + data.frame( + elem_type = 'Mod Name:', + num = as.character(paste0('(', mod_ct(),')')), + elem_name = data3()$mod_name(), + stringsAsFactors = FALSE + 
), + stringsAsFactors = FALSE + ) + ) + + app_meta2( + rbind( + app_meta2(), + data.frame( + tab_num = tab_ct(), + tab_name = data2$tab_name(), + mod_num = mod_ct(), + mod_name = data3()$mod_name(), + stringsAsFactors = FALSE + ), + stringsAsFactors = FALSE + ) + ) + + # Get names of returns (to be passed into available_args) + new_args <- qryModReturns$functionReturnName[qryModReturns$fkModuleName == data3()$mod_name()] + + available_args( + rbind( + available_args(), + data.frame( + arg_name = new_args, + arg_tab_name = rep(data2$tab_display_name(),length(new_args)), + arg_mod_name = rep(data3()$mod_name(),length(new_args)), + arg_mod_order = rep(mod_ct(),length(new_args)), + stringsAsFactors = FALSE + ), + stringsAsFactors = FALSE + ) + ) + }) + + observeEvent(input$new_tab, { + # Remove "mod_info" since not selecting a mod here + removeUI( + selector = paste0('#',ns("mod_info")), + session = session + ) + removeUI( + selector = paste0('#',ns("mod_info_text")), + session = session + ) + + # BELOW HERE attempt to reset mod on new tab + # This attempt is successful, but lots of repeated code with "new_mod" button + # Tried finding a way to reduce duplication, but removing it from "new_mod" + # messes up the "updatePanel" to stay on the right tab. 
+ times <- input$new_mod + removeTab( + inputId = 'mainTabSet', + target = 'tab3' + ) + insertTab( + inputId = "mainTabSet", + tabPanel( + title = 'Add Module', + value = paste('tab3'), + add_mod_ui(ns('add_mod')), + tags$br(), + tags$br(), + fluidRow( + shinyjs::disabled(actionButton(ns('new_tab'),'New Tab')), + shinyjs::disabled(actionButton(ns('new_mod'),'New Mod')), + shinyjs::disabled(actionButton(ns('finish_app'),'Finish Framework')), + HTML('  '), + actionButton(ns('reset_button2'), 'Reset the builder') + ) + ), + target = 'tab2', + position = 'after' + ) + # ABOVE HERE reset mod on new tab + + # Update available_mods + selected_mod <- data3()$mod_name + ret_ct(ret_ct() + sum(qryModReturns$fkModuleName == selected_mod())) # Update return count + new_dat3 <- data.frame( + mod_name = n_args$mod_name[n_args$ct <= ret_ct()], + stringsAsFactors = FALSE + ) + available_mods(new_dat3) + # END update avail mods + + mod_ct(0) # reset mod_ct + times <- input$new_tab + + removeTab( + inputId = 'mainTabSet', + target = 'tab2' + ) + insertTab( + inputId = 'mainTabSet', + tabPanel( + title = 'Begin Tab', + value = 'tab2', + add_tab_metadata_UI(ns('add_tab')), + tags$br(), + actionButton( + ns('next_mod_2'), + 'Add Modules' + ), + HTML('  '), + actionButton(ns('reset_button1'), 'Reset the builder') + ), + target = 'tab1', + position = 'after' + ) + updateTabsetPanel( + session, + 'mainTabSet', + selected = 'tab2' + ) + + delay(50, { + js$disableTab("tab3") + js$enableTab("tab2") + }) + }) + + # Add some logic for the "new mod" button + observeEvent(input$new_mod, { + + # Update available_mods + selected_mod <- data3()$mod_name + ret_ct(ret_ct() + sum(qryModReturns$fkModuleName == selected_mod())) # Update return count + new_dat3 <- data.frame( + mod_name = n_args$mod_name[n_args$ct <= ret_ct()], + stringsAsFactors = FALSE + ) + available_mods(new_dat3) + + times <- input$new_mod + removeTab( + inputId = 'mainTabSet', + target = 'tab3' + ) + insertTab( + inputId = 
"mainTabSet", + tabPanel( + title = 'Add Module', + value = paste('tab3'), + add_mod_ui(ns('add_mod')), + tags$br(), + fluidRow( + shinyjs::disabled(actionButton(ns('new_tab'),'New Tab')), + shinyjs::disabled(actionButton(ns('new_mod'),'New Mod')), + shinyjs::disabled(actionButton(ns('finish_app'),'Finish Framework')), + HTML('  '), + actionButton(ns('reset_button2'), 'Reset the builder') + ) + ), + target = 'tab2', + position = 'after' + ) + updateTabsetPanel( + session, + 'mainTabSet', + selected = 'tab3' + ) + }) + + observeEvent(input$finish_app, { + + # Render the tables, based on mods/arguments/returns that have appeared in the app + used_mods <- unique(app_meta()[app_meta()$elem_type == 'Mod Name:','elem_name']) + i_used_arg <- mod_arg_table$`Module Name` %in% used_mods + i_used_return <- mod_return_table$`Module Name` %in% used_mods + + output$mod_arg_table <- renderReactable({ + reactable( + mod_arg_table[i_used_arg,], + defaultPageSize = 5, + filterable = TRUE, + searchable = TRUE + ) + }) + + output$mod_arg_table_text <- renderUI({ + HTML( + paste0( + tags$hr(), + tags$h3('Module ARGUMENT Information'), + tags$em('Use the information below to inform which arguments must be defined in your App:'), + tags$br(), tags$br() + ) + ) + }) + output$mod_return_table <- renderReactable({ + reactable( + mod_return_table[i_used_return,], + defaultPageSize = 5, + filterable = TRUE, + searchable = TRUE + ) + }) + output$mod_return_table_text <- renderUI({ + HTML( + paste0( + tags$hr(), + tags$h3('Module RETURN Information'), + tags$em('Use the information below to inform what has been returned by modules in your App:'), + tags$br(), tags$br() + ) + ) + }) + + # Remove "mod_info" since not selecting a mod here + removeUI( + selector = paste0('#',ns("mod_info")), + session = session + ) + removeUI( + selector = paste0('#',ns("mod_info_text")), + session = session + ) + + # Add mod table to sidebar + insertUI( + selector = "h6", + where = "beforeEnd", + ui = tagList( + 
htmlOutput(ns('mod_arg_table_text')), + reactableOutput(ns('mod_arg_table')), + htmlOutput(ns('mod_return_table_text')), + reactableOutput(ns('mod_return_table')) + ) + ) + + t_ct <- 0 # count of tabs (for stitching options) + m_ct <- 0 # count of mods (for stitching options) + avail_returns <- data.frame( + return_tab_num = numeric(0), + return_mod_num = numeric(0), + return_name = character(0) + ) # Keep track of which returns will be available + + #Loop through each tab/mod, updating stitching selection process sequentially + for (i in 2:nrow(app_meta())) { + + # Increment if tab + if (app_meta()$elem_type[i] == 'Tab Name:') { + t_ct <- t_ct + 1 + m_ct <- 0 + + # If not tab (ie., it's a mod), do stuff + } else { + m_ct <- m_ct + 1 + mod_name <- app_meta()$elem_name[i] # Mod Name + # Create list of choices for drop-down + mod_arg_choices <- character(nrow(avail_returns)) + for (j in 1:nrow(avail_returns)) { + mod_arg_choices[j] <- paste0( + 'Tab #', + avail_returns$return_tab_num[j], + ' Mod #', + avail_returns$return_mod_num[j], + '; ', + avail_returns$return_name[j] + ) + } + + # Add dropdown for each argument that must be defined for the given mod + + # Arguments for that mod, add UI element for each + the_args <- qryModArguments$functionArgName[qryModArguments$fkModuleName == mod_name] + for (the_arg in the_args) { + insertUI( + selector = "h5", + where = "beforeEnd", + ui = tagList( + selectInput( + ns(paste('arg_select', t_ct, m_ct, the_arg, sep = '_')), + HTML( + paste0( + 'Tab: ', t_ct, '; ', + 'Mod: ', m_ct, '; ', + 'Argument: ', the_arg, '
    ', + 'Assign Argument:' + ) + ), + choices = mod_arg_choices, + width = 500 + ) + ) + ) + } + + # Add any returns generated by the current mod + the_returns <- qryModReturns$functionReturnName[qryModReturns$fkModuleName == mod_name] + for (the_return in the_returns) { + avail_returns <- rbind( + avail_returns, + data.frame( + return_tab_num = t_ct, + return_mod_num = m_ct, + return_name = the_return + ) + ) + } + } + } + + updateTabsetPanel( + session, + 'mainTabSet', + selected = 'tab4' + ) + + js$disableTab("tab3") + js$enableTab("tab4") + }) + + observeEvent(input$complete_app, { + # This will be where all the writing to the database happens + # Add new app to "App" Table + rslt <- qry_insert( + 'apps', + data.frame( + pkAppName = data1$app_name(), + appDisplayName = data1$app_display_name(), + appDescription = data1$app_descrip(), + appVideoURL = data1$app_videoURL(), + appCSS = data1$app_CSS(), + appNotes = data1$app_notes(), + appActive = data1$app_active(), + dateCreated = as.character(Sys.time()), + fkParentAppName = NA + ), + shinyMgrPath + ) + + # Add each new tab to the "Tabs" Table + for (i in seq_len(nrow(tabs_2_add()))) { + rslt <- qry_insert( + 'tabs', + data.frame( + pkTabName = tabs_2_add()$pkTabName[i], + tabDisplayName = tabs_2_add()$tabDisplayName[i], + tabInstructions = ifelse( + test = nchar(tabs_2_add()$tabInstructions[i]) != 0, + yes = tabs_2_add()$tabInstructions[i], + no = NA + ), + tabNotes = tabs_2_add()$tabNotes[i] + ), + shinyMgrPath + ) + + rslt <- qry_insert( + 'appTabs', + data.frame( + fkTabName = tabs_2_add()$pkTabName[i], + fkAppName = data1$app_name(), + tabOrder = i + ), + shinyMgrPath + ) + } + + # + for (i in seq_len(nrow(app_meta2()))) { + # Correction to accommodate scenario where tabModules is empty + instanceIDs <- qry_row('tabModules', shinyMgrPath = shinyMgrPath)$pkInstanceID + instanceID <- ifelse(length(instanceIDs), max(instanceIDs)+1, 1) + + rslt <- qry_insert( + 'tabModules', + data.frame( + fkTabName = 
app_meta2()$tab_name[i], + fkModuleName = app_meta2()$mod_name[i], + modOrder = app_meta2()$mod_num[i] + ), + shinyMgrPath + ) + + # Add return part of the stitching instructions + i_mod_returns <- which(qryModReturns$fkModuleName == app_meta2()$mod_name[i]) + for (i_mod_return in i_mod_returns) { + rslt <- qry_insert( + 'appStitching', + data.frame( + fkAppName = data1$app_name(), + fkInstanceID = instanceID, + fkModArgID = NA, + fkModReturnID = qryModReturns$pkModReturnID[i_mod_return], + fkStitchID = NA + ), + shinyMgrPath + ) + } + + # Add argument part of the stitching instructions + i_mod_args <- which(qryModArguments$fkModuleName == app_meta2()$mod_name[i]) + for (i_mod_arg in i_mod_args) { + # Find the matching stitchID + input_name <- paste( + 'arg_select', # reserved prefix + app_meta2()$tab_num[i], # tab number + app_meta2()$mod_num[i], # mod number + qryModArguments$functionArgName[i_mod_arg], # Arg name + sep = '_' + ) + + # Now, parse the button to directly fetch stitching info + return_button_val <- input[[input_name]] + delims <- unlist(gregexpr('#', return_button_val)) + delim2 <- unlist(gregexpr(' ', return_button_val)) + delim3 <- unlist(gregexpr(';', return_button_val))[1] + ret_tab_num <- as.integer(substr(return_button_val, delims[1]+1, delim2[2]-1)) + ret_mod_num <- as.integer(substr(return_button_val, delims[2]+1, delim3-1)) + ret_mod_name <- app_meta2()$mod_name[app_meta2()$tab_num == ret_tab_num][ret_mod_num] + ret_name <- substr(return_button_val, delim2[length(delim2)]+1, nchar(return_button_val)) + + # Get the pkModReturnID for the above return (which will become an argument) + # Find tabModules row with... + ret_instance_ID <- qry_row( + 'tabModules', + rowConditions = list( + fkTabName = tabs_2_add()$pkTabName[ret_tab_num], #... matching tab name + fkModuleName = ret_mod_name, #... mathcing mod name + modOrder = ret_mod_num #... 
and matching mod order + ), + colConditions = 'pkInstanceID', + shinyMgrPath = shinyMgrPath + )[,] + + # Now, get the matching fkModArgID + mod_return_ID <- qryModReturns$pkModReturnID[ + (qryModReturns$functionReturnName == ret_name) & + (qryModReturns$fkModuleName == ret_mod_name) + ] + + # Get stitch ID with matching return instance ID + ret_stitch_id <- qry_row( + 'appStitching', + rowConditions = list( + fkInstanceID = ret_instance_ID, + fkModReturnID = mod_return_ID + ), + colConditions = 'pkStitchID', + shinyMgrPath = shinyMgrPath + )[,] + + rslt <- qry_insert( + 'appStitching', + data.frame( + fkAppName = data1$app_name(), + fkInstanceID = instanceID, + fkModArgID = qryModArguments$pkModArgID[i_mod_arg], + fkModReturnID = NA, + fkStitchID = ret_stitch_id + ), + shinyMgrPath + ) + } + } + + # Run stitching script + stitch_script(data1$app_name(), shinyMgrPath) + + # Show that the App has been Added + showModal(modalDialog( + title = "Update Success", + "Your App has been written and added to the database. 
You will need to + reset shinymgr (log-out and log back in) to load your new app before + using it to run a new analysis.", + footer = modalButton("OK"), + easyClose = TRUE + )) + + # Trigger builder reset + needs_reset$reset <- reactive(TRUE) + }) + + observeEvent(input$reset_button1, { + needs_reset$reset <- reactive(TRUE) + }) + + observeEvent(input$reset_button2, { + needs_reset$reset <- reactive(TRUE) + }) + + observeEvent(input$reset_button3, { + needs_reset$reset <- reactive(TRUE) + }) + + observeEvent(needs_reset$reset, { + + if (needs_reset$reset()) { + + # Reset tab enabling/disabling + delay(50, { + js$disableTab("tab2") + js$disableTab("tab3") + js$disableTab("tab4") + }) + + # Reset all internal variables + app_meta(app_meta()[0,]) + app_meta2(app_meta2()[0,]) + available_mods(rbind( + available_mods()[0,], + data.frame( + mod_name = n_args$mod_name[n_args$ct==0] + ) + )) + # Reset running list of tabs + tabs_2_add(rbind( + tabs_2_add()[0,], + data.frame( + pkTabName = character(0), + tabDisplayName = character(0), + tabInstructions = character(0), + tabNotes = character(0), + stringsAsFactors = FALSE + ) + )) + + available_args(available_args()[0,]) + + # Reset counters + mod_ct(mod_ct()*0) + tab_ct(tab_ct()*0) + ret_ct(ret_ct()*0) # Number of returns encountered + + delay(50, {shinyjs::hide('start')}) + } + }) + + }) + return(needs_reset) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/my_db.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/my_db.R new file mode 100644 index 0000000000..a7abf2163a --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/my_db.R @@ -0,0 +1,67 @@ +my_db_ui <- function(id) { + fpath <- system.file("extdata", "shinyTable.csv", package = "shinymgr") + shinyTable <- read.csv(fpath, na.strings = "") + tagList( + wellPanel( + id = "database", + textOutput(outputId = NS(id, id = "welcome")) #end textOutput + ), #end wellPanel + do.call( + tabsetPanel, + c( + id = NS(id, id = "main_tab"), + type = "tabs", + 
lapply( + X = unique(shinyTable$primaryTab), + FUN = function(X) { + tabPanel( + title = X, + do.call( + tabsetPanel, + c( + id = paste0(X, "_tabgroup"), + type = "tabs", + lapply( + X = shinyTable$fkTableName[shinyTable$primaryTab == X], + FUN = function(i) { + tabPanel( + title = i, + value = i, + uiOutput(NS(id, paste0(i, "_output"))) + ) + } + ) #end inner lapply + ) + ) #end inner tabSetPanel + ) #end primary tabPanel + } + ) #end outer lapply + ) #end arguments + ) + ) #end tagList +} #end ui function + +my_db_server <- function(id, con) { + moduleServer( + id, + function(input, output, session) { + + #welcome message + output$welcome <- renderText( + expr = "This database page maintains a constant connection to the .sqlite database + and is only disconnected when you navigate away from this tab." + ) + + #creating content for each tab + lapply( + X = DBI::dbListTables( + conn = con() + ), #end query + FUN = function(X) { + output[[paste0(X, "_output")]] <- renderUI({table_ui(X)}) + } + ) + + } #end moduleServer function + ) #end moduleServer +} #end server function diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/new_analysis.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/new_analysis.R new file mode 100644 index 0000000000..3a24d0a5b1 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/new_analysis.R @@ -0,0 +1,125 @@ + +new_analysis_ui <- function(id) { + ns <- NS(id) + tagList( + uiOutput(ns("apps_available")), + shinyjs::disabled(actionButton( + ns('start_new_analysis'), + 'Select New Analysis' + )), + actionButton( + ns('reset_new_analysis'), + 'Reset Analysis' + ), + wellPanel( + tags$div(id = 'app_ui_goes_here') + ) + ) +} + +new_analysis_server <- function(id, tabSelect, shinyMgrPath) { + moduleServer(id, function(input, output, session) { + ns_ct <- reactiveVal(1) + ns <- session$ns + + # Making sure buttons are enabled properly, when required + needs_reset <- reactiveValues(reset = reactive(TRUE)) + observe({ + if 
(needs_reset$reset()) { + shinyjs::enable('select_analysis') + shinyjs::enable('start_new_analysis') + needs_reset$reset <- reactive(FALSE) + } + }) + + # Get a list of available apps (from the database) + analysis_list <- sort(qry_row( + 'apps', + list( + appActive = 1 + ), + 'pkAppName', + shinyMgrPath + )$pkAppName) + + # Get a list of available apps (from the directory contents) + # NOTE: Un-comment the below code (and delete the code above) after development + # is complete to remove the dependency on the shinymgr sqlite database. + # analysis_list <- sort(tools::file_path_sans_ext(dir('modules_app'))) + + # Render selectizeInput for choosing which analysis to launch + output$apps_available <- renderUI({ + shinyjs::disabled(selectizeInput( + ns("select_analysis"), + "Select an analysis", + choices = analysis_list, + multiple = TRUE, + selected = NULL, + options = list(placeholder = 'Select an analysis to run.', maxItems = 1) + )) + }) + + observeEvent(input$start_new_analysis, { + + req(input$select_analysis) + + shinyjs::disable('start_new_analysis') + shinyjs::disable('select_analysis') + + # Insert the UI for the App + insertUI( + selector = '#app_ui_goes_here', + where = "beforeEnd", + ui = tags$div( + id = 'app_ui', + tagList( + eval( + parse( + text = paste0( + input$select_analysis, + '_ui(ns("mod_', + ns_ct(), + '"))' + ) + ) + ) + ) + ) + ) + + # Run the server for the App + eval( + parse( + text = paste0( + 'rslt <- ', + input$select_analysis, + '_server("mod_', + ns_ct(), + '", shinyMgrPath)' + ) + ) + ) + + ns_ct(ns_ct() + 1) # Increment namespace counter + + }) + + # Reset the analysis + observeEvent(input$reset_new_analysis, { + req(input$start_new_analysis) + needs_reset$reset <- reactive(TRUE) + + removeUI( + selector = '#app_ui', + session = session + ) + }) + + # Needed to re-enable buttons on tab change before "start" button clicked once + observeEvent(tabSelect(), { + if (!is.null(input$start_new_analysis) && input$start_new_analysis == 0) 
{ + needs_reset$reset <- reactive(TRUE) + } + }) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/new_report.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/new_report.R new file mode 100644 index 0000000000..b91566ed8b --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/new_report.R @@ -0,0 +1,222 @@ + +new_report_ui <- function(id) { + ns <- NS(id) + tagList( + wellPanel( + fluidRow( + column( + 4, + tags$h2('1. Select Report'), + selectizeInput( + ns("select_app"), + "Select an app", + choices = dir('reports'), + multiple = TRUE, + selected = NULL, + options = list(placeholder = 'Select an app to see available reports.', maxItems = 1) + ), + selectInput( + ns('select_report'), + 'Select a report', + choices = character(0) + ) + ), + column( + 4, + tags$h2('2. Select Analysis Parameters'), + uiOutput(ns('param_inputs')) + # fileInput( + # ns("the_analysis"), + # "Choose RDS File", + # multiple = FALSE, + # accept = ".RDS" + # ) + ), + column( + 4, + tags$h2('3. 
Generate Report'), + wellPanel( + fluidRow( + column( + 6, + tags$h4('Download Report'), + selectInput( + ns('output_type'), + label = NULL, + choices = c('pdf', 'html', 'word') + ), + downloadButton( + ns('generate_report'), + 'Download Report' + ) + ), + column( + 6, + tags$h4('View Report'), + actionButton( + ns('view_report'), + 'View in browser' + ) + ) + ) + ) + ) + ) + ), + uiOutput(ns('the_report')) + ) +} + +new_report_server <- function(id) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + + observeEvent(input$select_app, { + reports_avail <- list.files( + paste('reports', input$select_app, sep = '/'), + pattern = ".Rmd$" + ) + + updateSelectInput( + session, + 'select_report', + 'Select a report', + choices = reports_avail + ) + + output$the_report <- renderUI({""}) + }) + + observeEvent(input$select_report, { + req(input$select_report) + output$param_inputs <- renderUI({ + #get yaml header + tempReport <- file.path(tempdir(), "report.Rmd") + file.copy( + paste('reports', input$select_app, input$select_report, sep = '/'), + tempReport, + overwrite = TRUE + ) + rmdString <- readLines(tempReport) + indices <- which(rmdString == "---") + yamlheader <- rmdString[(min(indices)+1):(max(indices)-1)] + yamllist <- yaml::yaml.load(yamlheader) + #create taglist of ui stuff + do.call(tagList, args = + lapply( + X = names(yamllist$params), + FUN = function(X) { + + yamlName <- X + + #switch function to create all the ui stuff + if (is.null(yamllist$params[[X]]$input)) { + yamlInput <- yamllist$params[[X]] + do.call(textInput, args = c(inputId = ns(yamlName), yamlInput)) + } else { + #get rid of the input for shiny arguments + yamlInput <- yamllist$params[[X]][-which(names(yamllist$params[[X]]) == "input")] + switch( + yamllist$params[[X]]$input, + text = do.call(textInput, args = c(inputId = ns(yamlName), yamlInput)), + slider = do.call(sliderInput, args = c(inputId = ns(yamlName), yamlInput)), + checkbox = do.call(checkboxInput, args = 
c(inputId = ns(yamlName), yamlInput)), + numeric = do.call(numericInput, args = c(inputId = ns(yamlName), yamlInput)), + date = do.call(dateInput, args = c(inputId = ns(yamlName), yamlInput)), + select = do.call(selectInput, args = c(inputId = ns(yamlName), yamlInput)), + file = { + yamlInput <- yamlInput[-which(names(yamlInput) == "value")] + do.call(fileInput, args = c(inputId = ns(yamlName), yamlInput)) + }, + do.call(textInput, args = c(inputId = ns(yamlName), yamlInput)) + ) + } + } + ) + ) + }) + }) + + output$generate_report <- downloadHandler( + filename = function() { + paste( + 'report', + ifelse( + input$output_type == 'word', + 'doc', + input$output_type + ), + sep = '.' + ) + }, + content = function(file) { + tempReport <- file.path(tempdir(), "report.Rmd") + file.copy( + paste('reports', input$select_app, input$select_report, sep = '/'), + tempReport, + overwrite = TRUE + ) + rmdString <- readLines(tempReport) + indices <- which(rmdString == "---") + yamlheader <- rmdString[(min(indices)+1):(max(indices)-1)] + yamllist <- yaml::yaml.load(yamlheader) + newParams <- list() + for (X in names(yamllist$params)) { + if (!is.null(yamllist$params[[X]]$input)) { + if (yamllist$params[[X]]$input == "file") { + newParams[[X]] <- input[[X]]$datapath + } else { + newParams[[X]] <- input[[X]] + } + } else { + newParams[[X]] <- input[[X]] + } + } + rmarkdown::render( + tempReport, + output_file = file, + output_format = paste(input$output_type, 'document', sep = '_'), + params = newParams, + envir = new.env(parent = globalenv()) + ) + } + ) + + observeEvent(input$view_report, { + req(input$select_report) + output$the_report <- renderUI({ + + #get yaml header (again) + tempReport <- file.path(tempdir(), "report.Rmd") + file.copy( + paste('reports', input$select_app, input$select_report, sep = '/'), + tempReport, + overwrite = TRUE + ) + rmdString <- readLines(tempReport) + indices <- which(rmdString == "---") + yamlheader <- 
rmdString[(min(indices)+1):(max(indices)-1)] + yamllist <- yaml::yaml.load(yamlheader) + #create list of params + newParams <- list() + for (X in names(yamllist$params)) { + if (!is.null(yamllist$params[[X]]$input)) { + if (yamllist$params[[X]]$input == "file") { + newParams[[X]] <- input[[X]]$datapath + } else { + newParams[[X]] <- input[[X]] + } + } else { + newParams[[X]] <- input[[X]] + } + } + the_report <- rmarkdown::render( + tempReport, + output_format = 'html_document', + params = newParams + ) + includeHTML(the_report) + }) + }) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/queries.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/queries.R new file mode 100644 index 0000000000..c92f7b2c35 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/queries.R @@ -0,0 +1,109 @@ +queries_ui <- function(id) { + ns <- NS(id) + tagList( + uiOutput(ns("dropdowns")), + fluidRow( + column( + 3, + actionButton( + inputId = ns("app_flow"), + label = "Query App Flow" + ) + ), + column( + 3, + actionButton( + inputId = ns("app_stitching"), + label = "Query App Stitching" + ) + ), + column( + 3, + actionButton( + inputId = ns("mod_info"), + label = "Query Mod Info" + ) + ) + ), + h2("Query Result:"), + reactableOutput(ns("query_result")) + ) +} + + +queries_server <- function(id, shinyMgrPath) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + #app/mod dropdown + output$dropdowns <- renderUI({ + fluidRow( + column( + 6, + selectInput( + inputId = ns("app_name"), + label = "Select an app to query:", + choices = sort(qry_row( + tableName = 'apps', + colConditions = 'pkAppName', + shinyMgrPath = shinyMgrPath + )$pkAppName) + ) + ), + column( + 6, + selectInput( + inputId = ns("mod_name"), + label = "Select a mod to query:", + choices = sort(qry_row( + tableName = 'modules', + colConditions = 'pkModuleName', + shinyMgrPath = shinyMgrPath + )$pkModuleName) + ) + ) + ) + }) + + #query app flow + observeEvent(input$app_flow, { + 
output$query_result <- renderReactable( + isolate( + reactable( + data = qry_app_flow( + appName = input$app_name, + shinyMgrPath = shinyMgrPath + ) + ) + ) + ) + }) + + #query app stitching + observeEvent(input$app_stitching, { + output$query_result <- renderReactable( + isolate( + reactable( + data = qry_app_stitching( + appName = input$app_name, + shinyMgrPath = shinyMgrPath + ) + ) + ) + ) + }) + + observeEvent(input$mod_info, { + output$query_result <- renderReactable( + isolate( + reactable( + data = qry_mod_info( + modName = input$mod_name, + shinyMgrPath = shinyMgrPath + ) + ) + ) + ) + }) + + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/save_analysis.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/save_analysis.R new file mode 100644 index 0000000000..23023d0d53 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/save_analysis.R @@ -0,0 +1,105 @@ +#!! ModName = save_analysis +#!! ModDisplayName = Save Analysis +#!! ModDescription = Save the inputs and results of an analysis as an RDS file in a location chosen by the user +#!! ModCitation = Baggins, Bilbo. 2021. save_analysis. Source code. +#!! ModNotes = +#!! ModActive = 1 +#!! FunctionArg = input !! all user inputs from each module !! list +#!! FunctionArg = returns !! all user returns from each module !! 
list + +save_analysis_ui <- function(id) { + ns <- NS(id) + tagList( + textInput( + inputId = ns("username"), + label = "Enter your name here:" + ), + br(), + textAreaInput( + inputId = ns("notes"), + label = "Notes", + placeholder = "Notes about this analysis" + ), + br(), + downloadButton( + outputId = ns("save"), + label = "Save Analysis as RDS" + ), + br() + ) +} + +save_analysis_server <- function(id, appName, moduleInput, returns, metadata) { + moduleServer(id, function(input, output, session) { + output$save <- downloadHandler( + filename = function() { + paste0(appName, "_", input$username, "_", format(Sys.time(), "%Y_%m_%d_%H_%M"), ".RDS") + }, + content = function(file) { + + # Initialize variable to hold analysis data + analysis <- list( + analysisName = paste0( + appName, "_", + input$username, "_", + format(Sys.time(), "%Y_%m_%d_%H_%M") + ) + ) + + analysis[['app']] <- appName + analysis[['username']] <- input$username + + inputNames <- names(moduleInput) + for (i_return in 1:length(returns)) { + for (i_input in grep(paste0('mod',i_return), inputNames)) { + analysis[[inputNames[i_input]]] <- moduleInput[[inputNames[i_input]]] + } + } + + analysis[['returns']] <- returns # Add return data to analysis + analysis[['notes']] <- input$notes + analysis[['timestamp']] <- Sys.time() + analysis[['metadata']] <- metadata #Add metadata + + #save whole app code + app_code <- paste( + readLines( + paste0( + getwd(), + '/modules_app/', + appName, + ".R" + ) + ), + collapse = '\n' + ) + + analysis[['app_code']] <- app_code + + #save whole module codes for unique modules + allMods <- vector() + for (i_mod in grep('mod', names(metadata))) { + allMods <- c(allMods, metadata[[i_mod]][["modName"]]) + } + uniqueMods <- unique(allMods) + for (modName in uniqueMods) { + modCode <- paste( + readLines( + paste0( + getwd(), + '/modules/', + modName, + ".R" + ) + ), + collapse = '\n' + ) + analysis[[paste0(modName, "_code")]] <- modCode + } + + #save everything to an rds + 
saveRDS(analysis, file = file) + } + ) + }) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/stitch_script.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/stitch_script.R new file mode 100644 index 0000000000..82912379c9 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/stitch_script.R @@ -0,0 +1,521 @@ +#' @name stitch_script +#' @aliases stich_script +#' @title Stitching script (for building apps with shinymgr) +#' @description The stitching script is called internally by shinymgr to write the +#' R script for an app, based on instructions received from the shinymgr app builder. +#' After the "Lock in stitch" button is clicked in the app builder, instructions for +#' building the app are populated into the shinymgr database before the app's name and +#' shinymgr database file path are passed to the stitching script. The resulting app +#' synthesized by the stitching script are saved to the "modules_app" sub-directory +#' of the current working directory (the directory populated by shinymgr_setup). +#' +#' This function is called internaly by shinymgr and not indended to be called +#' directly by a user. But, this script can be modified to change any aspect of +#' app's built with shinymgr. +#' @param app_name The app name to stitch +#' @param shinyMgrPath File path to the main shiny manager project directory +#' @usage stitch_script(app_name, shinyMgrPath) +#' @inheritSection rerun_analysis Tutorials +#' @inherit rerun_analysis references +#' @importFrom DBI dbAppendTable +#' @importFrom DBI dbConnect +#' @importFrom DBI dbDisconnect +#' @importFrom RSQLite dbClearResult +#' @importFrom RSQLite SQLite +#' @family stitching + + +stitch_script <- function(app_name, shinyMgrPath) { + # Add line function + add_line <- function(line_code = "", ...) 
{ + write(paste0(line_code, ...), fileConn, append = T) + } + + # Returns "nSpaces" extra indentations, where an indentation equals two spaces + indent <- function(nSpaces) { + paste(rep(' ',nSpaces), collapse = '') + } + + # STITCH THIS THING UP! ----------------- + + # connect to database for running initial queries + conx <- DBI::dbConnect( + drv = RSQLite::SQLite(), + dbname = paste0(shinyMgrPath, "/database/shinymgr.sqlite") + ) + + # Run queries to fetch instructions for buildling the app from the database + + appTabs <- DBI::dbGetQuery( + conx, + statement = paste0( + 'SELECT appTabs.fkAppName, appTabs.fkTabName, appTabs.tabOrder, + tabs.tabDisplayName, tabs.tabInstructions, tabs.tabNotes + FROM tabs INNER JOIN appTabs ON tabs.pkTabName = appTabs.fkTabName + WHERE (((appTabs.fkAppName)="', app_name, '")) + ORDER BY appTabs.tabOrder;' + ) + ) + + tab_names <- appTabs$fkTabName[sort(appTabs$tabOrder)] + n_tabs <- length(tab_names) + + appTabMods <- DBI::dbGetQuery( + conx, + statement = paste0( + "SELECT appTabs.fkAppName, appTabs.fkTabName, + appTabs.tabOrder, tabModules.fkModuleName, tabModules.modOrder + FROM appTabs + INNER JOIN tabModules + ON appTabs.fkTabName = tabModules.fkTabName + WHERE (appTabs.fkAppName='", app_name, "') + ORDER BY appTabs.tabOrder, tabModules.modOrder;" + ) + ) + + mod_names <- unique(appTabMods$fkModuleName) # Grab mod names from here (OK for now, double-check) + + modReturns <- qry_row('modFunctionReturns', shinyMgrPath = shinyMgrPath) + + # Stitching data query and pre-processing + appStitching <- DBI::dbGetQuery( + conx, + paste0( + 'SELECT appStitching.fkAppName, tabModules.fkTabName, + tabModules.fkModuleName, tabModules.modOrder, + modFunctionArguments.functionArgName, + modFunctionReturns.functionReturnName, appStitching.pkStitchID, + appStitching.fkStitchID, appTabs.tabOrder + FROM (tabModules + INNER JOIN ((appStitching + LEFT JOIN modFunctionArguments + ON appStitching.fkModArgID = modFunctionArguments.pkModArgID) + 
LEFT JOIN modFunctionReturns + ON appStitching.fkModReturnID = modFunctionReturns.pkModReturnID) + ON tabModules.pkInstanceID = appStitching.fkInstanceID) + INNER JOIN appTabs ON tabModules.fkTabName = appTabs.fkTabName + WHERE (((appStitching.fkAppName)="', app_name, '"));' + ) + ) + + DBI::dbDisconnect(conx) + + # Establish the ordering of argument vs. return rows of the "stitching" table + m <- match(appStitching$fkStitchID, appStitching$pkStitchID) + # ARGUMENTS (IN ORDER) + stitch_arg <- appStitching[is.finite(m), c('fkTabName', 'tabOrder', 'fkModuleName', 'modOrder', 'functionArgName')] + names(stitch_arg) <- c('tabNameARG', 'tabOrderARG', 'modNameARG', 'modOrderARG', 'argName') + # RETURNS (IN ORDER) + stitch_return <- appStitching[m[is.finite(m)], c('fkTabName', 'tabOrder', 'fkModuleName', 'modOrder', 'functionReturnName')] + names(stitch_return) <- c('tabNameRET', 'tabOrderRET', 'modNameRET', 'modOrderRET', 'ReturnName') + # Stitch Instructions + stitch <- cbind(stitch_arg, stitch_return) + + # Begin stitching + file.create(paste0(shinyMgrPath, '/modules_app/', app_name, '.R')) + fileConn <- file(paste0(shinyMgrPath, '/modules_app/', app_name, '.R'), "w") # NOTE: globally accessed via add_line + + # Disclaimer header + add_line( + '# This script was automatically generated by the shinymgr R package\'s App Builder on ', + Sys.time(), '.' 
+ ) + add_line('# For more information, visit: https://code.usgs.gov/vtcfwru/shinymgr') + + # Javascript for enabling/disabling tabs + add_line('jscode <- "') + add_line('shinyjs.disableTab = function(name) {') + add_line("var tab = $('.nav li a[data-value=' + name + ']');") + add_line("tab.bind('click.tab', function(e) {") + add_line('e.preventDefault();') + add_line('return false;') + add_line('});') + add_line("tab.addClass('disabled');") + add_line('}') + add_line() + add_line('shinyjs.enableTab = function(name) {') + add_line("var tab = $('.nav li a[data-value=' + name + ']');") + add_line("tab.unbind('click.tab');") + add_line("tab.removeClass('disabled');") + add_line('}') + add_line('"') + add_line() + add_line('css <- "') + add_line('.nav li a.disabled {') + add_line('background-color: #bbb !important;') + add_line('border-color: #ccc !important;') + add_line('cursor: not-allowed !important;') + add_line('}"') + add_line() + add_line() + + # Open up the UI function + add_line(indent(0), app_name, "_ui <- function(id) {") + add_line(indent(1), "ns <- NS(id)") + add_line(indent(1), "tagList(") + add_line(indent(2), "fluidPage(") + + # Add custom css + theme <- qry_row( + 'apps', + rowConditions = list(pkAppName = app_name), + 'appCSS', + shinyMgrPath = shinyMgrPath + ) + if (!is.na(theme) & theme != '') { + if (grepl("css", theme)) { + add_line(indent(3), 'theme = "', theme, '",') + } else { + add_line(indent(3), 'theme = shinythemes::shinytheme("', theme, '"),') + } + + } + + add_line(indent(3), 'useShinyjs(),') + add_line(indent(3), "extendShinyjs(text = jscode, functions = c('disableTab','enableTab')),") + add_line(indent(3), 'inlineCSS(css),') + add_line(indent(3), 'actionButton(') + add_line(indent(4), 'ns("start"),') + add_line(indent(4), '"Start New Analysis",') + add_line(indent(4), 'onclick = "var $btn=$(this); setTimeout(function(){$btn.remove();},0);"') + add_line(indent(3), "),") + add_line(indent(3), "uiOutput(ns('test'))") + add_line(indent(2), 
")") + add_line(indent(1), ')') + add_line(indent(0), '}') + + # Now for the server part + add_line(indent(0), app_name, "_server <- function(id, userID, shinyMgrPath) {") + add_line(indent(1), 'moduleServer(id, function(input, output, session) {') + add_line(indent(2), 'ns <- session$ns') + add_line(indent(2), 'observeEvent(input$start, {') + add_line() + add_line(indent(3), "disable('start')") + add_line() + add_line(indent(3), 'output$test <- renderUI({') + add_line(indent(4), 'tagList(') + add_line(indent(5), 'tabsetPanel(') + add_line(indent(6), 'id = ns("mainTabSet"),') + + mod_counter <- 1 + tab_counter <- 1 + + for (i_tab in 1:n_tabs) { + tab_id_name <- tab_names[i_tab] + tab_disp_name <- appTabs$tabDisplayName[appTabs$fkTabName == tab_id_name] + add_line(indent(6), 'tabPanel(') + add_line(indent(7), '"', tab_disp_name ,'", ') + add_line(indent(7), 'value = "tab', tab_counter, '",') + + # Add tab instructions + tab_instructions <- appTabs$tabInstructions[appTabs$fkTabName == tab_id_name] + + # Add tab instructions (if there are any) + if (!is.na(tab_instructions)) { + add_line(indent(7), 'tags$br(),') + add_line(indent(7), 'wellPanel(') + add_line(indent(8), 'style = "background: skyblue",') + add_line(indent(8), '"', gsub('"', '\\\\"', tab_instructions), '"') + add_line(indent(7), '),') + } + + tab_mods <- appTabMods$fkModuleName[appTabMods$fkTabName == tab_id_name] + for (tab_mod in tab_mods) { + add_line(indent(7), tab_mod, '_ui(ns("mod', mod_counter ,'")),') + mod_counter <- mod_counter + 1 + } + add_line(indent(7), 'fluidRow(') + if (tab_counter > 1) { + add_line(indent(8), "actionButton(ns('previous_tab_", tab_counter, '\'), label = "Previous"),') + } + if (tab_counter <= n_tabs) { + add_line(indent(8), "actionButton(ns('next_tab_", tab_counter, '\'), label = "Next")') + } + add_line(indent(7), ')') + add_line(indent(6), '),') + tab_counter <- tab_counter + 1 + } + + + add_line(indent(6), 'tabPanel(') + add_line(indent(7), '"Save",') + 
add_line(indent(7), 'value = "tab', n_tabs + 1, '",' ) + add_line(indent(7), 'save_analysis_ui(ns("mod', mod_counter ,'")),') + add_line(indent(7), 'tags$br(),') + add_line(indent(7), 'tags$br(),') + add_line(indent(7), 'fluidRow(') + add_line(indent(8), 'actionButton(ns("previous_tab_', n_tabs+1, '"), label = "Previous")') + add_line(indent(7), ')') + add_line(indent(6), ')') + add_line(indent(5), ')') + add_line(indent(4), ')') + add_line(indent(3), '})') + + # Disables all but first tab on launch + add_line(indent(3), 'delay(50, {') + for (i in 2:(n_tabs+1)) { + add_line(indent(4), 'js$disableTab("tab', i, '")') + } + add_line(indent(3), '})') + add_line(indent(2), '})') + add_line() + + data_counter <- 1 + mod_counter <- 1 + + # Keep track of the output name being assigned to each mod + mod_to_output <- data.frame( + tab_num = numeric(0), + mod_order = numeric(0), + return_name = character(0) + ) + + for (i_tab in 1:n_tabs) { + tab_id_name <- tab_names[i_tab] + tab_disp_name <- appTabs$tabDisplayName[appTabs$fkTabName == tab_id_name] + tab_mods <- appTabMods$fkModuleName[appTabMods$fkTabName == tab_id_name] + mod_orders <- appTabMods$modOrder[appTabMods$fkTabName == tab_id_name] + + for (i_mod in 1:length(tab_mods)) { + mod_name <- tab_mods[i_mod] + mod_order <- mod_orders[i_mod] + + if (mod_name %in% modReturns$fkModuleName) { + prefix <- paste0(indent(2), 'data', data_counter,' <- ') + + mod_to_output <- rbind(mod_to_output, data.frame( + tab_num = i_tab, + mod_order = mod_order, + return_name = paste0('data', data_counter)) + ) + data_counter <- data_counter + 1 + } else { + prefix <- indent(2) + } + + prefix <- paste0(prefix, mod_name, '_server("mod', mod_counter, '"') + mod_counter <- mod_counter + 1 + + i_args <- which(stitch$tabOrderARG == i_tab & stitch$modOrderARG == mod_order) + + # Check if inputs are required, and add arguments accordingly + for (i_arg in i_args) { + temp <- which(mod_to_output$tab_num == stitch$tabOrderRET[i_arg] & 
mod_to_output$mod_order == stitch$modOrderRET[i_arg]) + correct_arg_name <- stitch$argName[i_arg] # argument name + correct_data_var <- mod_to_output$return_name[temp] + correct_return_name <- stitch$ReturnName[i_arg] + prefix <- paste0( + prefix, + ', ', + correct_arg_name, + ' = ', + correct_data_var, + '$', + correct_return_name + ) + } + prefix <- paste0(prefix, ')') + add_line(prefix) + } + } + + # Save analysis server function -------------- + add_line(indent(2), 'save_analysis_server("mod', mod_counter,'",') + add_line(indent(3), 'appName = "', app_name, '",') + add_line(indent(3), 'moduleInput = input,') + add_line(indent(3), 'returns = list(') + + data_counter <- 1 + # Add returns argument ----------- + for (i in 1:nrow(appTabMods)) { + if (appTabMods[i, 'fkModuleName'] %in% modReturns$fkModuleName) { + + add_line(indent(4), 'data', data_counter, ' = list(') + + fct_returns <- qry_row( + tableName = 'modFunctionReturns', + rowConditions = data.frame( + fkModuleName = appTabMods[i,'fkModuleName'] + ), + colConditions = 'functionReturnName', + shinyMgrPath = shinyMgrPath + ) + + for (j in 1:nrow(fct_returns)) { + fct_return <- fct_returns[j,] + if (j < nrow(fct_returns) & nrow(fct_returns) >= 2) { + add_line(indent(5), fct_return, ' = data', data_counter, '$', fct_return, '(),') + } else { + add_line(indent(5), fct_return, ' = data', data_counter, '$', fct_return, '()') + } + } + + if (i < nrow(appTabMods) & + nrow(appTabMods) >= 2 & + sum(appTabMods[(i+1):nrow(appTabMods), 'fkModuleName'] %in% modReturns$fkModuleName) >= 1 + ) { + add_line(indent(4), '),') + data_counter <- data_counter + 1 + } else { + add_line(indent(4), ')') + } + } + } + + add_line(indent(3), '),') + #metadata argument + add_line(indent(3), 'metadata = list(') + #add app description + appInfo <- qry_row( + "apps", + rowConditions = list(pkAppName = app_name), + shinyMgrPath = shinyMgrPath + ) + add_line(indent(4), 'appDescription = "', appInfo[1, 'appDescription'], '",') + + #loop 
through modules, add row for each mod + data_counter <- 1 + for (i_mods in 1:nrow(appTabMods)) { + #get info for that mod + modInfo <- qry_row( + "modules", + rowConditions = list(pkModuleName = appTabMods[i_mods, 'fkModuleName']), + shinyMgrPath = shinyMgrPath + ) + + modArguments <- qry_row( + "modFunctionArguments", + rowConditions = list(fkModuleName = appTabMods[i_mods, 'fkModuleName']), + shinyMgrPath = shinyMgrPath + ) + + modReturns <- qry_row( + "modFunctionReturns", + rowConditions = list(fkModuleName = appTabMods[i_mods, 'fkModuleName']), + shinyMgrPath = shinyMgrPath + ) + + modPackages <- qry_row( + 'modPackages', + rowConditions = list(fkModuleName = appTabMods[i_mods, 'fkModuleName']), + shinyMgrPath = shinyMgrPath + ) + + add_line(indent(4), 'mod', i_mods, ' = list(') + #if has returns, include data counter, otherwise don't + if (appTabMods[i_mods, 'fkModuleName'] %in% modReturns$fkModuleName) { + add_line(indent(5), 'dataset = "data', data_counter, '",') + data_counter <- data_counter + 1 + } else { + add_line(indent(5), 'dataset = "no returns",') + } + #mod name + add_line(indent(5), 'modName = "', modInfo[1, 'pkModuleName'], '",') + #mod display name + add_line(indent(5), 'modDisplayName = "', modInfo[1, 'modDisplayName'], '",') + #mod description + add_line(indent(5), 'modDescription = "', modInfo[1, 'modDescription'], '",') + #mod arguments + if (nrow(modArguments) > 0) { + add_line(indent(5), 'modArguments = data.frame(') + add_line(indent(6), 'name = c("', paste(modArguments$functionArgName, collapse = '","'), '"),') + add_line(indent(6), 'class = c("', paste(modArguments$functionArgClass, collapse = '","'), '"),') + add_line(indent(6), 'description = c("', paste(modArguments$description, collapse = '","'), '")') + add_line(indent(5), '),') + } else { + add_line(indent(5), 'modArguments = "This module has no additional arguments",') + } + #mod returns + if (nrow(modReturns) > 0) { + add_line(indent(5), 'modReturns = data.frame(') + 
add_line(indent(6), 'name = c("', paste(modReturns$functionReturnName, collapse = '","'), '"),') + add_line(indent(6), 'class = c("', paste(modReturns$functionReturnClass, collapse = '","'), '"),') + add_line(indent(6), 'description = c("', paste(modReturns$description, collapse = '","'), '")') + add_line(indent(5), '),') + } else { + add_line(indent(5), 'modReturns = "This module has no returns",') + } + #mod packages + if (nrow(modPackages) > 0) { + add_line(indent(5), 'modPackages = data.frame(') + add_line(indent(6), 'name = c("', paste(modPackages$packageName, collapse = '","'), '"),') + add_line(indent(6), 'version = c("', paste(modPackages$version, collapse = '","'), '")') + add_line(indent(5), ')') + } else { + add_line(indent(5), 'modPackages = "This module has no package dependencies"') + } + #closing bracket for mod + if (i_mods == nrow(appTabMods)) { + add_line(indent(4), ')') + } else { + add_line(indent(4), '),') + } + } #end metadata list + add_line(indent(3), ')') + add_line(indent(2), ')') + + for (i_tab in 1:(n_tabs+1)) { + # Add the "next" button logic + if (i_tab <= n_tabs) { + add_line(indent(2), 'observeEvent(input$next_tab_', i_tab,', {') + add_line(indent(3), "js$enableTab('tab", i_tab+1,"')") + add_line(indent(3), "js$disableTab('tab", i_tab,"')") + add_line(indent(3), "updateTabsetPanel(") + add_line(indent(4), "session, 'mainTabSet',") + add_line(indent(4), "selected = 'tab", i_tab+1,"'") + add_line(indent(3), ")") + add_line(indent(2), "})") + } + + # Add the "previous" button logic + if (i_tab > 1) { + add_line(indent(2), 'observeEvent(input$previous_tab_', i_tab,', {') + add_line(indent(3), 'delay(50, {') + add_line(indent(4), "js$enableTab('tab", i_tab-1,"')") + add_line(indent(4), "js$disableTab('tab", i_tab,"')") + add_line(indent(3), '})') + + + # Only insert a tab if not the last ("save") tab + if (i_tab <= n_tabs) { + add_line(indent(3), "removeTab('mainTabSet','tab", i_tab,"',session)") + add_line(indent(3), "insertTab(") + 
add_line(indent(4), "inputId = 'mainTabSet',") + add_line(indent(4), "tab = tabPanel(") + add_line(indent(5), 'title = "', appTabs$tabDisplayName[appTabs$fkTabName == tab_names[i_tab]],'",') + add_line(indent(5), 'value = "tab', i_tab,'",') + tab_mods <- appTabMods$fkModuleName[appTabMods$tabOrder == i_tab] + for (i_mod in 1:length(tab_mods)) { + # Figure out what the proper namespace id is for the mods being replaced + ns_id <- which((appTabMods$tabOrder == i_tab) & (appTabMods$modOrder == i_mod)) + add_line(indent(5), appTabMods$fkModuleName[ns_id], '_ui(ns("mod', ns_id ,'")),') + } + add_line(indent(5), 'fluidRow(') + if (i_tab > 1) { + add_line(indent(6), "actionButton(ns('previous_tab_", i_tab, '\'), label = "Previous"),') + } + if (i_tab <= n_tabs) { + add_line(indent(6), "actionButton(ns('next_tab_", i_tab, '\'), label = "Next")') + } + add_line(indent(5), ')') + add_line(indent(4), "),") + add_line(indent(4), "target = 'tab", i_tab-1,"',") + add_line(indent(4), "position = 'after'") + add_line(indent(3), ")") + } + + add_line(indent(3), "updateTabsetPanel(") + add_line(indent(4), "session, 'mainTabSet',") + add_line(indent(4), " selected = 'tab", i_tab-1,"'") + add_line(indent(3), ")") + add_line(indent(2), "})") + } + } + + add_line(indent(1), '})') + add_line(indent(0), '}') + + # End Stitching + close(fileConn) + unlink(fileConn) +} diff --git a/_articles/RJ-2024-009/shinymgr/modules_mgr/table.R b/_articles/RJ-2024-009/shinymgr/modules_mgr/table.R new file mode 100644 index 0000000000..978a637f29 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/modules_mgr/table.R @@ -0,0 +1,351 @@ +table_ui <- function(id) { + tagList( + wellPanel( + id = "tableName", + style = "background: lightblue", + textOutput(outputId = NS(id, id = "table_instructions")) + ), #end wellPanel + uiOutput(NS(id, "buttons")), + br(), + reactableOutput(outputId = NS(id, id = "show_table")) + ) #end taglist +} #end ui function + +table_server <- function(id, con) { + moduleServer( + 
id, + function(input, output, session) { + ns <- session$ns + #conditional delete button for apps and modules only + if (id %in% c("apps", "modules", "reports")) { + output$buttons <- renderUI({ + fluidRow( + column( + width = 3, + actionButton( + class = "btn-danger", + inputId = ns("delete"), + label = "Delete record", + icon = icon("trash-alt") + ) + ), + column( + width = 3, + actionButton( + class = "btn-secondary", + inputId = ns("refresh"), + label = "Refresh Table", + icon = icon("sync") + ) + ) + ) + }) #end render UI + } else { + output$buttons <- renderUI({ + fluidRow( + column( + width = 3, + actionButton( + class = "btn-secondary", + inputId = ns("refresh"), + label = "Refresh Table", + icon = icon("sync") + ) + ) + ) + }) + } #end ifelse for buttons + + #get list of column names for the table + fieldlist <- DBI::dbListFields(conn = con(), name = id) + #make dataframe of subset of dbDictionary + fpath <- system.file("extdata", "dictionary.csv", package = "shinymgr") + fullDictionary <- read.csv(fpath, na.strings = "") + dictionary <- fullDictionary[fullDictionary$pkTableName == id,] + + + #get primary keys + primarykeys <- dictionary$pkFieldName[which(dictionary$pk == 1)] + + #get shinyTable entry + spath <- system.file("extdata", "shinyTable.csv", package = "shinymgr") + fullShinyTable <- read.csv(spath, na.strings = "") + shinyTable <- fullShinyTable[fullShinyTable$fkTableName == id,] + + + #instructions and set up table------------- + output$table_instructions <- renderText({ + shinyTable[1, "description"] + }) #end renderText + + # set a counter which will call the database when triggered + update_trigger <- reactiveVal(1) + db_table <- eventReactive( + eventExpr = {update_trigger()}, + valueExpr = { + dbTab <- DBI::dbReadTable(conn = con(), name = id) + columnOrder <- vector() + endCols <- vector() + for (i in 1:length(dictionary$pkFieldName)) { + if (!is.na(dictionary$sortOrder[i])) { + columnOrder <- c(columnOrder, dictionary$pkFieldName[i]) + } 
else { + endCols <- c(endCols, dictionary$pkFieldName[i]) + } + } + columnOrder <- c(columnOrder, fieldlist[which(fieldlist %in% endCols)]) + dbTab <- dbTab[columnOrder] + } + ) + + #setting up subTable ---------------------------- + if (nrow(shinyTable) != 0) { + if (shinyTable$subTable != "" & !is.na(shinyTable$subTable)) { + #get subDictionary + subDictionary <- fullDictionary[fullDictionary$pkTableName == shinyTable$subTable,] + + #get subTable + subTable <- eventReactive( + eventExpr = {update_trigger()}, + valueExpr = { + subTab <- DBI::dbGetQuery( + conn = con(), + statement = paste0( + "SELECT * FROM ", + shinyTable$subTable, + ";" + ) + ) + columnOrder <- subDictionary$pkFieldName + subTab <- subTab[columnOrder] + } + ) + + } + } + + #create reactable-------------------- + output$show_table <- renderReactable({ + reactable( + db_table(), + selection = "single", + filterable = TRUE, + details = function(i) { + if (nrow(shinyTable) != 0) { + if (shinyTable$subTable != "" & !is.na(shinyTable$subTable)) { + + #render reactable + output[[paste0("subTable_", i)]] <- renderReactable({ + reactable( + subTable()[subTable()[[ + subDictionary$pkFieldName[ + which(subDictionary$foreignKeyField %in% primarykeys) + ] + ]] == db_table()[[primarykeys]][i], ], + outlined = TRUE + ) + }) + + reactableOutput(NS(id, paste0("subTable_", i))) + } + } else { + NULL + } + }, + onClick = "select" + ) + }) + + #delete row button------------------- + observeEvent( + eventExpr = input$delete, + handlerExpr = { + #get selected row + rowIndex <- getReactableState(outputId = "show_table", name = "selected") + + if (is.null(rowIndex)) { + showModal( + modalDialog( + title = "No row selected.", + "Please select a row to delete.", + easyClose = TRUE + ) + ) + } else { + selectedRow <- db_table()[rowIndex[1],] + + if (id == "apps") { + showModal( + modalDialog( + title = "Confirm deletion of row:", + print(selectedRow[1,1]), + br(), + "Note that any associated tabs in the database will also 
be deleted.", + checkboxInput( + inputId = ns("fileDelete"), + label = "Delete any associated files", + value = FALSE + ), + footer = tagList( + actionButton( + inputId = NS(id, "confirm_delete"), + label = "Confirm", + class = "btn-danger" + ), + modalButton("Cancel") + ) + ) + ) #end modal + + } else { + showModal( + modalDialog( + title = "Confirm deletion of row:", + print(selectedRow[1,1]), + br(), + checkboxInput( + inputId = ns("fileDelete"), + label = "Delete any associated files", + value = FALSE + ), + footer = tagList( + actionButton( + inputId = NS(id, "confirm_delete"), + label = "Confirm", + class = "btn-danger" + ), + modalButton("Cancel") + ) + ) + ) #end modal + } + + } + } #end handlerExpr + ) #end delete row first button + + #delete confirmation---------------------- + observeEvent( + eventExpr = input$confirm_delete, + handlerExpr = { + #get selected row + rowIndex <- getReactableState(outputId = "show_table", name = "selected") + + selectedRow <- db_table()[rowIndex[1],] + selectedDict <- dictionary + tableName <- id + tablePk <- selectedDict$pkFieldName[which(selectedDict$pk == 1)] + + #create the delete statement + deleteStatement <- paste0( + "DELETE FROM ", + tableName, + " WHERE ") + + for (i in tablePk) { + deleteStatement <- paste0( + deleteStatement, + i, + " = '", + selectedRow[[i]], + "' AND " + ) + } #end loop + + deleteStatement <- substr(deleteStatement, 1, nchar(deleteStatement)-5) + deleteStatement <- paste0(deleteStatement, ";") + + #create another delete statement for tabs if deleting from the apps table + if (id == "apps") { + + #get list of tabs from database + tabNames <- DBI::dbGetQuery( + con(), + statement = paste0( + "SELECT fkTabName FROM appTabs WHERE fkAppName = '", + selectedRow[1,"pkAppName"], + "';" + ) + )$fkTabName #end query + + deleteTabStatement <- paste0( + "DELETE FROM tabs WHERE pkTabName IN ('", + paste(tabNames, collapse = "','"), + "');" + ) + } + + #delete the row (trycatch) + tryCatch( + expr = { + + 
deleted <- 0 + # Turn on SQLite foreign key constraints + rs <- DBI::dbSendQuery(con(), statement = "PRAGMA foreign_keys = ON;") + dbClearResult(rs) + if (id == "apps") { + deleted <- deleted + dbExecute(conn = con(), statement = deleteTabStatement) + } + deleted <- deleted + dbExecute(conn = con(), statement = deleteStatement) + + #only runs if deletion works - update tables + update_trigger(update_trigger() + 1) + # db_table <- db_table()[-selectedRow,] + + #delete app file if it's still in the directory + if (id == "apps" & input$fileDelete == TRUE) { + if (file.exists(paste0(shinyMgrPath, "/modules_app/", selectedRow[1,"pkAppName"], ".R"))) { + file.remove(paste0(shinyMgrPath, "/modules_app/", selectedRow[1,"pkAppName"], ".R")) + } + } + + #if module, delete from module directory + if (id == "modules" & input$fileDelete == TRUE) { + if (file.exists(paste0(shinyMgrPath, "/modules/", selectedRow[1,"pkModuleName"], ".R"))) { + file.remove(paste0(shinyMgrPath, "/modules/", selectedRow[1,"pkModuleName"], ".R")) + } + } + + #if module, delete from module directory + if (id == "reports" & input$fileDelete == TRUE) { + file.remove( + list.files( + path = paste0(shinyMgrPath, "/reports"), + pattern = paste0(selectedRow[1, "pkReportName"], ".Rmd"), + recursive = TRUE, + full.names = TRUE + ) + ) + } + + showModal( + modalDialog( + title = "Deletion successful", + paste0("Rows deleted: ", deleted), + easyClose = TRUE + ) + ) + }, + error = function(x) { + showModal( + modalDialog( + title = "Row could not be deleted", + x, + easyClose = FALSE + ) + ) + } + ) #end trycatch + } + ) #end observe delete confirmation + + #refresh table (update trigger) ---------------------- + observeEvent( + eventExpr = input$refresh, + handlerExpr = { + update_trigger <- update_trigger(update_trigger() + 1) + } + ) + } + ) #end moduleServer +} diff --git a/_articles/RJ-2024-009/shinymgr/reports/iris_explorer/iris_explorer_report.Rmd 
b/_articles/RJ-2024-009/shinymgr/reports/iris_explorer/iris_explorer_report.Rmd new file mode 100644 index 0000000000..5a54822a12 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/reports/iris_explorer/iris_explorer_report.Rmd @@ -0,0 +1,60 @@ +--- +title: 'Annual Report for Iris Explorer' +output: html_document +params: + user: + label: "User" + value: "Bilbo" + placeholder: "Enter user name" + year: + label: "Year" + value: 2017 + input: slider + min: 2010 + max: 2018 + step: 1 + sep: "" + file: + input: file + label: "Choose RDS" + value: "" + multiple: FALSE + buttonLabel: "Browse to analysis output..." +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +library(knitr) +ps <- readRDS(params$file) +``` + +This report summarizes an analysis of iris data by +`r params$user` conducted in `r params$year`. Iris +data was clustered into `r ps$'mod2-clusters'` groups +based on `r ps$'mod2-xcol'` and `r ps$'mod2-ycol'`. +A random sample of `r ps$'mod3-sample_num'` records +were collected, with sample sizes shown in the pie +chart below: + +```{r} +pie_data <- table(ps$returns$data2$subset_data$cluster) +pie( + x = pie_data, + labels = as.character(pie_data), + col = rainbow(length(pie_data)), + main = "Number of random samples by cluster" +) +legend( + x = "topright", + legend = names(pie_data), + fill = rainbow(length(pie_data)) +) + + +``` + +Some things to note about this analysis are: `r ps$notes` + +Respectfully submitted, + +Gandalf diff --git a/_articles/RJ-2024-009/shinymgr/server.R b/_articles/RJ-2024-009/shinymgr/server.R new file mode 100644 index 0000000000..16a7bb12a7 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/server.R @@ -0,0 +1,138 @@ +source("global.R") + +server <- function(input, output, session) { + + # call database module + output$my_db_output <- renderUI({ + my_db_ui("my_db") + }) + + # make reactive connection object + con <- reactiveVal({ + DBI::dbConnect( + drv = RSQLite::SQLite(), + dbname = paste0( + 
shinyMgrPath, + "/database/shinymgr.sqlite") + ) + }) + + # call server functions + isolate({ + my_db_server("my_db", con) + lapply( + X = DBI::dbListTables(conn = con()), + FUN = function(X) { + table_server(X, con) + } + ) + }) + + # control the reactive db connection object + observeEvent( + eventExpr = { + input$dev_tool_tabs + input$tabs + }, + handlerExpr = { + if (input$tabs == "DevTools" & input$dev_tool_tabs == "shinymgr_db") { + print("connecting to database...") + con( + DBI::dbConnect( + drv = RSQLite::SQLite(), + dbname = paste0(shinyMgrPath, "/database/shinymgr.sqlite")) + ) + + } else { + if (DBI::dbIsValid(con())) { + print("disconnecting...") + DBI::dbDisconnect(con()) + print(con()) + } # end disconnecting if still connected + } # end not being on database tab + } # end handler expr + ) # end observe event + + # disconnect from the database when the session is ended + session$onSessionEnded(function(){ + isolate({ + print("session ended") + if (DBI::dbIsValid(con())) { + print("disconnecting...") + DBI::dbDisconnect(con()) + print(con()) + } # end disconnecting if still connected + }) + }) + + # also disconnect if session stops + onStop(function(){ + isolate({ + print("session stopped") + if (DBI::dbIsValid(con())) { + print("disconnecting...") + DBI::dbDisconnect(con()) + print(con()) + } # end disconnecting if still connected + }) + }) + + # call the new_analyses module ui ----------------------------- + output$new_analysis <- renderUI({ + new_analysis_ui("new_analysis") + }) + + new_analysis_server( + id = "new_analysis", + tabSelect = reactive({input$tabs}), + shinyMgrPath = shinyMgrPath + ) + + # call the new_report module ui ----------------------------- + output$new_report <- renderUI({ + new_report_ui("new_report") + }) + + new_report_server( + id = "new_report" + ) + + # call the buildApp module ui ----------------------------- + output$build_app <- renderUI({ + app_builder_ui("app_builder") + }) + + reset_builder <- app_builder_server( + 
'app_builder', + shinyMgrPath = shinyMgrPath + ) + + observeEvent(reset_builder$reset, { + if (reset_builder$reset()) { + # remove and re-insert builder tab + output$build_app <- renderUI({ + app_builder_ui("app_builder") + }) + } + }) + + # call the add_report module ui and server ---------- + output$add_report_output <- renderUI({ + add_report_ui("add_report") + }) + + add_report_server( + id = "add_report", + shinyMgrPath = shinyMgrPath + ) + + #call the query module ui and server ----------- + output$query_output <- renderUI({ + queries_ui("queries") + }) + queries_server( + id = "queries", + shinyMgrPath = shinyMgrPath + ) + +} # end of server function diff --git a/_articles/RJ-2024-009/shinymgr/tests/shinytest.R b/_articles/RJ-2024-009/shinymgr/tests/shinytest.R new file mode 100644 index 0000000000..d995504ae8 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/tests/shinytest.R @@ -0,0 +1,5 @@ +library(shinytest) +library(shinymgr) + + +shinytest::testApp("../") diff --git a/_articles/RJ-2024-009/shinymgr/tests/shinytest/test-iris_explorer-expected/001.json b/_articles/RJ-2024-009/shinymgr/tests/shinytest/test-iris_explorer-expected/001.json new file mode 100644 index 0000000000..ce96878c34 --- /dev/null +++ b/_articles/RJ-2024-009/shinymgr/tests/shinytest/test-iris_explorer-expected/001.json @@ -0,0 +1,144 @@ +{ + "input": { + "iris_explorer-iris-clusters": 4, + "iris_explorer-iris-xcol": "Sepal.Length", + "iris_explorer-iris-ycol": "Petal.Length", + "iris_explorer-mainTabSet": "K-means clustering", + "iris_explorer-subset-resample": 0, + "iris_explorer-subset-sample_num": 10 + }, + "output": { + "iris_explorer-iris-plot1": { + "src": "[image data sha1: a122f911242d170da743cfba08e09a73c38a532d]", + "width": 611, + "height": 400, + "alt": "Plot object", + "coordmap": { + "panels": [ + { + "domain": { + "left": 4.156, + "right": 8.044, + "bottom": 0.764, + "top": 7.136 + }, + "range": { + "left": 59.0400096628493, + "right": 596.599997643207, + "bottom": 
325.559981639995, + "top": -1 + }, + "log": { + "x": null, + "y": null + }, + "mapping": { + + } + } + ], + "dims": { + "width": 611, + "height": 400 + } + } + }, + "iris_explorer-test": { + "html": "
    \n