Dissertation/ClarkDisser.tex at master · nick3703/Dissertation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Template file for a standard thesis
\documentclass[11pt]{isuthesis}
%\usepackage[spanish,english]{babel}
%\addto\captionsenglish{
%\renewcommand{\chaptername}{CHAPTER\ }
%}
\usepackage[pdftex]{graphicx}
% Standard, old-style thesis
\usepackage{isutraditional} \chaptertitle
% Old-style, thesis numbering down to subsubsection
\alternate
\usepackage{rotating}
% Bibliography without numbers or labels
\usepackage{natbib}

\RequirePackage[OT1]{fontenc}
\RequirePackage{amsthm,amsmath,amsfonts}
\usepackage{tikz}
\usepackage{graphicx}
%\usepackage[caption=false]{subfig}
\usepackage[all]{nowidow}
\usepackage{listings}


\newtheorem{prop}{Proposition}

\bibliographystyle{apa}
%\includeonly{titletoc,chapter1}
%Optional Package to add PDF bookmarks and hypertext links
\usepackage[pdftex,hypertexnames=false,linktocpage=true]{hyperref}
\hypersetup{colorlinks=true,linkcolor=blue,anchorcolor=blue,citecolor=blue,filecolor=blue,urlcolor=blue,bookmarksnumbered=true,pdfview=FitB}

\usetikzlibrary{arrows,chains,matrix,positioning,scopes}
%
\makeatletter
\tikzset{join/.code=\tikzset{after node path={%
			\ifx\tikzchainprevious\pgfutil@empty\else(\tikzchainprevious)%
			edge[every join]#1(\tikzchaincurrent)\fi}}}
\makeatother
%
\tikzset{>=stealth',every on chain/.append style={join},
	every join/.style={->}}
\tikzstyle{labeled}=[execute at begin node=$\scriptstyle,
execute at end node=$]
\begin{document}
\DeclareGraphicsExtensions{.jpg,.pdf,.mps,.png}
% Template Titlepage File
\title{Self-exciting spatio-temporal statistical models for count data with applications to modeling the spread of violence}
\author{Nicholas John Clark}
\degree{DOCTOR OF PHILOSOPHY}
%\level{master's}
\mprof{Philip M. Dixon}
%\members{Mary Jones \\ Bjork Petersen}
\notice
% Add these additional lines for a Doctoral Dissertation
\degree{DOCTOR OF PHILOSOPHY}
\level{doctoral}
\format{dissertation}
\committee{4}
\members{Ulrike Genschel \\ Mark Kaiser \\ Jarad Niemi \\ Zhengyuan Zhu }
% Add these additional lines for a Doctoral Dissertation
%\degree{DOCTOR OF PHILOSOPHY}
%\level{doctoral}
%\format{dissertation}
%\committee{4}
%\members{Mary Jones \\ Bjork Petersen \\ Sam Anders \\ Harold Jones}
% Add these additional lines for a Creative Component
% - also comment out the \maketitle command
%\format{Creative Component}
%\submit{the graduate faculty}
\maketitle

% Optional thesis dedication
\chapter*{DEDICATION}
For Sarah, Piper, and Raegan.

\pdfbookmark[1]{TABLE OF CONTENTS}{table}
\tableofcontents
\addtocontents{toc}{\def\protect\@chapapp{}} \cleardoublepage \phantomsection
\addcontentsline{toc}{chapter}{LIST OF TABLES}
\listoftables
\cleardoublepage \phantomsection \addcontentsline{toc}{chapter}{LIST OF FIGURES}
\listoffigures
% Comment out the next line if NOT using chaptertitle
\addtocontents{toc}{\def\protect\@chapapp{CHAPTER\ }}
%Optional Acknowledgements
\cleardoublepage \phantomsection
\specialchapt{ACKNOWLEDGEMENTS}
I am extremely grateful for the hard work and dedication of my advisor, Dr. Philip Dixon, in making this dissertation possible.  Our twice-weekly meetings, over the last two and a half years, were not only instrumental in shaping my vision for this work, but also quickly became the highlight of my week.  I particularly appreciate the many conversations we had about 'telling a story' through our writing and translating ideas into words that others, potentially outside the field of statistics, would actually want to read.

I am also grateful to Profs Genschel, Kaiser, Niemi, and Zhu.  Your questions, guidance, and encouragement helped grow a germ of an idea into a scholarly work.  Individual conversations from each of you are interwoven into this work.

Going back to graduate school has long been a goal of mine.  The camaraderie and professionalism of the faculty and the other graduate students has exceeded any of my expectations.  I have genuinely looked forward to every day that I've spent in my cubicle and it has rarely felt like work and for this I am appreciative beyond what I could possibly convey.
%Optional thesis abstract
\cleardoublepage \phantomsection
\specialchapt{ABSTRACT}
In this dissertation we provide statistical models and inferential techniques for analyzing the number of violent or criminal events as they evolve over space and time.  Our research focuses on a class of models we refer to as self-exciting spatio-temporal models.  These are a class of parametric models that allow for dependence in a latent structure as well as dependence in the data model combining what is sometimes referred to as observation driven and parameter driven models.  This class of models arise from straight-forward assumptions on how violence or crime evolves over space and time and has use in the statistical modeling of situations where there is an expected repeat or near-repeat victimization. In Chapter 2 we present the spatially correlated self-exciting model and the reaction-diffusion self-exciting model to analyze the number of violent events in different regions in Iraq.  We also demonstrate how Laplace approximations can be used to conduct efficient Bayesian inference.  We further show how the choice of the latent structure matters in this problem.  In Chapter 3 we generalize the spatially correlated self-exciting model and show how it extends the classic integer generalized auto-regressive conditionally heteroskedastic, or INGARCH, model to account for spatial correlation and improves the second order properties of the INGARCH model.  We refer to this new class of models as the spatially correlated INGARCH, or SPINGARCH, model.  We show how the spatially correlated self-exciting model is similar to the SPINGARCH(0,1) model.  Finally in Chapter 4 we present a fast extended Laplace approximation algorithm for fitting the SPINGARCH(0,1) model demonstrating empirically how the extended Laplace approximation method reduces a bias that exists in the Laplace approximation method while performing much quicker than Markov Chain Monte Carlo approaches.
\newpage
\pagenumbering{arabic}
% Chapter 1 of the Thesis Template File
\chapter{Introduction}
The statistical modeling of violence has generally neglected to properly account for spatial and temporal correlation.  In \cite{ratcliffe2010crime}, the argument is made that potential temporal correlation is often ignored in the statistical modeling of crimes whereas in examples such as \cite{mohler2013modeling} spatial correlation is ignored.  One of the principal challenges is that there are few available statistical models for count data that both offer a mechanism for capturing spatial-temporal correlation as well as result in meaningful inference.  The commonly used Poisson-Log Normal model, for example, only allows limited data correlation as described in \cite{aitchison1989multivariate}.  Furthermore, this method of modeling assumes that, in the terminology of \cite{cox1981statistical}, large scale structure in the model is parameter-driven.  On the other hand, criminologists have shown the presence of repeat victimization in \cite{johnson1997new} and \cite{johnson2007space}.  To properly account for this phenomena, a statistical model should consider observation-driven structure, commonly referred to as self-excitement.

In this dissertation, we propose a new class of statistical models that accounts for self-excitement in the modeling of the spread of violence.  Motivating our work is the assumption that violence at a given space-time region can arise both due to repeat victimization as well as  exogenous factors.  This assumption allows us to formulate a class of parametric model for the counts of violence that has flexible second order properties and can be fit using standard Bayesian software.

\section{Spatio-temporal counts of violent events}

The number of crimes or other violent events is usually aggregated over a fixed space-time lattice and presented as count data.  For example, in this dissertaiton we use two primary datasets that are both aggregated due to privacy issues and convenience.  The first dataset is from the Global Terrorism Database (\cite{lafree2007introducing}).  We specifically look at the violent incidents in Iraq from 2003-2010 aggregated over province and month.  Figure \ref{IZSpread} depicts the monthly counts of US government reported violent incidents in Iraq aggregated over province during this time period.  Here it appears that the violence over this time period spreads out from the center of the country.

 We also use crime in Chicago freely available from the city of Chicago  available at \href{https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present}{https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present}.  The data is aggregated over city block, we further aggregate it over census block group and month.  The number of burglaries per census block per month in the south side of Chicago is shown in figure \ref{ChiSpread}.  Here we see that the spatial correlation is much less obvious than in Iraq, however as we will show it is still present in the data.

\begin{figure}[h] % figuur 1
	\begin{center}
		\vspace{6pc}
		\includegraphics[width=.8 \linewidth]{Iraq2.pdf}
		\isucaption[Spread of violence in Iraq in 2004]{The spread of the log-count of violence in Iraq over discrete regions and discrete times.}
		\label{IZSpread}
	\end{center}
\end{figure}


\begin{figure}[h] % figuur 1
	\begin{center}
		\vspace{6pc}
		\includegraphics[width=.8 \linewidth]{ChiBurg.pdf}
		\isucaption[Spread of burglaries in the south side of Chicago]{Spread of burglaries in the south side of Chicago in 2011.}
		\label{ChiSpread}
	\end{center}
\end{figure}

Though the spatio-temporal spread of violence in Iraq and Chicago appear to be very different, they both have been associated with the repeat-victimization or self-excitement.  In \cite{lewis2012self} and \cite{mohler2013modeling}, civilian deaths were found to be partially attributed to self-excitement and in \cite{mohler2011self} and \cite{mohler2013modeling} burglaries were also shown to be attributed to self-excitement.  A further mathematical model for the spatio-temporal diffusion of burglaries presented in \cite{short2008statistical} also heavily relied on the notion of self-excitment.


\section{Self-excitement in a statistical model}
The models we present and use also account for self-excitement.  The general idea of self-excitement in a statistical model is not new and have been extensively used in modeling phenomena from earthquakes in \cite{ogata1988statistical} to finance (see e.g. \cite{bacry2015hawkes}).  Self-excitement, often referred to as Hawkes processes, originates from \cite{hawkes1971spectra} work in point process data where the intensity of a Poisson process, $\lambda(t)$ is assumed to depend both on $t$ as well as past realizations of the process
\begin{equation}
\lambda(t)=\nu + \int_{0}^t g(t-s)N(ds),\quad t>0.
\end{equation}
Here $g(t)$ can be thought of as a triggering function that generally decays over time.  In this manner, an observed event today increases the probability of a subsequent event occurring the following day.  In the modeling of violence this is the notion of repeat, or near repeat victimization shown to exist in burglaries in \cite{johnson1997new} and \cite{johnson2007space}.

The resulting statistical model using self-excitation is doubly stochastic as $\int_{0}^t g(t-s)N(ds)$ depends on a random variable, the number of events between time $t$ and $t-s$.

However, as demonstrated in \cite{mohler2013modeling}, there may be other forms of latent variation in the intensity function.  In that manuscript the authors demonstrated how the modeling of crimes in Chicago as well as violence in Iraq is improved through the addition of a latent auto-regressive term.  In particular, they assumed that the number of violent events, $y_t$, between $t$ and some fixed $t-\Delta t$ follows a Poisson distribution with expectation given by
\begin{align}
& \lambda_t  =\exp(x_t)+\sum_{t>j} \eta \kappa^{t-j} y_j\\
& \boldsymbol{x} \sim \mbox{Gau }(\boldsymbol{0},\Sigma)\\
& \Sigma_{t,j} =\sigma^2 a^{|t-j|}.
\end{align}
That is, the expected number of violent events is a linear combination of a discrete self-excitement term derived from an exponential kernel of a Hawkes process and a log-Gaussian Auto-Regressive (1) model.  While this improved the fit of the data they considered, clearly spatial correlation was ignored.

\section{Self-exciting spatio-temporal parametric models}

Historically, the modeling of spatial-temporal count data has relied on the assumption that the log expected counts can be modeled as a latent Gaussian random variable (e.g. in \cite{python2016bayesian}).  In our belief this is overly restrictive and makes an realistic assumption that the counts are conditionally independent given a latent process and does not account for the possibility of self-excitement.  Attempts at accounting for self-excitement generally fail to address the spatial correlation as in \cite{mohler2013modeling} or estimate it non-parametrically as in \cite{mohler2011self}.

In this dissertation we consider the impacts of combining self-excitement with spatial and spatio-temporal latent structures.  In Chapter \ref{AoAS} we propose two different latent structures resulting in a spatially correlated self-exciting statistical model and a reaction diffusion self-exciting statistical model.  We further demonstrate an efficient methodology for conducting Bayesian inference based on Laplace approximations and apply the methodology to the counts of violent events in Iraq.

In Chapter \ref{JASA} we extend the spatially correlated self-exciting model and demonstrate similarities between this model and a relatively recent discrete valued time series model called the integer generalized autoregressive conditional heteroskedasticity, or INGARCH, model.  In that spirit, we refer to the resulting model as the spatially correlated INGARCH, or SPINGARCH model.  We demonstrate how the model can be fit using off the shelf software for Bayesian inference.  We further show how the model out performs the INGARCH model in capturing the second order properties of the number of burglaries in the south side of Chicago.

Finally, in Chapter \ref{CSDA}, we present an efficient methodology for Bayesian inference of the spatially correlated self-exciting model (also referred to as the SPINGARCH(0,1) model) based on an extended Laplace approximation.  We show how for a large range of the parameter space the extended Laplace approximation matches Markov chain Monte Carlo (MCMC) based techniques while providing a drastic computational speedup.


\chapter{Modeling and Estimation for Self-Exciting Spatio-temporal Models of Terrorist Activity}\label{AoAS}
\begin{center}
	Preprint of a manuscript accepted in \textit{The Annals of Applied Statistics}.\\
	Nicholas J. Clark, Philip M. Dixon\\
	Please cite \cite{2017arXiv170308429C}
\end{center}
\section*{Abstract}
Spatio-temporal hierarchical modeling is an extremely attractive way to model the spread of crime or terrorism data over a given region, especially when the observations are counts and must be modeled discretely.  The spatio-temporal diffusion is placed, as a matter of convenience, in the process model allowing for straightforward estimation of the diffusion parameters through Bayesian techniques.  However, this method of modeling does not allow for the existence of self-excitation, or a temporal data model dependency, that has been shown to exist in criminal and terrorism data.  In this manuscript we will use existing theories on how violence spreads to create models that allow for both spatio-temporal diffusion in the process model as well as temporal diffusion, or self-excitation, in the data model.  We will further demonstrate how Laplace approximations similar to their use in Integrated Nested Laplace Approximation can be used to quickly and accurately conduct inference of self-exciting spatio-temporal models allowing practitioners a new way of fitting and comparing multiple process models.  We will illustrate this approach by fitting a self-exciting spatio-temporal model to terrorism data in Iraq and demonstrate how choice of process model leads to differing conclusions on the existence of self-excitation in the data and differing conclusions on how violence spread spatially-temporally in that country from 2003-2010.

\section{Introduction}
A typical spatio-temporal model consists of three levels, a data model, a process model, and a parameter model.  A common way to model data then is to assume the data model, $Y(\cdot)$, is conditionally independent given the process model $X(\cdot)$. For example, if observations take place at areal regions, $\boldsymbol{s_i}$, at discrete time periods, $t$, and $Y(\boldsymbol{s_i},t)$ are counts, a common model is $Y(\boldsymbol{s_i},t)|X(\boldsymbol{s_i},t)\sim \mbox{Pois}(\exp(X(\boldsymbol{s_i},t)))$.  The spatio-temporal diffusion structure is commonly then placed on the process model which commonly is assumed to have a Gaussian joint distribution of $\boldsymbol{X}\sim \mbox{Gaus}(\boldsymbol{0},Q^{-1}(\theta))$.  The majority of analysis of these models is done using Bayesian techniques requiring a further parameter model for $\theta$.  The challenge in these models is, then, determining an appropriate structure for $\boldsymbol{Q}^{-1}(\theta)$ or $\boldsymbol{Q}(\theta)$.  If both the covariance and the precision is chosen to be too dense inference quickly becomes impossible due to the size of $\boldsymbol{Q}^{-1}(\theta)$.  In spatio-temporal models it is quite common for the dimension of $\boldsymbol{Q}$ to be larger than, say, $10^4\times 10^4$.  A thorough overview giving many examples of this method of modeling is given in \cite{cressie2015statistics}.

In modeling terrorism or crime data one possibility is to use an extremely general spatio-temporal process model to capture variance not explained through the use of covariates.  For example \cite{python2016bayesian} use a Matern class covariance function over space and an AR(1) process over time.  They then use covariates to test the impact of infrastructure, population, and governance.  The general spatio-temporal process models used, in this case, has an extremely sparse precision structure greatly aiding in computations.

While diffusion in spatio-temporal models is often modeled through a latent process, more recent models describing the spread of violence have incorporated self-excitation, or spatio-temporal diffusion that exists linearly in the data model itself.  Self-excitation is the theory that in terrorism, or crime, the probability of an event occurring is a function of previous successful events.  For instance \cite{mohler2011self} demonstrate that burglars are more likely to visit locations that have previously, successfully, been burgled.  \cite{mohler2013modeling} derived a class of models that allowed for temporal diffusion in both the process model as well as the data model and demonstrated how the two processes were identifiable.

In the modeling of terrorism data \cite{lewis2012self}, \cite{porter2012self}, and \cite{mohler2013modeling} have all successfully used the self-excitation approach to model.  Most recently, \cite{tench2016spatio} used a likelihood approach for temporal modeling of IEDs in Northern Ireland using self-excitation.  However, in these papers, the existence and analysis of self-excitation was the primary objective and any process model dependency was either ignored or treated as a nuisance.  The one exception is in \cite{mohler2013modeling} where a temporal only model was assumed for the process model and inference was conducted allowing both process model dependence and data model dependence.

In this manuscript, we will consider a spatial and a spatial-temporal process model that allows for self-excitation.  We will present two self-exciting models for terrorist activity that have different process models corresponding to different notions of how terrorism evolves in time and space as well as temporal dependency in the data model to account for self-excitation.  These two models are specific cases of more general spatio-temporal models that allow dependency in both the process model as well as the data model.

We will further show how Laplace approximations similar to their use in Integrated Nested Laplace Approximation, or INLA, an approximate Bayesian method due to \cite{rue2009approximate} can be used to conduct inference for these types of models. We will show, via simulation, how INLA, when appropriately modified, can accurately be used to make inference on process level parameters for self-exciting models and aid analysts in determining the appropriate process model when scientific knowledge cannot be directly applied as in \cite{cressie2015statistics}.  Finally, we will apply this technique to terrorism data in Iraq.  We will show that choice of process model, in this case, results in differing conclusions on the impact of self-excitation in the model.


\section{Self-Exciting Spatio-Temporal Models}
The use of self-exciting models in both criminal and terrorism modeling has become increasingly popular over the last decade after being originally introduced in \cite{short2008statistical}.  Self-excitement, in a statistical model, directly models copy-cat behavior by letting an observed event increase the intensity (or excites a model) over a specified time or location. Self-exciting models are closely related to Hawkes processes, which are counting processes where the probability of an event occurring is directly related to the number of events that previously occurred. In a self-exciting model, the criminal intensity at a given spatio-temporal location, $(x,y,t)$ is a mixture of a background rate, $\nu$ and self excitement function, $f(\mathcal{H}_{x,y,t})$ that is dependent on the observed history at that location, $\mathcal{H}_{x,y,t}$.

A common temporal version of a discretized Hawkes process is
\begin{align}
& Y_t\sim \mbox{Pois}(\lambda_t) \\
& \lambda_t = \nu(t) + \sum_{j < t} \kappa (t-j) y_j \nonumber\\
& t \in \{1,2,...T\}
\end{align}

In this example, in order for the process to have finite expectation in the limit, $\kappa(t-j)$ must be positive and $\sum_{i=1}^{\infty} \kappa (i) <1$.  $\kappa (t-j)$ can be thought of as the probability that an event at time $j$ triggers an event at time $i$.

\cite{laub2015hawkes} provides an excellent overview of the mathematical properties of the continuous Hawkes process and the discrete process when $\kappa (t-j)$ is taken to be an exponential decay function.

In \cite{mohler2011self}, $\nu$ was treated as separable in space and time and was non-parametrically estimated using stochastic declustering, while in \cite{mohler2013modeling}, the spatial correlations were ignored and an AR(1) process was used for $\nu$ and an exponential decay was assumed for the self-excitation.  In terrorism modeling \cite{lewis2012self} used a piece-wise linear function for $\nu$.

Here we will first define a general model that allows spatial or spatial-temporal correlation to exist in the process model and positive temporal correlation to exist in the data model to allow for self-excitation.  First define $\boldsymbol{s_i} \in \mathbb{R}^2$, $i \in \{1,2,...,n_d\}$ as locations in a fixed, areal, region. We further define $t \in \{1,2,...,T\}$ as discrete time. The general form of a spatial-temporal self-exciting model is then given in \eqref{eq:gen Model}

\begin{align}
& Y(\boldsymbol{s_i},t)|\mu(\boldsymbol{s_i},t) \sim \mbox{Pois}(\mu(\boldsymbol{s_i},t)) \label{eq:gen Model}\\
& \mu(\boldsymbol{s_i},t) = \exp(X(\boldsymbol{s_i},t)) + \eta Y(\boldsymbol{s_i},t-1) \nonumber \\
& \boldsymbol{X} \sim \mbox{Gau}(\boldsymbol{0},Q^{-1}(\theta)) \nonumber
\end{align}

Comparing the above to the Hawkes process, we now have $\nu$ as a function of space-time and denote it as $X(\boldsymbol{s_i},t)$.  We use the simplest form of self-excitation letting $\kappa(t-j)$ be a point-mass function such that $\kappa(k)=\eta$ for $k=1$ and $\kappa(k)=0$ for $k \neq 1$.  In all cases $Y(\boldsymbol{s_i},t)$ will be discrete, observable, count data.

To contrast \eqref{eq:gen Model} with a typical spatial model, figure \ref{exc} depicts the expectation for one areal location $(\boldsymbol{s_i},t)$ without self-excitation and with self-excitation as shown in Figure \ref{exc}. In this figure, the lower line shows $\mu(\boldsymbol{s_i},t)$ with $\eta=0$, and the upper line has $\eta=.4$.  The impact of self-excitation is clearly present in time 10-13.

\begin{figure}[h]
	\begin{center} % figuur 1
		\vspace{6pc}
		\includegraphics[width=.5 \linewidth]{Contageon3.pdf}
		\isucaption[ Self-Excitement in a Statistical Model]{This figure shows an example of the expectation of two processes, one with self-excitation and one without.  The bottom line is the expectation of a process with no self-excitation, the top has self-excitation of $\eta=.4$. The data realizations are from the process with self-excitation.}
		\label{exc}
	\end{center}
\end{figure}


\subsection{Spatially Correlated Self-Exciting Model}

In the first example of \eqref{eq:gen Model} we assume the background intensity rate, $X(\boldsymbol{s_i},t)$ has only spatial correlation.  This model is motivated through the assumption that the latent dependency, $X(\boldsymbol{s_i},t)$, is as a continuous measure of violent tendency at region $\boldsymbol{s_i}$ at time period $t$ and regions that are closer together in space are assumed to share common characteristics.

Next, define $N(\boldsymbol{s_i})$ as the neighborhood of location $\boldsymbol{s_i}$ where two regions are assumed to be neighbors if they share a common border.   $|N(\boldsymbol{s_i})|$ is the number of neighbors of location $\boldsymbol{s_i}$.  The model for $Y(\boldsymbol{s_i},t)$, or the number of observed violent events at a given space-time location is then given by:

\begin{align}
& Y(\boldsymbol{s_i},t)|\mu(\boldsymbol{s_i},t) \sim \mbox{Pois}(\mu(\boldsymbol{s_i},t)) \label{eq:Full Model}\\
& \mu(\boldsymbol{s_i},t) = \exp(X(\boldsymbol{s_i},t)) + \eta Y(\boldsymbol{s_i},t-1) \nonumber \\
& X(\boldsymbol{s_i},t) = \theta_1 \sum_{\boldsymbol{s_j}\in N(\boldsymbol{s_i})}X(\boldsymbol{s_j},t) + \epsilon(\boldsymbol{s_i},t) \nonumber\\
&\epsilon(\boldsymbol{s_i},t) \sim \mbox{Gau}(0,\sigma^2) \nonumber
\end{align}

Letting $\boldsymbol{H}$ denote the spatial neighborhood matrix such that $H_{i,j}=H_{j,i}=1$ if $\boldsymbol{s_i}$ and $\boldsymbol{s_j}$ are neighbors, the full distribution of the joint distribution of the latent state is $\boldsymbol{X}\sim \mbox{Gau} (\boldsymbol{0},(\boldsymbol{I}_{ns,ns}-\boldsymbol{I}_{n,n} \otimes \theta_1\boldsymbol{H})^{-1}\boldsymbol{L}(\boldsymbol{I}_{ns,ns}-\boldsymbol{I}_{n,n} \otimes \theta_1\boldsymbol{H})^{-1})$ where $\boldsymbol{L}=\text{diag}(\sigma^2,...,\sigma^2)$.  The evolution in the latent field is equivalent to the spatial evolution in what is commonly referred to as a Simultaneous or Spatial Auto-regressive model (SAR).  Alternatively, the Conditional Auto-regressive model (CAR) of \cite{besag1974spatial} could be used to model the latent state modifying the joint density above.

The difference between the SAR and \eqref{eq:Full Model} is in the self excitement parameter, $\eta$.  In \eqref{eq:Full Model}, temporal correlation is present, but is present through the data model specification rather than through a temporal evolution in the latent state.  Therefore, temporal correlation is a function solely of the self-excitation in the process.  $\eta$ gives the probability of an event at time $t-1$ creating an event at time $t$.  In order for the system to be well-behaved, $\eta$ is constrained to (0,1).  In order for the joint distribution of $\boldsymbol{X}$ to be valid, $\theta_1 \in (\psi_{(1)}^{-1},\psi_{(n)}^{-1})$ where $\psi_{i}$ is the $i$th smallest eigenvalues of $\boldsymbol{H}$.

The critical assumption in this model is that the propensity of a given location to be violent is spatially correlated with its adjacent spatial neighbors and only evolves over time as a function of excitation.  If terrorism is diffusing according to this model, regions that are geographically adjacent are behaving in a similar manner.  The existence of self-excitation would indicate that individuals within a region are being inspired through the actions of others.  While combating terrorism is complex, if terrorism is diffusing in this manner, one suggestion would be to identify the root causes within a geographic area as well as quick action against any malicious actor to discourage copy-cat behavior.


\subsection{Reaction Diffusion Self-Exciting Model}

Alternatively, a model similar to \cite{short2008statistical} can be used to motivate the process model for the latent state resulting in a non-separable spatio-temporal, $\boldsymbol{X}$.  Here we let $X(\boldsymbol{s_i},t)$ corresponds to a continuous measure of violence due to terrorists or criminals at location $\boldsymbol{s_i}$ at time $t$. This is still a latent variable as we are not directly measuring $X(\boldsymbol{s_i},t)$.  However, now in order for an area to increase in violent tendency, a neighboring area must decrease as the actors causing the violence move from location to location.  Furthermore, if terrorists are removed from the battlefield at a rate proportional to the total number of terrorists present and if terrorists move to fill power vacuums, the process model is similar to the reaction-diffusion partial differential equation (see \cite{cressie2015statistics} for more on the reaction-diffusion model)
\begin{equation}
\frac{\partial X(\boldsymbol{s_i},t)}{\partial t}=\frac{\kappa}{|N(\boldsymbol{s_i})|} \triangle X(\boldsymbol{s_i},t)-\alpha X(\boldsymbol{s_i},t) \label{eq:Reaction}
\end{equation}
In order to generalize this partial differential equation (PDE) to an irregular lattice, we make use of the graphical Laplacian, $\Gamma$, in place of $\triangle$ in \eqref{eq:Reaction}. $\Gamma$ is a matrix that extends the notion of second derivatives to irregular graphs and can be defined as a matrix of the same dimension as the number of geographical regions with entries given by

\[
\Gamma (s_i,s_j)
\begin{cases}
\hfill -|N(s_i)|   \hfill &  j=i \\
\hfill 1 \hfill & j\in N(s_i)  \\
\hfill 0 \hfill& \text{Otherwise}
\end{cases}
\]
With the addition of a random noise term assumed to be Gaussian, the full model can be seen as an example of \eqref{eq:gen Model}.
\begin{align}
& Y(\boldsymbol{s_i},t)|\mu(\boldsymbol{s_i},t) \sim \mbox{Pois}(\mu(\boldsymbol{s_i},t)) \label{eq:ReacDiffuse Model}\\
& \mu(\boldsymbol{s_i},t) = \exp(X(\boldsymbol{s_i},t)) + \eta Y(\boldsymbol{s_i},t-1) \nonumber \\
& \small X(\boldsymbol{s_i},t) = \frac{\kappa}{|N(s_i)|} \sum_{\boldsymbol{s_j}\in N(\boldsymbol{s_i})}X(\boldsymbol{s_j},t-1) + (1-\kappa-\alpha)  X(\boldsymbol{s_i},t-1) + \epsilon(\boldsymbol{s},t) \nonumber\\
&\epsilon(\boldsymbol{s},t) \sim \mbox{Gau}(0,\sigma^2) \nonumber
\end{align}

In contrast to the Spatially Correlated Self-Exciting (SCSE) Model, the process model dependency exists in both space and time.  In order to derive properties of this model we first let $\boldsymbol{M}=\kappa \text{ diag}\left( \frac{1}{|N_{s_i}|}\right) \Gamma + (1-\alpha) \boldsymbol{I}_{s,s}$ and now note that this is equivalent to a Vector Auto-Regressive, VAR, model $\boldsymbol{X}_t = \boldsymbol{M} \boldsymbol{X}_{t-1} +  \boldsymbol{\epsilon}$ with $\boldsymbol{\epsilon}\sim \mbox{Gau}(\boldsymbol{0},\sigma^2 \boldsymbol{I})$.

The VAR(1) model requires all the eigenvalues of $\boldsymbol{M}$ to be between -1 and 1.  This can be satisfied by first noting that 0 is always an eigenvalue of $\text{ diag}\left( \frac{1}{|N_{s_i}|}\right)$ trivially corresponding to the eigenvector of all 1s.  The largest eigenvalue is at most 2 as shown in \cite{chung1997spectral}.  Due to the structure of $(1-\alpha) \boldsymbol{I}_{s,s}$ this implies maximum eigenvalue of $\boldsymbol{M}$ is $(1-\alpha)$ and minimum is  $-2\beta+(1-\alpha)$.  Therefore, the parameter spaces for $\alpha$ and $\kappa$ are $\alpha \in (0,1)$ and $\kappa \in (\frac{-\alpha}{2},\frac{2-\alpha}{2})$.


Just as in the SCSE Model, if $\epsilon$ has a Gaussian distribution, the Reaction Diffusion Self-Exciting (RDSE) Model has an exact solution for the latent Gaussian field, $\boldsymbol{X}$.

Letting $\Sigma_s$ be the spatial covariance at a fixed period of time which is assumed to be invariant to time , then we can solve for $\Sigma_s$ by using the relationship $\Sigma_s = \boldsymbol{M}\Sigma_s\boldsymbol{M^T} + \sigma^2 \boldsymbol{I}$.  As demonstrated by \cite{cressie2015statistics}, this leads to  \begin{equation}\text{vec}(\Sigma_s)=\left(\boldsymbol{I}_{s^2,s^2}-\boldsymbol{M}\otimes\boldsymbol{M}\right)^{-1}\text{vec}\left(\sigma^2 \boldsymbol{I}_{s,s}\right),\end{equation} where $\text{vec}\left( \right)$ is the matrix operator that stacks each column of the matrix on top of one or another.  Recall that $s$ is the size of the lattice that is observed at each time period.  The joint distribution for all $\boldsymbol{X}$ is then $\boldsymbol{X}\sim \mbox{Gau}(\boldsymbol{0},Q_{rd}^{-1}(\theta))$ where


\begin{equation}
Q_{rd}^{-1}(\theta)=
\left[
\begin{array}{c|c|c|c}
\Sigma_s & M \Sigma_s & ... & M^n\Sigma_s \\
\hline
\Sigma_s M^T & \Sigma_s & ... & M^{n-1}\Sigma_s\\
\hline
... & ... & ... & ...\\
\hline
\Sigma_s (M^T)^n & \Sigma_s (M^T)^{n-1}& ... & \Sigma_s
\end{array}
\right]\label{eq:Sig}
\end{equation}
.

However, practically, this involves inverting a potentially large matrix $\boldsymbol{I}_{s^2,s^2}-\boldsymbol{M}\otimes\boldsymbol{M}$.  Therefore, it is easier to deal with the inverse of \eqref{eq:Sig} given in \eqref{eq:Prec}.
\begin{equation}
Q_{rd}(\theta)=
\left[
\begin{array}{c|c|c|c|c}
\boldsymbol{I}_{n,n} & -M & \boldsymbol{0}&  ... & ... \\
\hline
- M^T  & M^T M +\boldsymbol{I}_{n,n} & - M & \boldsymbol{0} & ... \\
\hline
\boldsymbol{0} &- M^T  & M^T M +\boldsymbol{I}_{n,n} & - M & ...\\
\hline
...&...&...&...&...\\
\hline
\boldsymbol{0} & ... & - M^T  & M^T M +\boldsymbol{I}_{n,n} & - M\\
\hline
\boldsymbol{0} & ... & ... & -M^T & \boldsymbol{I}_{n,n}
\end{array}
\right] \frac{1}{\sigma^2}\label{eq:Prec}
\end{equation}


The primary difference between the SCSE model and the RDSE model is that the process model correlation in the SCSE is only spatial while in the RDSE it is spatio-temporal.  In the below toy examples, we show the expectation for $X(s_i,t)$ for both the SCSE and the RDSE model on a 4 x 4 lattice structure.  We fixed both models with a value of $X(s_1,1)=10$ as the upper left hand observation at time 1.  As seen in the RDSE model, the high count at time 1 spreads to neighboring regions in time 2 and time 3 whereas the process model has no temporal spread in the SCSE but has a high level of spatial spread.
\begin{center}
	Spatially Correlated Latent Process Conditional on $(s_1,1)=10$
\end{center}
\begin{center}
	\begin{tikzpicture}[>=latex']
	\tikzset{block/.style= { rectangle, align=left,minimum width=.2cm,minimum height=.1cm},
		rblock/.style={draw, shape=rectangle,rounded corners=1.5em,align=center,minimum width=.2cm,minimum height=.1cm},
		input/.style={ % requires library shapes.geometric
			draw,
			trapezium,
			trapezium left angle=60,
			trapezium right angle=120,
			minimum width=2cm,
			align=center,
			minimum height=1cm
		},
	}
	\node [block, fill=orange!90]  (x1) {\footnotesize 10};
	\node [block, right = .1cm of x1, fill=blue!50] (x2) {\footnotesize 5};
	\node [block, right = .1cm of x2,fill=blue!20] (x3) {\footnotesize 2};
	\node [block, right = .1cm of x3, fill=blue!10] (x4) {\footnotesize 1};
	\node [block, below = .1cm of x1, fill=blue!50] (y1) {\footnotesize 5};
	\node [block, below = .1cm of x2, fill=blue!50] (y2) {\footnotesize 4};
	\node [block, below = .1cm of x3, fill=blue!30] (y3) {\footnotesize 3};
	\node [block, below = .1cm of x4,fill=blue!20] (y4) {\footnotesize 2};
	\node [block, below = .1cm of y1,fill=blue!20] (z1) {\footnotesize 2};
	\node [block, below = .1cm of y2, fill=blue!30] (z2) {\footnotesize 3};
	\node [block, below = .1cm of y3, fill=blue!30] (z3) {\footnotesize 2};
	\node [block, below = .1cm of y4, fill=blue!10] (z4) {\footnotesize 1};
	\node [block, below = .1cm of z1, fill=blue!10] (q1) {\footnotesize 1};
	\node [block, below = .1cm of z2, fill=blue!10] (q2) {\footnotesize 1};
	\node [block, below = .1cm of z3, fill=blue!10] (q3) {\footnotesize 1};
	\node [block, below = .1cm of z4, fill=blue!10] (q4) {\footnotesize 1};
	\node [block, above = .1cm of x2] {\footnotesize Time 1};
	\end{tikzpicture}
	\quad
	\begin{tikzpicture}[>=latex']
	\tikzset{block/.style= { rectangle, align=left,minimum width=.2cm,minimum height=.1cm},
		rblock/.style={draw, shape=rectangle,rounded corners=1.5em,align=center,minimum width=.2cm,minimum height=.1cm},
		input/.style={ % requires library shapes.geometric
			draw,
			trapezium,
			trapezium left angle=60,
			trapezium right angle=120,
			minimum width=2cm,
			align=center,
			minimum height=1cm
		},
	}
	\node [block]  (x1) {\footnotesize 0};
	\node [block, right = .1cm of x1] (x2) {\footnotesize 0};
	\node [block, right = .1cm of x2] (x3) {\footnotesize 0};
	\node [block, right = .1cm of x3] (x4) {\footnotesize 0};
	\node [block, below = .1cm of x1] (y1) {\footnotesize 0};
	\node [block, below = .1cm of x2] (y2) {\footnotesize 0};
	\node [block, below = .1cm of x3] (y3) {\footnotesize 0};
	\node [block, below = .1cm of x4] (y4) {\footnotesize 0};
	\node [block, below = .1cm of y1] (z1) {\footnotesize 0};
	\node [block, below = .1cm of y2] (z2) {\footnotesize 0};
	\node [block, below = .1cm of y3] (z3) {\footnotesize 0};
	\node [block, below = .1cm of y4] (z4) {\footnotesize 0};
	\node [block, below = .1cm of z1] (q1) {\footnotesize 0};
	\node [block, below = .1cm of z2] (q2) {\footnotesize 0};
	\node [block, below = .1cm of z3] (q3) {\footnotesize 0};
	\node [block, below = .1cm of z4] (q4) {\footnotesize 0};
	\node [block, above = .1cm of x2] {\footnotesize Time 2};
	\end{tikzpicture}
\end{center}
\begin{center}
	Reaction Diffusion Latent Process Conditional on $(s_1,1)=10$
\end{center}
\quad
\begin{center}
	\begin{tikzpicture}[>=latex']
	\tikzset{block/.style= { rectangle, align=left,minimum width=.2cm,minimum height=.1cm},
		rblock/.style={draw, shape=rectangle,rounded corners=1.5em,align=center,minimum width=.2cm,minimum height=.1cm},
		input/.style={ % requires library shapes.geometric
			draw,
			trapezium,
			trapezium left angle=60,
			trapezium right angle=120,
			minimum width=2cm,
			align=center,
			minimum height=1cm
		},
	}
	\node [block, fill=orange!90]  (x1) {\footnotesize 10};
	\node [block, right = .1cm of x1,fill=blue!10] (x2) {\footnotesize 1};
	\node [block, right = .1cm of x2, fill=blue!10] (x3) {\footnotesize 1};
	\node [block, right = .1cm of x3] (x4) {\footnotesize 0};
	\node [block, below = .1cm of x1,fill=blue!10] (y1) {\footnotesize 1};
	\node [block, below = .1cm of x2, fill=blue!10] (y2) {\footnotesize 1};
	\node [block, below = .1cm of x3] (y3) {\footnotesize 0};
	\node [block, below = .1cm of x4] (y4) {\footnotesize 0};
	\node [block, below = .1cm of y1, fill=blue!10] (z1) {\footnotesize 1};
	\node [block, below = .1cm of y2] (z2) {\footnotesize 0};
	\node [block, below = .1cm of y3] (z3) {\footnotesize 0};
	\node [block, below = .1cm of y4] (z4) {\footnotesize 0};
	\node [block, below = .1cm of z1] (q1) {\footnotesize 0};
	\node [block, below = .1cm of z2] (q2) {\footnotesize 0};
	\node [block, below = .1cm of z3] (q3) {\footnotesize 0};
	\node [block, below = .1cm of z4] (q4) {\footnotesize 0};
	\node [block, above = .1cm of x2] {\footnotesize Time 1};
	\end{tikzpicture}
	\qquad
	\begin{tikzpicture}[>=latex']
	\tikzset{block/.style= { rectangle, align=left,minimum width=.2cm,minimum height=.1cm},
		rblock/.style={draw, shape=rectangle,rounded corners=1.5em,align=center,minimum width=.2cm,minimum height=.1cm},
		input/.style={ % requires library shapes.geometric
			draw,
			trapezium,
			trapezium left angle=60,
			trapezium right angle=120,
			minimum width=2cm,
			align=center,
			minimum height=1cm
		},
	}
	\node [block,fill=blue!30]  (x1) {\footnotesize 3};
	\node [block, right = .1cm of x1, fill=blue!30] (x2) {\footnotesize 3};
	\node [block, right = .1cm of x2] (x3) {\footnotesize 0};
	\node [block, right = .1cm of x3] (x4) {\footnotesize 0};
	\node [block, below = .1cm of x1, fill=blue!30] (y1) {\footnotesize 3};
	\node [block, below = .1cm of x2, fill=blue!10] (y2) {\footnotesize 1};
	\node [block, below = .1cm of x3] (y3) {\footnotesize 0};
	\node [block, below = .1cm of x4] (y4) {\footnotesize 0};
	\node [block, below = .1cm of y1] (z1) {\footnotesize 0};
	\node [block, below = .1cm of y2] (z2) {\footnotesize 0};
	\node [block, below = .1cm of y3] (z3) {\footnotesize 0};
	\node [block, below = .1cm of y4] (z4) {\footnotesize 0};
	\node [block, below = .1cm of z1] (q1) {\footnotesize 0};
	\node [block, below = .1cm of z2] (q2) {\footnotesize 0};
	\node [block, below = .1cm of z3] (q3) {\footnotesize 0};
	\node [block, below = .1cm of z4] (q4) {\footnotesize 0};
	\node [block, above = .1cm of x2] {\footnotesize Time 2};
	\end{tikzpicture}
	\qquad
	\begin{tikzpicture}[>=latex']
	\tikzset{block/.style= { rectangle, align=center,minimum width=.2cm,minimum height=.1cm},
		rblock/.style={draw, shape=rectangle,rounded corners=1.5em,align=center,minimum width=.2cm,minimum height=.1cm},
		input/.style={ % requires library shapes.geometric
			draw,
			trapezium,
			trapezium left angle=60,
			trapezium right angle=120,
			minimum width=2cm,
			align=center,
			minimum height=1cm
		},
	}

	\node [block, fill=blue!20]  (x1) {\footnotesize 2};
	\node [block, right = .1cm of x1, fill=blue!10] (x2) {\footnotesize 1};
	\node [block, right = .1cm of x2, fill=blue!10] (x3) {\footnotesize 1};
	\node [block, right = .1cm of x3] (x4) {\footnotesize 0};
	\node [block, below = .1cm of x1, fill=blue!10] (y1) {\footnotesize 1};
	\node [block, below = .1cm of x2, fill=blue!10] (y2) {\footnotesize 1};
	\node [block, below = .1cm of x3] (y3) {\footnotesize 0};
	\node [block, below = .1cm of x4] (y4) {\footnotesize 0};
	\node [block, below = .1cm of y1, fill=blue!10] (z1) {\footnotesize 1};
	\node [block, below = .1cm of y2] (z2) {\footnotesize 0};
	\node [block, below = .1cm of y3] (z3) {\footnotesize 0};
	\node [block, below = .1cm of y4] (z4) {\footnotesize 0};
	\node [block, below = .1cm of z1] (q1) {\footnotesize 0};
	\node [block, below = .1cm of z2] (q2) {\footnotesize 0};
	\node [block, below = .1cm of z3] (q3) {\footnotesize 0};
	\node [block, below = .1cm of z4] (q4) {\footnotesize 0};
	\node [block, above = .1cm of x2] {\footnotesize Time 3};
	\end{tikzpicture}\\
\end{center}


Practically, if data follows the RDSE model, it implies a high terrorism count in one region will manifest into a high terrorism count in a neighboring region at a later time period.  In combating terrorism, the RDSE might suggest isolating geographical regions to mitigate the risk of spread while addressing self-excitation through direct action against malicious actors who are inspiring others.

\section{Model Fitting}
In both the RDSE and the SCSE, spatio-temporal diffusion exists in both the process model and the data model.  If the diffusion was solely in the process model, a technique for inference would be Integrated Nested Laplace Approximation, or INLA.

INLA was first proposed in \cite{rue2009approximate} to specifically address the issue of Bayesian Inference of high dimensional Latent Gaussian Random Fields, LGRFs.  An example of this for count data is:
\begin{align}
Y(s_i)&\sim \mbox{Pois}(\mu(s_i,t)) \label{eq:model}\\
\mu(\boldsymbol{s_i,t})&=\exp(\lambda(\boldsymbol{s_i},t))\nonumber\\
\lambda(\boldsymbol{s_i},t) &= \beta_0 + \boldsymbol{Z}^t \boldsymbol{\beta} + X(\boldsymbol{s_i,t})\nonumber\\
\boldsymbol{X}& \sim \mbox{Gau}(\boldsymbol{0},Q^{-1}(\theta))\nonumber
\end{align}

INLA is often preferable over MCMC for these types of models.  An issue with traditional Markov Chain Monte Carlo (MCMC) techniques for these models is that the dimension of $X$ is often very large.  Therefore, while MCMC has $O_p(N^{-1/2})$ errors, the $N$ in the errors is the simulated sample size for the posterior.  Just getting $N=1$ may be extremely difficult due to the vast number of elements of $X$ that need to be estimated.  In general, MCMC will take hours or days in order to successfully simulate from the posterior making the computational cost of fitting multiple process models extremely high.  In \cite{python2016bayesian}, terrorism data was fit using a grid over the entire planet using INLA, though without self-excitation in the model.

To address the issues with MCMC use in LGRFs, \cite{rue2009approximate} developed a deterministic approach based on multiple Laplacian approximations. A LGRF is any density that can be expressed as
\begin{equation}
\pi(\boldsymbol{\theta},\boldsymbol{X} |\boldsymbol{Y}) \propto \pi (\theta)|Q(\boldsymbol{\theta})|^{1/2}\exp \left[\frac{-1}{2}\boldsymbol{X}^t Q(\boldsymbol{\theta}) \boldsymbol{X}+\sum_{\boldsymbol{s}} \log\left(\pi(Y(\boldsymbol{s_i})|X(\boldsymbol{s_i}),\boldsymbol{\theta})\right)\right]
\end{equation}
In order to conduct inference on this model, we need to estimate $\pi(\boldsymbol{\theta}|\boldsymbol{y})$, $\pi(\theta_i|\boldsymbol{y})$ and $\pi(x_i|\boldsymbol{y})$.  The main tool \cite{rue2009approximate} employ is given in their equation (3) as
\begin{equation}
\tilde{\pi}(\boldsymbol{\theta}|\boldsymbol{Y})\propto \frac{\pi(\boldsymbol{X},\boldsymbol{\theta},\boldsymbol{Y})}{\tilde{\pi}_G (\boldsymbol{X}|\boldsymbol{\theta},\boldsymbol{Y})}|_{X=x^*(\boldsymbol{\theta})}\label{eq:main}
\end{equation}

In \cite{rue2009approximate} they note that the denominator of \eqref{eq:main} almost always appears to be unimodal and approximately Gaussian.  The authors then propose to use a Gaussian approximation to $\pi(\boldsymbol{X}|\boldsymbol{\theta},\boldsymbol{Y})$ which is denoted above as $\tilde{\pi}_G$.  Moreover, \eqref{eq:main} should hold no matter what choice of $\boldsymbol{X}$ is used, so a convenient choice for $\boldsymbol{X}$ is the mode for a given $\theta$, which \cite{rue2009approximate} denote as $x^*(\boldsymbol{\theta})$.

Now, $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ can be explored by calculating the marginal for choices of $\theta$, which if chosen carefully can greatly decrease the computational time.  These explored values can then be numerically integrated out to get credible intervals for $\pi(\theta_i|\boldsymbol{Y})$.

Following the exploration of $\theta|Y$, and computation of $\theta_i|Y$, INLA next proceeds to approximate $\pi(X(s_i)|\boldsymbol{\theta},\boldsymbol{Y})$.  The easiest way to accomplish this is to use the marginals that can be derived straightforwardly from $\tilde{\pi}_G(\boldsymbol{X}|\boldsymbol{\theta},\boldsymbol{Y})$ from \eqref{eq:main}.  In this manuscript we will use this technique for simplicity of computation, however, if the latent states are of interest in the problem (and they often are), this can be problematic as it fails to capture any skewness of the posterior of $\boldsymbol{X}$.  One way to correct this is to re-apply \eqref{eq:main} in the following manner:
\begin{equation}
\tilde{\pi}_{LA}(X(\boldsymbol{s_i})|\theta,y)\propto \frac{\pi(\boldsymbol{X},\theta,\boldsymbol{Y})}{\tilde{\pi}_G(\boldsymbol{X}_{-s_i}|X(s_i),\boldsymbol{\theta},\boldsymbol{Y})}|_{x_{-i}=\boldsymbol{x_{-i}}^*(x_i,\theta)}\label{eq:remain}
\end{equation}
In \eqref{eq:remain} $\boldsymbol{X}_{-s_i}$ is used to represent $\boldsymbol{X}$ with latent variable $X(s_i)$ removed.  This is a reapplication of Tierney and Kadane's marginal posterior density and gives rise to the nested term in INLA.

\subsection{Laplace Approximation for Spatio-Temporal Self-Exciting Models}
While INLA is an attractive technique due to computational speed and implementation, it is not immediately usable for the SCSE and the RDSE as the structure in \eqref{eq:gen Model} is
\begin{align}
\mu(\boldsymbol{s_i},t) & = \exp(X(\boldsymbol{s_i},t)) + \eta Y(\boldsymbol{s_i},t-1)\nonumber\\
& \eta \in (0,1)
\end{align}
In this structure, $X(.)$ and $Y(.)$ are not linearly related and a Gaussian prior for $\eta$ is clearly not appropriate due to the parameter space constraints.


However, Laplace approximations can still be used by conducting inference on $\eta$ at the same time inference is conducted on the the set of latent model parameters.  In both the Spatially Correlated Self-Exciting Model and the Reaction Diffusion Self-Exciting model, the full conditional for the latent state is
\begin{equation}
\footnotesize\pi(\boldsymbol{X}|\boldsymbol{Y},\boldsymbol{\theta}) \propto \exp\left(-\frac{1}{2}\boldsymbol{X}^T \boldsymbol{Q(\boldsymbol{\theta})}\boldsymbol{X} + \sum_{s_i,t} \log \pi\left(Y(\boldsymbol{s_i},t)|X(\boldsymbol{s_i},t),\eta,Y(\boldsymbol{s_i},t-1)\right)\right)\label{eq:FullCond}
\end{equation}
Here we will let $\boldsymbol{\theta}=(\theta_1,\sigma^2,\eta)^T$ and $\boldsymbol{Q_{sc}(\boldsymbol{\theta})}=(\boldsymbol{I}_{sn,sn}-\theta_1\boldsymbol{I}_{t,t} \otimes \boldsymbol{H})$ for the Spatially Correlated Self-Exciting Model and use $\boldsymbol{Q_{rd}(\boldsymbol{\theta})}$ for the RDSEM defined in \eqref{eq:Prec}.

While $\boldsymbol{\theta}$ in \eqref{eq:FullCond} does not contain $\eta$ we next do a Taylor series expansion of \begin{equation*}\log \pi\left(Y(\boldsymbol{s_i},t)|X(\boldsymbol{s_i},t),\eta,Y(\boldsymbol{s_i},t-1)\right),\end{equation*} as a function of $X(\boldsymbol{s_i},t)$ and, for each $\boldsymbol{s_i},t$, expand the term about a guess for the mode, say $\mu_0(\boldsymbol{s_i},t)$.  First we write $\boldsymbol{B^*}(\boldsymbol{\theta}|\mu_0)$ as a vector of the same length as $X(\boldsymbol{s_i},t)$ where each element is given by
\begin{equation}
B(\boldsymbol{s_i},t|\mu_0)=\left(\frac{\partial \log\pi\left(Y(\boldsymbol{s_i},t)\right)}{\partial X(\boldsymbol{s_i},t)}\Bigr|_{X(\boldsymbol{s_i},t)=\mu(\boldsymbol{s_i},t)}-\mu(\boldsymbol{s_i},t) \frac{\partial^2\log\pi\left(Y(\boldsymbol{s_i},t)\right)}{\partial X(\boldsymbol{s_i},t)^2}\Bigr|_{X(\boldsymbol{s_i},t)=\mu(\boldsymbol{s_i},t)}\right)\label{eq:B(si)}
\end{equation}

Next, we further define
$\boldsymbol{Q^*(\boldsymbol{\theta})}|\mu_0$ as the updated precision matrix.
\begin{equation}
\boldsymbol{Q^*(\boldsymbol{\theta})|\mu_0}=\boldsymbol{Q(\boldsymbol{\theta})}+\text{diag }\left(-\frac{\partial^2\log \pi\left(Y(\boldsymbol{s_i},t)\right)}{\partial X(\boldsymbol{s_i},t)^2}\right)\Bigr|_{X(\boldsymbol{s_i},t)=\mu(\boldsymbol{s_i},t)} \label{eq:Precision at Mode}\\
\end{equation}
Where $\boldsymbol{Q(\boldsymbol{\theta})}$ is either $\boldsymbol{Q_{sc}(\boldsymbol{\theta})}$ or $\boldsymbol{Q_{rd}(\boldsymbol{\theta})}$ depending on the context.  Then we can write
\begin{equation}
\footnotesize\pi(\boldsymbol{X}|\boldsymbol{Y},\boldsymbol{\theta}) \propto \exp\left(-\frac{1}{2}\boldsymbol{X}^T\left( \boldsymbol{Q^*(\boldsymbol{\theta})|\mu_0}\right)\boldsymbol{X} + \boldsymbol{X}^T\left( \boldsymbol{B^*}(\boldsymbol{\theta})|\boldsymbol{\mu_0}\right)\right) \label{eq:FullCondExpand}
\end{equation}


While in \eqref{eq:Precision at Mode}, $\boldsymbol{Q}(\boldsymbol{\theta})$, the original precision matrix, does not contain $\eta$, $\boldsymbol{Q^*(\boldsymbol{\theta})}$, the updated precision matrix, does depend on the self-excitation parameter.


Next we find the values of $\mu(\boldsymbol{s_i})$ that maximize \eqref{eq:FullCondExpand}.  This is done through the use of an iterative maximization algorithm by solving for $\boldsymbol{\mu_1}$ in $\left(Q^*(\boldsymbol{\theta})|\boldsymbol{\mu_0}\right)\boldsymbol{\mu_1}=\boldsymbol{B^*}(\boldsymbol{\theta}|\mu_0)$. For a fixed $\boldsymbol{\theta}$, this converges rapidly, due to the sparsity of both $\boldsymbol{Q_{sc}}$ and $\boldsymbol{Q_{rd}}$.  .

In \eqref{eq:main}, for a fixed $\boldsymbol{\theta}$, we can then find $x^*(\boldsymbol{\theta})$.  When the denominator of \eqref{eq:main} is evaluated at $x^*(\boldsymbol{\theta})$ it becomes $|\boldsymbol{Q^*(\boldsymbol{\theta})}\frac{1}{2\pi}|^{1/2}$ which is equivalent to the hyperparameter inference recommended by \cite{lee1996hierarchical} as pointed out by R. A. Rigby in \cite{rue2009approximate}.

In order to best explore $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ the posterior mode is first found through a Newton-Raphson based method.  In order to do this we approximate the Hessian matrix based off of finite difference approximation to the second derivatives.


After locating the posterior mode of $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$, the parameter space can be explored using the exploration strategy laid out in section 3.1 of \cite{rue2009approximate}.

Now, for the set of diffusion parameters, $\boldsymbol{\theta}$ which contain $\eta$, we have a method of estimating $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$.  Inference for any further data model covariates can now be conducted in the same manner as done in \cite{rue2009approximate}.
\subsection{Model Comparison and Goodness of Fit}
In order to conduct model comparison, we will use the deviance information criterion (DIC) originally proposed by \cite{spiegelhalter2002bayesian}.  Goodness of fit will be conducted through the use of posterior predictive p-values, outlined by \cite{gelman1996posterior}.

To approximate the DIC, we first find the effective number of parameters for a given $\boldsymbol{\theta}$.  As noted in \cite{rue2009approximate}, we can estimate this by using $n-\text{tr}\left(\boldsymbol{Q(\theta)}\boldsymbol{Q^*(\theta)}^{-1}\right)$ for both the SCSEM and the RDSEM.  This gives the effective number parameters for a given $\boldsymbol{\theta}$, which can then be averaged over $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ to get the effective number of parameters for the model.

Secondly, we calculate the deviance of the mean
\begin{equation}
-2\sum_{s_i,t} \log \pi\left(Y(s_i,t)|\hat{X}(s_i,t),\boldsymbol{\theta^*}\right)
\end{equation} where $\boldsymbol{\theta^*}$ is the posterior mode and $\hat{X}(s_i,t)$ is the expectation of the latent state fixing $\theta=\theta^*$.  DIC can then be found through deviance of the mean plus two times the effective number of parameters as in chapter 7 of \cite{gelman2014bayesian} and initially recommended by \cite{spiegelhalter2002bayesian}.

In order to assess goodness of fit in analyzing the terrorism data in Section 5, we will use posterior predictive P-values as described by \cite{gelman1996posterior}.  Here, we pick critical components of the original dataset that we wish to see if the fitted model can accurately replicate, for instance the number of zeros in the dataset which we can designate as $T(\boldsymbol{Y})$.  Next, for an index $m=1...M$, We then draw a value of $\boldsymbol{\theta_m}$ according to $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ and simulate a set of observations $Y^*(\boldsymbol{s_i,t})_m$ of the same dimension as $\boldsymbol{Y}$ and compute $T(\boldsymbol{Y}^*_m)$.  This process is repeated M times and a posterior predictive p-value is computed as $\frac{1}{M}\sum_{m=1}^M I\left[T(\boldsymbol{Y}^*_m) > T(\boldsymbol{Y}) \right]$ where $I\left[ . \right]$ is the indicator function.  While not a true P-value, both high and low values of the posterior predictive p-value should cause concern over the fitted models ability to replicate features of the original dataset.

\section{Simulation}

In order to validate the Laplace based methodology for spatially correlated self-exciting models we conducted simulation studies using data on a 8 by 8 Spatial grid assuming a rook neighborhood structure.  In order to decrease the edge effect, we wrapped the grid on a torus so each node had four neighbors.  For each grid location we simulated 100 observations, creating a spatio-temporal model that had 6400 observations, meaning in \eqref{eq:main}, $\boldsymbol{Q}(\boldsymbol{\theta})$ had a dimension of $6400 \times 6400$.

In the first simulation we used \eqref{eq:Full Model} fixing the parameters at values that generated data that appeared to resemble the data from Iraq used in Section 5.  The generating model we used was:

\begin{align}
& Y(\boldsymbol{s_i},t)|\mu(\boldsymbol{s_i},t) \sim \text{Pois }(\mu(\boldsymbol{s_i},t)) \label{eq:First Simulation}\\
& \mu(\boldsymbol{s_i},t) = \exp(-1+X(\boldsymbol{s_i},t)) + .2 Y(\boldsymbol{s_i},t-1) \nonumber \\
& X(\boldsymbol{s_i},t) = .22 \sum_{\boldsymbol{s_j}\in N(\boldsymbol{s_i})}X(\boldsymbol{s_j},t) + \epsilon(\boldsymbol{s_i},t) \nonumber\\
&\epsilon(\boldsymbol{s_i},t) \sim \mbox{Gau}(0,.4) \nonumber
\end{align}

The spatial parameter for model was $\theta_1=.22$ which suggests a positive correlation between spatially adjacent locations.  An $\eta$ value of 0.2 would suggest that each event that occurs at one time period increases the expected number of events at the next time period by .2.  Here we fix $\sigma^2$ was fixed at 0.4 and use a value of $\beta_0=-1$  to reflect that in real world applications the latent process likely is not zero mean.

Once the data were generated, we found $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ by applying \eqref{eq:main}.  Here we note that the numerator of \eqref{eq:main} is $\small\pi(\boldsymbol{X},\boldsymbol{\theta},\boldsymbol{Y})=\pi(\boldsymbol{Y}|\boldsymbol{X},\eta,\boldsymbol{\theta})\pi(\eta)\pi(\boldsymbol{X}|\theta_1,\sigma^2)\pi(\theta_1)\pi(\sigma^2)$ which requires a prior specification for $\eta$,$\theta_1$, and $\sigma$.  In order to reflect an a-priori lack of knowledge we choose vague priors for all parameters.   In this model, we use a Half-Cauchy with scale parameter of 25 for $\sigma$ and a Uniform ($\psi_{(1)}^{-1},\psi_{(n)}^{-1}$) where $\psi_{(i)}$ is the $i$th largest eigen value of the spatial neighborhood.  As we used a shared-border, or rook, neighbor structure wrapped on a torus, the parameter space is (-0.25,0.25) as each spatial location has four neighbors.  The choice of the Half-Cauchy is in line with the recommendations for vague priors for variance components of hierarchical models as outlined in \cite{gelman2006prior} and rigorously defended in \cite{polson2012half}.  We let the prior for $\eta$ be Uniform(0,1).

Using a gradient descent method with step-halving we found the posterior mode of $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ to be $\sigma^2=0.32$, $\theta_1=0.22$, and $\eta=0.20$. Using the z based parameterization described in Section 3.1 we next explored the parameterization $\log\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ and found credible intervals of $\pi(\sigma^2|\boldsymbol{Y})=(0.29,0.36),\pi(\theta_1|\boldsymbol{Y})=(0.22,0.23)$ and $\pi(\eta|\boldsymbol{Y})=(.18,.21)$.  Fixing $\boldsymbol{\theta}$ at the posterior mode, we then found an approximate 95\% credible interval for $\beta_0$ to be (-1.67,.07).

The posterior maximum and credible interval for $\sigma^2$ appear to be slightly lower than expected, but the remaining parameter credible intervals covered the generating parameter.

Next we simulated from the reaction-diffusion self-excitation model letting $\beta_0=0$, $\alpha=0.1$, $\kappa=0.2$, $\sigma^2=0.25$, and $\eta=0.4$

In fitting the model, we again use vague priors for all the parameters.  Again, we place a Half-Cauchy prior on $\sigma^2$ as described above.  In order to conform to the parameter space of $\alpha$ and $\kappa,$ we let $\pi(\alpha)\sim \text{Unif }(0,1)$ and $\pi(\kappa|\alpha)\sim\text{Unif }(-\frac{\alpha}{2},1-\frac{\alpha}{2})$.


Again using the Laplace approximation technique of section 3, we found the posterior mode of $\pi(\boldsymbol{\theta}|\boldsymbol{Y})$ to be at $\alpha=0.085$, $\kappa=0.19$, $\sigma^2=0.21$, $\eta=0.35$.  95\% credible intervals for the posterior marginals were $\alpha \in (0.07, 0.10)$, $\kappa \in (0.14,0.24)$, $\sigma^2 \in (0.18,0.24)$, and $\eta \in (0.32, 0.40)$.  At the posterior mode of $\boldsymbol{\theta}$, the  posterior marginal for $\beta_0$ was approximately (-0.03,0.01).  Critically, if there is self-excitement in the data, in all simulations it was differentiable from the latent diffusion.  This is a spatial-temporal analogue to the finding in \cite{mohler2013modeling} where a temporal AR(1) process was differentiable from self-excitement.


In our simulations, the approximations described in this manuscript performed reasonably well for inference on the spatio-temporal diffusion parameters in most cases.  However, when $\sigma^2$ is large, or when $\eta$ is large, we have found that the approximations create bias in one or more of the parameters likely due to the high effective number of parameters.  However, all approximate likelihood based methods will likely struggle in these cases as well.  As noted in \cite{rue2009approximate}, the approximation error in Laplace based methods is related to the number of effective latent variables over the total sample size.


\section{Spatio-Temporal Diffusion of Violence in Iraq (2003-2010)}

\subsection{Statistical Models and Data}

One region where the reasons for the diffusion of terror and crime still remains unclear is in Iraq during 2003 to 2010.  While violence undoubtedly spread throughout the country, it remains unclear how or why, spatio-temporally, the spread occurred.  Part of the uncertainty is that there still is not agreement over whether violence was due to insurgency, civil war, or organized crime. For example, \cite{hoffman2006insurgency} refers to the violence in Iraq as an insurgency, \cite{fearon2007iraq} argues that the spread of violence was due to a civil war, and \cite{williams2009criminals} argues that there was a large presence of organized crime in the country.

A few previous studies have examined the presence or absence of self-excitation.  In \cite{lewis2012self}, the authors concluded that self-excitation was present in select cities in Iraq during this time period.  The presence of the self-excitation finding was echoed in \cite{braithwaite2015battle} where the authors also noted a correlation between locations that shared microscale infrastructure similarities.  This would suggest repeat or near-repeat actions were causing the increase in violence in a region.

However, in both of these cases, the structural form of the latent spatio-temporal diffusion was, a-priori, assumed to be known.  In fact, this is likely not the case. In a classic work on the subject, \cite{midlarsky1980violence} discuss how heterogeneity between locations can cause correlation in violence or individuals who cause violence can actually physically move from one location to another.  In particular, if violence is strictly due to crime we would expect self-excitement and limited diffusion between geographical regions.  Whereas if violence is due to insurgencies we would expect more movement of actors as they seek to create widespread disruption in the country.
The former theory is reflected in the Spatial Correlation in the Spatially Correlated Self-Exciting model and the later theory would correspond to the Reaction Diffusion component of the second model.

The overarching goal of this analysis, thus, is to determine whether in Iraq the growth of violence in fixed locations was due to the presence or absence of self-excitation.  Furthermore, we want to determine whether the latent diffusion of violence is due to the movement of population such as in the Reaction Diffusion model, or whether there is static spatial correlation.  We will answer this while controlling for exogenous factors that may also explain the rise in violence in a region.


In order to address this question as well as demonstrate how Laplace based approximations can be used to fit real world data to models of the class of \eqref{eq:model} we used data from the Global Terrorism Database (GTD) introduced in \cite{lafree2007introducing} to examine the competing theories.  The GTD defines terrorist events as events that are intentional, entail violence, and are perpetrated by sub-national actors.  Additionally, the event must be aimed at obtaining a political, social, religions, or economic goal and must be conducted in order to coerce or intimidate a larger actor outside of the victim.  The majority of lethal events in Iraq from 2003-2010 fit the above category.

The GTD uses a variety of open media sources to capture both spatial and temporal data on terroristic events.  The database contains information on what the event was, where it took place, when it took place, and what terrorist group was responsible for the event.  From 2003 to 2010 in Iraq, the database contains 6263 terrorist events, the spatial structure is shown in figure \ref{SpaceIZ}.

\begin{figure}[h] % figuur 1
	\begin{center}
		\vspace{6pc}
		\includegraphics[width=.8 \linewidth]{IncidentsOverPop.pdf}
		\isucaption[Location of Violent Events in Iraq]{Spatial depiction of 6263 events in Iraq.}
		\label{SpaceIZ}
	\end{center}
\end{figure}

As seen in this map, the majority of the violence is in heavily populated areas such as Baghdad and in the regions north up the Tigris river to Mosul and west through the Euphrates river.  In order to model this data, we aggregated the point data to 155 political districts intersected by ethnicities and aggregated the data monthly for 96 months meaning that $\Sigma(\theta)$ in \eqref{eq:gen Model} is a 14880 x 14880 matrix.  Population for each political district was taken from the Empirical Studies of Conflict Project website, available from https://esoc.princeton.edu/files/ethnicity-study-ethnic-composition-district-level.

We considered covariates controlling for the population density within a fixed region as well as for the underlying ethnicity.  We will make the simplifying assumption that both of these are static over time.  Previous statistical analysis on terrorism considered macro level covariates, such as democracy in \cite{python2016bayesian} that differ country to country but would not change within a single country as analyzed here.  Other studies considered more micro level covariates such as road networks that were found to be statistically related to terrorism in \cite{braithwaite2015battle}.  Here we take the view point that the vast majority of the incidents in Iraq were directed against individuals rather than terrorist events directed at fixed locations.  Therefore, we would expect a higher population density to provide more targets for a potential terrorist to attack.  Furthermore, covariates such as road networks, number of police, or number of US soldiers, would all be highly collinear with population density.  We do, though, consider a covariate for ethnicity in a region.  Specifically, we add an indicator if the region is predominately Sunni.    The disenfranchisement of the Sunnis and high level of violence in Sunni dominated areas has been well established, see for e.g. \cite{baker2006iraq}.  Previous research in \cite{linke2012space} focusing on Granger Causality also suggested indicators for majority Sunni/Shia may be appropriate in any analysis of violence in Iraq.


\begin{align}
& Y(\boldsymbol{s_i},t)|\mu(\boldsymbol{s_i},t) \sim \text{Pois }(\mu(\boldsymbol{s_i},t)) \label{eq:IZ Model}\\
& \mu(\boldsymbol{s_i},t) = \exp[\beta_0+\beta_1\log \text{Pop}(\boldsymbol{s_i})+\beta_2\text{Sunni }(\boldsymbol{s_i})+X(\boldsymbol{s_i},t)] + \eta Y(\boldsymbol{s_i},t-1) \nonumber \\
& \boldsymbol{X}\sim \mbox{Gau}(\boldsymbol{0},\boldsymbol{Q}^{-1}(\theta))  \nonumber
\end{align}

The complete statistical model is given in \eqref{eq:IZ Model}.  We next fit this model letting $\boldsymbol{Q}=\boldsymbol{Q_{sc}}$ and $\boldsymbol{Q}=\boldsymbol{Q_{rd}}$.  We further consider fixing $\eta=0$ to test the presence or absence of self-excitement in the data for both process models.


\subsection{Results}


We fit all four models using the Laplace approximation method described in Section 3.1.  For each of the parameters we used vague proper priors to ensure posterior validity.  In the SCSEM and the Spatially Correlated models we used a Half-Cauchy with scale parameter of 5 for $\sigma$ and a Uniform ($\psi_{1}^{-1},\psi_{n}^{-1}$) where $\psi$ are the eigenvalues of $\boldsymbol{H}$.  Using the neighborhood structure corresponding to the geographical regions described above, this corresponded to a Uniform (-.3,.13). For each of the exogenous covariates, we used independent Gaussian (0,1000) priors.  For the SCSEM model we further assumed a Uniform (0,1) prior for $\eta$.

In fitting the RDSEM, we again used a Half-Cauchy with scale parameter of 5 for $\sigma$.  For the decay parameter, $\alpha$, we used a Uniform (0,1) prior and for the diffusion parameter, $\kappa$, we chose a Uniform $\left(\frac{-\alpha}{2},1-\frac{\alpha}{2}\right)$ in order to ensure we were in the allowable parameter space.


\begin{table}[h]
	\isucaption[Credible intervals for inference of violence in Iraq]{95\% Credible Intervals for Model Parameters} \label{tab:params}
	\begin{center}
		\begin{tabular}{ |c|c|c|c|c| }
			\hline
			& Spatial Correlation Only & SCSEM & Reation Diffusion Only & RDSEM \\
			\hline
			$\beta_0$&(-20.2,-18.4) &(-16, -15.5) &(-22.4,-18.0) &(-22,-18.6 )\\
			$\beta_1$& (1.2,1.4)&(0.9, 1.1) &(1.1, 1.4) &(1.1, 1.5)\\
			$\beta_2$& (1.6,1.9)&(1.1,1.3) &(0.10, 0.33) &(0.17, 0.53)\\
			$\eta$& - &(0.35,0.37) & -  &(0, 0.04)\\
			$\sigma^2$&(1.9,2.7) &(2.1, 2.5) & (0.20, 0.30)&(0.18,0.26)\\
			$\theta_1$&(.08,.10) &(0.09,0.1) & - & - \\
			$\alpha$& - & - & (0.001, 0.007) &(0.001, 0.007)\\
			$\kappa$& - & - & (0.03, 0.07) &(0.03, 0.06) \\
			\hline
		\end{tabular}
	\end{center}
\end{table}


All four models took approximately 30 min to an hour depending on starting values to converge using a Newton-Raphson based algorithm to find the maximum.  Gaussian approximations to the 95\% credible intervals for the parameters are given for all four models are shown in table \ref{tab:params}.  As can be seen in comparing the SCSEM to the RDSEM, the presence or absence of self-excitation appears to be dependent on the choice of structure of $\boldsymbol{Q}$.  Furthermore, the impact of majority Sunni is also dependent on whether the Reaction Diffusion or Spatially Correlated model was used.

Using the methodology described in Section 3.2, we next calculated DIC as well as posterior predictive P-values based off of the maximum observed value and the number of zeros in the dataset.  In the original dataset, the maximum number of events observed for all regions and months was 26 and the dataset had 13445 month/district observations that were zero. The model assessment and selection results are shown in Table \ref{tab:results}.
\begin{table}
	\isucaption[Model assessment and selection statistics for Iraq]{Model Assessment and Selection Statistics For Iraq Data} \label{tab:results}
	\begin{center}
		\begin{tabular}[H]{|c|c|c|c| }
			\hline
			Model & DIC & \begin{tabular}{@{}c@{}}P-Values\\ Maximum Value \end{tabular} & \begin{tabular}{@{}c@{}}P-Value \\ Zeros\end{tabular} \\

			\hline
			\begin{tabular}{@{}c@{}}Spatially Correlated Model \\ without Self-Excitation \end{tabular} & 9370 & .02 & 1 \\
			\hline
			\begin{tabular}{@{}c@{}}Spatially Correlated \\ Self-Exciting Model \end{tabular} &  8722 & .97 & 0\\
			\hline
			\begin{tabular}{@{}c@{}}Reaction Diffusion \\ Only Model \end{tabular} & 8664 & .53 & .81\\
			\hline
			\begin{tabular}{@{}c@{}}Reaction Diffusion \\ Self-Exciting Model \end{tabular} & 8699 & .45 & .89\\
			\hline
		\end{tabular}
	\end{center}
\end{table}

Clearly from table \ref{tab:results}, the models with an underlying reaction diffusion process model outperform those with spatial correlation only.  Furthermore, the addition of self-excitation in the model appears to have minimal impact.  In particular, without self-excitation, the spatial correlation model tends to under count the number of violent activities while the SCSEM tends to over count.  There really is not much difference between the RDSEM and the reaction diffusion model so we prefer the simper reaction diffusion only model. While $\beta_2$ is only minimally significant in the reaction diffusion model, the models perform better including the covariate than disregarding it entirely.

\subsection{Significance}

Under all measures of performance, the reaction diffusion model, \eqref{eq:ReacDiffuse Model}, without self-excitation outperforms the other models under consideration.  This process model as well as values of the covariates and the lack of self-excitement in the data offer several insights into the causes of the spread of violence in Iraq.

Not surprisingly, the reaction diffusion model has a positive relationship between log population and violence.  As the majority of attacks are directed at individuals it would clearly follow that regions that have higher population will offer more targets as well as more potential combatants.  Further, higher populated areas also would have had higher number of Iraqi government officials as well as US military presence.  The positive, though small, relationship between Sunni and violence is also not surprising as, in general, predominately Sunni areas were generally more disenfranchised following the transition to a new Iraqi government after the downfall of Saddam Hussein.

More significantly, though, was that the reaction diffusion model fit the data better than the SCSEM or the RDSEM.  This suggests that increases in violence can be attributed, at least in part, to movement between high violence and low violence areas rather than repeat or near-repeat actors in a fixed location.


While the $\kappa$ parameter may appear small in \ref{tab:params} it still has an impact on the process.  For the sake of simplicity, we can demonstrate this on a three node system.  For this system we consider a central node that has a high level of violence surrounded by two nodes that have a low level of violence. In this set up we will fix $\beta_0=-19$, $\beta_1=1.3$ and consider each node as having a population of 1000 and let $\kappa \in \{.03,.07\}$.  The resultant system over time is shown in Figure \ref{fig:kappa}.   Even in this simple system, there is a noticeable increase in violence as the center node diffuses throughout the entire system.


\begin{figure}[h]
	\vspace*{.51cm}
	\centering
	\includegraphics[width=8cm]{kappasensitivity}
	\isucaption[Sensitivity of response to change in diffusion parameter]{This plot shows the expected changes in violence in a simple three node system where the center node starts with a high level of violence and the other two nodes start with a low level of violence for $\kappa=.03$, depicted as dashed lines in above figure, and $\kappa=.07$, depicted as straight lines.  As seen here after 12 months for $\kappa=.07$ the nodes are essentially at equilibrium. }
	\label{fig:kappa}
\end{figure}

The implications of the reaction-diffusion model being preferable over the SCSEM or the spatial correlation only model can be seen by going back to the original PDE that inspired the model, \eqref{eq:Reaction}.  The underlying assumption in that model is that violence is that the rate of violence spreading to a region is determined through the levels of violence in neighboring region.  From a military planning perspective, this would suggest that if there is a peaceful region surrounded by areas of high violence, the peaceful region should be isolated to prevent the movement of malicious actors.  This strategy would be consistent with published military strategy as outlined in \cite{army2006counterinsurgency}.


Finally, this offers insight into the nature of the conflict that was fought in Iraq.  For instance, \cite{zhukov2012roads} discuss how insurgencies diffuse throughout a population by either physical movement of actors or through movement of ideas, whereas \cite{short2008statistical} suggest that criminal violence would be expected to have an element of self-excitation.  When accounting for the possibility of spatial diffusion, this self-excitation does not appear to be present in the Iraqi dataset. Therefore, as a diffusion based model fits the data better, this would suggest the spread of violence was due to the physical movement of an insurgent population or ideology rather than a criminal element that would be expected to stay more static at a location.


\section{Discussion}

In this manuscript we develop statistical models that allow for spatio-temporal diffusion in the process model and temporal diffusion in the data model.  We relate the models to existing theory in how violence diffuses in space and time.  We further developed a Laplace approximation for spatio-temporal models that contain self-excitement.  This modification allows for a quick and accurate fit to commonly used models in both the analysis of terrorism and criminology. A critical difference between classical INLA and our proposal is that in our proposal, inference is not only performed on the hyperparameters during the exploration of $\pi(\theta|Y)$ in \eqref{eq:main} but also on the self-excitation parameter $\eta$.  While $\eta$ is not generally thought of as a hyperparameter, when the linear expansion of the log-likelihood is done in \eqref{eq:FullCondExpand}, $\eta$ enters into $\boldsymbol{Q^*}$ and $\boldsymbol{B^*}$ in a similar manner as the hyperparameters.

While we only considered two process models and self-excitation that only exists for one time period, the methodology outlined above can easily be extended to allow self-excitation to have an exponential decay similar to the modeling technique of \cite{mohler2011self}.  As shown above, the absence of testing multiple process models may result in premature conclusions about how violence is spreading over regions.  While self-excitation may be present in one model, its significance may be dulled through the use of alternative process models resulting in differing conclusions.

Although self-excitation has become increasingly popular, alternate approaches based on Besag's auto-logistic model, as used in \cite{weidmann2010predicting} are possible, though care must be taken if count data is used as Besag's auto-Poisson does not permit positive dependency. As shown in \cite{kaiser1997modeling}, a Winsorized Poisson must be used if positive dependency is desired, as it most certainly is in terrorism modeling.  In this case, the data model dependency would linearly be associated with the log of $\mu(\boldsymbol{s_i},t)$.

Though the motivation for the models in this manuscript was the spatio-temporal spread of violence, the novel concept of combining latent process dependency and data model dependency has the potential to be used in other fields.  For example in the modeling of thunderstorms, self-excitation may be present temporally, while process model dependency may also be appropriate due to small-scale, unobservable, spatial or spatio-temporal dependency.  Laplace approximations, as demonstrated in this manuscript, allow for quick and relatively accurate methods to fit multiple types of self-exciting spatio-temporal models for initial inference.
\chapter{A Spatially Correlated Auto-Regressive Model For Count Data}\label{JASA}
\begin{center}
	A paper to be submitted to \textit{The Journal of the American Statistical Association}
\end{center}
\section*{Abstract}
	The statistical modeling of multivariate count data observed on a space-time lattice has generally focused on using a hierarchical modeling approach where space-time correlation structure is placed on a continuous, unobservable, process.  The data distribution is then assumed to be conditionally independent given the latent process.  However, in many real-world applications, especially in the modeling of criminal or terrorism data, this conditional independence between the observed counts may be inappropriate.  In the absence of spatial correlation, the Integer Auto-Regressive Conditionally Heteroskedastic (INGARCH) process could be used to capture this data model dependence however this model does not allow for any unexplained spatial correlation in the data. In this manuscript we propose a class of models that extends the INGARCH process to account for small scale spatial variation, which we refer to as a SPINGARCH process. The resulting model allows both data model dependence as well as dependence in a latent structure.  We demonstrate how second-order properties can be used to differentiate between models in this class.  Finally, we apply Bayesian inference for the SPINGARCH process demonstrating its use in modeling the spatio-temporal structure of burglaries in Chicago from 2010-2015 and demonstrate how accounting for spatial correlation changes the conclusion on the existence of repeat victimization.


\section{Introduction}

The modeling of count data where each observation takes place on a space-time lattice arises in multiple disciplines.  In the disease literature, the number of infected patients is often aggregated over geographic areas and discrete times to protect the confidentiality of patients.  In the modeling of terrorism or criminal acts, that we consider in this manuscript, data is often presented aggregated over time and space for security reasons.  Even for spatial continuous and temporally continuous data, the analysis is often performed aggregated over fixed spatial and temporal domains as a matter of convenience.  The challenge, then, is how to appropriately model the relationship between observations.  Assumptions on either the spatial relationship or the temporal relationship between observations are necessary if any statistical analysis is to be performed.  In this paper we present a novel approach for structuring space-time dependency for count data through a combination of spatial dependence in a latent, process model, and temporal dependence in the data model.

In the spatial statistics literature, an early attempt at structuring spatial relationships for count data was made in \cite{besag1974spatial} where the data model distribution was conditionally specified given a fixed spatial neighborhood.  However, as shown in \cite{besag1974spatial}, this results in a statistical model that only allows negative correlation.  More recently, \cite{kaiser1997modeling} demonstrated how modifications could be made to the statistical model that allowed both negative and positive correlation.  A similar methodology was employed in \cite{augustin2006using} to address the spatial dynamics of seed count data in agricultural models.  The critical assumption in these classes of models is the distribution of the observed counts can be conditionally specified from the observed counts at spatial neighbors, a Markov assumption in space.

Advances in computation and Bayesian inference have also allowed for modeling through spatial hierarchical models similar to the Poisson log-Normal approach of \cite{aitchison1989multivariate}. Letting $\boldsymbol{s_i}$ be a discrete spatial location and $t$ be discrete time, spatial-temporal dependence can be introduced by assuming the existence of a latent process, $Y(\boldsymbol{s_i},t)\sim \mbox{Gau}(\boldsymbol{\alpha},\Sigma(\theta))$ that has a spatial-temporal structure characterized by $\Sigma(\theta)$.  The data model is then assumed to be independent given the latent state. The idea was extended to incorporate spatial dynamics in \cite{besag1991bayesian}.  Here a spatial Markov assumption was still made, but it was made in a latent, unobserved, continuous process.  The spatial observations were then assumed to be independent given the latent process.  This idea was also used in \cite{wolpert1998poisson}, who used a Poisson-gamma model with spatial dependence in the latent, gamma, structure.

The concept of allowing the spatial dependence to exist only in a latent field overcomes the difficulty of only negative spatial correlation that arises in the auto-Poisson model of \cite{besag1974spatial}.  Although this form of modeling only allows for limited dependence in the data as demonstrated in \cite{aitchison1989multivariate}, it has become commonplace in literature.  For example in \cite{goicoa2016age} mortality rates are studied using latent conditional effects for space, time, and age.  This approach also has given rise to specialized analytical techniques and software to conduct efficient inference, for example Integrated Nested Laplace Approximations of \cite{rue2009approximate} or spBayes of \cite{finley2007spbayes}.  However, this approach assumes that the dependence in the data is due to a latent, unobservable process which does not capture  repeat victimization that is believed to exist in violence as explained for example in \cite{polvi1990repeat}.  Repeat victimization is the belief that an observed crime or violent act increases the likelihood of a future crime occurring at that exact same spot or against the exact same person and can be modeled assuming a data model dependence or as an observation driven process.

While count data in the spatial statistics literature has predominately been addressed through structure in a latent process, in the time series literature it has evolved quite differently.  For example, the INGARCH model of \cite{ferland2006integer} and \cite{heinen2003modelling} is a time series model for counts where the data model is Poisson with expectation that is a function of both previous counts and previous expectations.  Specifically, if we let ${Z_t}$ be a time series of counts and $\mathcal{F}_t$ be the $\sigma$-field generated by ${Z_0,...,Z_t,\lambda_0}$, the INGARCH$(p,q)$ model is
\begin{equation}
	Z_t|\mathcal{F}_{t-1}\sim \mbox{Pois}(\lambda_t),\quad \lambda_t=d + \sum_{i=1}^p a_i \lambda_{t-i}+\sum_{j=1}^q b_j Z_{t-j}\label{eq:INGARCH}
\end{equation}

This results in a time series model that is a function of both the data model and a deterministic process model.  \cite{ferland2006integer} demonstrated how the INGARCH(1,1), given as $\lambda(s_i,t)=d+\kappa \lambda(s_i,t-1)+\eta Z(s_i,t-1)$, is analogous to an ARMA(1,1) for counts.  In \cite{fokianos2009poisson} it was shown that a perturbed INGARCH(1,1) model was geometrically ergodic giving a unique stationary distribution and asymptotic normality of the roots of the likelihood equations.  The stationary distribution of the INGARCH(1,1) process is also equivalent to a stochastic process given in \cite{hawkes1971spectra}, often called a self-exciting point process, where
\begin{align}
Z_t|\lambda_t& \sim \mbox{Pois}(\lambda_t)\\
 \lambda(t) & = \nu(t) + \int_{0}^{t} g (t-u) N(ds)\nonumber,
\end{align}
when the process is sampled at discrete times and $g(t-u)=\eta \exp(-\alpha (t-t_i))$.

While the INGARCH model was motivated to model univariate time series data, there has been some recent effort to apply it to multivariate count data.  \cite{heinen2007multivariate} used copulas to model the contemporaneous correlation.  However there are issues with using copulas for count data and it is generally less reliable and identifiable than the use of copulas for continuous data, as explained in \cite{genest2007primer}.  \cite{liu2012some} allows for a spatial lag dependency through treating $\lambda_t$ as a vector and replacing $a_i$ and $b_j$ with a series of matrices.  The author then allows for contemporaneous correlation through the bivariate Poisson.  These models, though, do not naturally extend previous spatial models to the INGARCH class nor do they capture how criminologists and others believe crime actually evolves over space and time.  Furthermore, as we will show, the variance to mean ratio for the INGARCH process dictates the range for the allowable autocorrelation limiting its practical use.

In this manuscript, we introduce a class of Self-Exciting Spatio-Temporal models for count data we refer to as Spatially Correlated Integer Auto-Regressive Conditionally Heteroskedastic, or SPINGARCH, models.  These models maintain many of the stationarity properties of the INGARCH process while allowing for spatial correlation through the addition of a latent log-Gaussian spatially correlated process.  We will demonstrate how the models arise from assumptions on how crime and violence evolves over space and time, how they retain the same stationarity properties as the INGARCH model and how they can be differentiated through assessment of second order characteristics.  The SPINGARCH model also allows a much wider range of second order properties affording the modeler more flexibility in describing the autocorrelation and variance to mean ratio of the data.  We will further show how to conduct inference and model assessment to differentiate between models within this class using burglaries in Chicago as an exemplar.


\section{General Model}

A self-exciting spatio-temporal model is characterized through the existence of a latent process that is allowed to have spatial correlation as well as a data process that has self-excitation, or a positive feedback mechanism.  Denote $Z(\boldsymbol{s_i},t)$ as the data model with $\boldsymbol{s_i} \in \{\boldsymbol{s_1},\cdots,\boldsymbol{s_{n_d}}\}$ as fixed spatial locations and $t \in \{1,\cdots,T\}$ as discrete points in time.  We next introduce $Y(\boldsymbol{s_i},t)$ as a latent random variable defined on $(\boldsymbol{s_i},t)$. Finally we define a spatial set: $N_i=\{\boldsymbol{s_j} :\boldsymbol{s_j}\text{ is a spatial neighbor of } \boldsymbol{s_i}\}$.  Note that $\boldsymbol{s_i}$ is normally a vector in $\mathbb{R}^2$, often times representing a fixed geographic region. For example a county in the state of Iowa may be given a single representative $\boldsymbol{s_i}$.

Now, letting $\mathcal{H}_{Z(\boldsymbol{s_i},t)}$ denote the history of the data process at location $\boldsymbol{s_i}$ up until time period $t$,  the SPINGARCH process, $\left[ Z(\boldsymbol{s_i},t)|Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)}\right] $ is conditionally Poisson and has a mass function of

\begin{align}
	& f(Z(\boldsymbol{s_i},t)|Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)})=\exp\left[A_i(Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)})Z(\boldsymbol{s_i},t)-B_i(Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i})})+C(Z(\boldsymbol{s_i},t))\right]\label{eq:example}\\
	& \exp \left[ A_i(Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)}) \right]=\exp \left[ Y(\boldsymbol{s_i},t) \right] + \eta Z(\boldsymbol{s_i},t-1) + \kappa E\left[Z(\boldsymbol{s_i},t-1)|Y(\boldsymbol{s_i},t-1),\mathcal{H}_{Z(\boldsymbol{s_i},t-1)}\right]\nonumber\\
	& B_i(Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)}) = \exp\left[A_i(Y(\boldsymbol{s_i},t),\mathcal{H}_{Z(\boldsymbol{s_i},t)})\right]\nonumber\\
	& C(Z(\boldsymbol{s_i},t)) = -\log\left[Z(\boldsymbol{s_i},t)!\right]	\nonumber,
\end{align}
where
\begin{align}
	& Y(\boldsymbol{s_i},t)|\boldsymbol{y_t}(N_i)\sim N(\mu(\boldsymbol{s_i},t),\sigma^2) \label{eq:Latent Dependency}\\
	& \mu(\boldsymbol{s_i},t) = \alpha(\boldsymbol{s_i})+ \zeta \sum_{s_j \in N_i} \{y(s_j,t)-\alpha(s_j)\} \nonumber.
\end{align}
The SPINGARCH structure consists of combining \eqref{eq:example}, which is Markovian in time and \eqref{eq:Latent Dependency} which is Markovian in space.  The observation $Z(s_i,t)$ is conditioned on the entire past and current latent process $Y(s_i,t)$ is modeled through the use of full conditional distribution in space at each point in time.  Markov assumptions reduce the conditioning to one time step and spatial neighbors.

  If we let $\boldsymbol{A}$ be the neighborhood matrix such that entry $(i,j)=1$ if $s_i$ and $s_j$ are neighbors and restrict $\zeta \in (\psi_{(1)}^{-1},\psi_{(n)}^{-1})$ where $\psi_{(k)}$ is the $k$th largest eigenvalue of $\boldsymbol{A}$, the resulting latent process model, at each time $t$, is a Conditional Auto-Regressive or CAR model used in \cite{cressie2015statistics}.

  Letting $\boldsymbol{\alpha}=(\alpha(\boldsymbol{s_1}),\cdots \alpha(\boldsymbol{s_{n_d}})$, the CAR model has joint distribution $\boldsymbol{Y_t}\sim \mbox{Gau}(\boldsymbol{\alpha},(I-\boldsymbol{C})^{-1}\boldsymbol{M})$ where $C$ is an $n \times n$ matrix with entries $\zeta$ if $s_j,t \in N(\boldsymbol{s_i})$ or $0$ otherwise. $\boldsymbol{M}$ is a diagonal matrix with entries $\sigma^2$. For notational convenience we define $\lambda(\boldsymbol{s_i},t)\equiv\exp \left[ A_i(.) \right]$.  We further take $\alpha(\boldsymbol{s_i})=\alpha$ for $i=1,\cdots n_d$, but these terms could be further modeled as, for example, functions of spatially varying covariates. To simplify notation, we let $\boldsymbol{\theta}$ represent the vector of parameters, $\sigma^2,\zeta$ and $\Sigma(\boldsymbol{\theta}) \equiv (I-\boldsymbol{C})^{-1}\boldsymbol{M}$ as the $n_{d} \times n_{d}$ latent covariance matrix.  We further write the covariance between location $\boldsymbol{s_i}$ and $\boldsymbol{s_j}$ as $\Sigma_{i,j}$ with the understanding that this value will depend on whether $s_j$ is in the neighborhood of $\boldsymbol{s_i}$ or not.  Similarly, we let all the diagonal elements of $\Sigma$ (generally assumed to be equal) be expressed as $\Sigma_{i,i}$.

This model extends the INGARCH(1,1) model given in \eqref{eq:INGARCH} by taking the leading term $d$ in that model to itself be a log Gaussian spatial process. We refer to model \eqref{eq:example} as the Spatially Correlated Integer Auto-Regressive Conditionally Heteroskedastic, or SPINGARCH, model.


\subsection{Model Motivation and Relationship to Mathematical Models of Crime}

Though the original motivation for the INGARCH(1,1) model was as a variance property for time series, e.g. \cite{ferland2006integer}, it also arises out of an exponentially decaying difference equation with self-excitation and is similar to models used in mathematical criminology.  Recall that the INGARCH(1,1) model assumes that $\lambda(\boldsymbol{s_i},t)=d+\kappa \lambda(\boldsymbol{s_i},t-1)+\eta Z(\boldsymbol{s_i},t-1)$.  Letting $\kappa=1-\chi$ this can be re-written as
\begin{equation}
	\frac{\lambda(\boldsymbol{s_i},t)-\lambda(\boldsymbol{s_i},t-1)}{\Delta t}=d-\chi \lambda(\boldsymbol{s_i},t-1)+\eta Z(\boldsymbol{s_i},t-1), \label{eq:INGARCHDDS}
\end{equation}
where $\Delta t$ is 1.  The difference equation given in \eqref{eq:INGARCHDDS} assumes that there is a natural exponential decay in the $\lambda$ process as well as self-excitation, or data model dependence, $\eta$.  The $\eta$ term captures the expected change  that is due to repeat or near-repeat actions, a characteristic of violence that has been shown to exist in the social science literature, see e.g. \cite{polvi1990repeat} and \cite{pease1998repeat}.  Furthermore, at each time period the process is increased by $d$, some exogenous, potentially spatially varying structure.  To account for covariates associated with large scale spatial structure, $d$ could then be parameterized as $d=\exp(\boldsymbol{X}^T \boldsymbol{\beta})$.

The INGARCH(1,1) model is equivalent to equation (2.4) in \cite{short2008statistical} where the authors formulated mathematical models for burglaries occurring on a lattice.  Here, the assumption was made that the the expected number of burglaries were a function of geographic specific measure of attractiveness, $d$, a natural exponential decay, $\chi$, and repeat victimization, $\eta$.  While this and other works in the mathematical criminology literature were concerned with the limiting differential equation as $\Delta t \to 0$, we are concerned with fitting aggregated data to the model which would result in fitting \eqref{eq:INGARCHDDS}.

Applying the INGARCH model to data on a spatial lattice, the underlying assumption is that all spatial variation is captured through $\boldsymbol{X^T}\boldsymbol{\beta}$. If we view the difference equation, \eqref{eq:INGARCHDDS}, as the process of scientific interest, the model assumes that the INGARCH process manifests itself the exact same way at every spatial location.  In particular, the INGARCH process assumes that any exogenous increase to the expected count at time $t$ is the exact same at every single point in space and time that share the same large scale spatial structure.   More realistically, though, there exist small scale variation between locations that cannot be captured through covariates.  If the INGARCH model was used to model the spatio-temporal evolution of violence or crime, this assumption would suggest that any geographic characteristics impacting the expected number of events are fully captured in $d$.

The SPINGARCH (1,1) model, on the other hand assumes a similar difference equation as \eqref{eq:INGARCHDDS} however it is now assumed that $d$ is a separate, latent, spatial process.  In particular, $d$ is assumed to follow a Conditional Auto-regressive, or CAR, process.  That is, we now assume there is a separate latent spatial process that describes the increase in the difference equation due to exogenous factors. The use of a CAR process here assumes that given its geographic neighbors, the impact of the exogenous factors at the spatial location is conditionally independent of the impact at all other locations.  The assumption in the INGARCH(1,1) described above is therefore relaxed to allow for slight variations in how geographic characteristics impact expected violence.  A natural question is what are the impacts of the model properties through relaxing this assumption and what differing roles do $\eta$ and $\kappa$ practically play in this model.  To answer this we first note that the SPINGARCH model can conveniently be written as
\begin{align}
	& Z(\boldsymbol{s_i},t)|\lambda(\boldsymbol{s_i},t) \sim \mbox{Pois}(\lambda(\boldsymbol{s_i},t)) \label{eq:timeseries2} \\
	& E[Z(\boldsymbol{s_i},t)|\lambda(\boldsymbol{s_i},t)]=\lambda(\boldsymbol{s_i},t)\nonumber\\
	& \boldsymbol{\lambda_t} = \exp(\boldsymbol{Y_t})+\eta \boldsymbol{Z_{t-1}}+\kappa \boldsymbol{\lambda_{t-1}}\nonumber\\
	& \boldsymbol{Y_t} \sim \mbox{Gau}\textit{} (\boldsymbol{\alpha},(I_{{n_d},{n_d}}-\boldsymbol{C})^{-1}\boldsymbol{M}),\nonumber
\end{align}
where $\boldsymbol{\lambda_t}=\left(\lambda(s_1,t),\lambda(s_2,t),\cdots,\lambda(s_{n_d},t)\right)^T$.  Here it is clear that $\boldsymbol{\lambda_t}$ is a Markov chain on state space $(\mathbb{R}^{+})^{n_d}$.  Viewing it this way will allow us to note the existence of a unique stationary distribution with finite moments.  The moments, in particular the second moments, will differentiate between the INGARCH(1,1) and the SPINGARCH(1,1).  Furthermore, as we will show, changing the variance to mean ratio in the INGARCH(1,1) process impacts the implied autocorrelation whereas the SPINGARCH(1,1) offers more flexibility in controlling the variance to mean ratio and the temporal correlation in the model.  The second moments will further differentiate between the SPINGARCH(1,0), that is $\eta=0$ and the SPINGARCH(0,1), $\kappa=0$, models.

This means that it matters if we assume that the previous observed violence impacts the current expected violence or we assume that the previous expected level of violence impacts the current expected violence.  Though the difference may seem nuanced, the choice of assumptions will result in different second order properties and as we will demonstrate in the simulation data simulated from the SPINGARCH(0,1) cannot be accurately fit to the SPINGARCH(1,0) model.


\section{Model Properties}

The INGARCH(1,1) model, when perturbed with a random sequence that has a density on the positive real line, is geometrically ergodic and subsequently converges to a unique stationary distribution as $T \to \infty$, as proved in \cite{fokianos2009poisson}.  A similar argument can be made for the SPINGARCH (1,1) process as $\boldsymbol{\lambda_t}=\left(\lambda(s_1,t),\cdots,\lambda(s_{n_d},t)\right)^T$ is a Markov chain on the state space $(\mathbb{R}^+)^{n_d}$ and is perturbed by the multivariate log-Gaussian density. Unlike the proof in \cite{fokianos2009poisson} further perturbation is not needed as the log Gaussian spatially correlated errors allow the chain to always have a positive probability for visiting any set of positive Lebesgue measure.

\begin{prop} \label{Prop 1}
	Under the parameter space restriction, $\eta,\kappa\geq0$ and $\eta+\kappa<1$, the SPINGARCH (1,1) is geometrically ergodic and admits a unique stationary distributions that has finite first two moments.
\end{prop}
A complete proof of Proposition 1 follows closely the development in \cite{fokianos2009poisson} relying on Markov chain theory and is given in Appendix A.  As a result of proposition 1, we can use the stationary distribution to derive first and second order properties for the SPINGARCH (1,1) model.  The second order properties will provide methods for differentiating between the INGARCH (1,1) and the SPINGARCH(1,1) as well as between the SPINGARCH(1,0) and the SPINGARCH(0,1) models.

\subsection{First Order Properties}
 To derive the expectation for data from the Self-Exciting Poisson CAR model, we first note that $\exp(\boldsymbol{Y})$ has a multivariate log-normal distribution.  Therefore, as the natural parameter is linked exponentially with the linear predictor, using properties of the Poisson distribution, we have
\begin{align}
	E\left[Z(\boldsymbol{s_i},t)\right] & = E\left[E\left[Z(\boldsymbol{s_i},t)|\lambda(\boldsymbol{s_i},t)\right]\right] \nonumber\\
	&= E\left[\lambda(\boldsymbol{s_i},t)\right]= E\left[\exp(Y(s_i,t))\right]+\eta E\left[Z(\boldsymbol{s_i},t-1)\right] +\kappa E\left[\lambda(\boldsymbol{s_i},t-1)\right] \nonumber\\
	& = \exp\left(\alpha+\frac{\Sigma_{1,1}}{2}\right)+\eta E\left[\lambda(\boldsymbol{s_i},t-1)\right] +\kappa E\left[\lambda(\boldsymbol{s_i},t-1)\right] \label{eq:Expectation},
\end{align}
which, at stationarity, yields, $E\left[Z(\boldsymbol{s_i},t)\right]=\frac{1}{1-\eta-\kappa}\exp(\alpha+\frac{\Sigma_{1,1}}{2})$.  The existence of self-excitation within the data model, or $\eta>0$, increases the marginal expectation for the data model in a manner similar to the INGARCH(1,1) model.

\subsection{Second Order Properties}
The SPINGARCH(1,1) model allows for flexible modeling of variances.  In particular, the variance to mean ratio can be manipulated without impacting the autocorrelation in time, which distinguishes it from an INGARCH(1,1) model.  In addition, spatial structure may be modeled through a combination of large-scale and small-scale spatial processes.
\subsubsection{Variance}
To see how the variance to mean ratio can be adjusted under the SPINGARCH(1,1) model we can first compute the marginal variance of $Z(\boldsymbol{s_i},t)$. To find this value we exploit the independence of $Y(\boldsymbol{s_i},t)$ and $Z(\boldsymbol{s_i},t-1)$ yielding

\begin{align}
	\mbox{Var}(Z(\boldsymbol{s_i},t+1))  =& \mbox{Var}(E(Z(\boldsymbol{s_i},t+1)|\lambda(\boldsymbol{s_i},t+1))+E(\mbox{Var}(Z(\boldsymbol{s_i},t+1)|\lambda(\boldsymbol{s_i},t+1))\\
	=& \mbox{Var}(\lambda(\boldsymbol{s_i},t+1))+ E(\lambda(\boldsymbol{s_i},t+1))\nonumber\\
	= & \kappa^2 \mbox{Var} (\lambda(\boldsymbol{s_i},t))+\eta^2 \mbox{Var}(Z(\boldsymbol{s_i},t))+2 \kappa \eta \mbox{Var }(\lambda(\boldsymbol{s_i},t))+\nonumber\\
	& \mbox{Var}(\exp(Y(\boldsymbol{s_i},t)))+E(Z(\boldsymbol{s_i},t))\\
	=&\kappa^2 \mbox{Var} (Z(\boldsymbol{s_i},t))+\eta^2 \mbox{Var}(Z(\boldsymbol{s_i},t))+2 \kappa \eta \mbox{Var }(Z(\boldsymbol{s_i},t))+\nonumber \nonumber\\
	& - \kappa^2 E(Z(\boldsymbol{s_i},t))-2 \kappa \eta E(Z(\boldsymbol{s_i},t))+\mbox{Var} (\exp(Y(\boldsymbol{s_i},t))).
\end{align}
Under the conditions in Proposition 1 we have second order temporal stationarity and subsequently,
\begin{equation}
	\mbox{Var }(Z(\boldsymbol{s_i},t))=\frac{1}{1-(\kappa+\eta)^2} \mbox{Var }(\exp(Y(\boldsymbol{s_i},t)))+\frac{1-\kappa^2-2\kappa \eta}{1-(\kappa+\eta)^2}E(Z(\boldsymbol{s_i},t)).
\end{equation}

Therefore, similar to the INGARCH(1,1) process, the SPINGARCH(1,1) process allows for the modeling of overdispersion.  Furthermore, as we will show below, overdispersion can be accounted for without impacting the range of possible autocorrelation.


\subsubsection{Temporal Covariance}

To see the impact of adjusting the mean to variance ratio on the temporal covariance we find the lag-one autocorrelation by relying on the second order stationarity implicit in Proposition 1.  As derived in Appendix B the autocovariance under the SPINGARCH(1,1) model is
\begin{equation}
	\mbox{Cov}(Z(\boldsymbol{s_i},t),Z(\boldsymbol{s_i},t+1))=\left(\eta+\kappa\right)\mbox{Var} Z(\boldsymbol{s_i},t)-\kappa E[Z(\boldsymbol{s_i},t)] \label{eq:Autocov}.
\end{equation}
In particular, if $\kappa=0$, i.e. SPINGARCH(0,1), the lag-one autocorrelation for the process is $\eta$ and in general, the lag-$h$ autocorrelation is $\eta^h$.

The significance of this is that it allows a great deal of flexibility in capturing second order properties of the data.  The SPINGARCH (0,1) process, for example, has a lag-one auto correlation of $\eta$, and a variance to mean ratio of $\frac{\mbox{Var} \left(\exp(Y(\boldsymbol{s_i},t))
	\right)}{(1-\eta) \mbox{E}\left[\exp(Y(\boldsymbol{s_i},t))\right]} + \frac{1}{1-\eta^2}$.  Therefore, through manipulating the $\frac{ \mbox{Var} \left(\exp(Y(\boldsymbol{s_i},t))\right)}{ \mbox{E}\left[\exp(Y(\boldsymbol{s_i},t))\right]}$ we can manipulate the variance to mean ratio without impacting the autocorrelation.


\subsubsection{Spatial Covariance and Correlation}
The SPINGARCH(1,1) model also allows for limited spatial correlation Recalling that $\Sigma_{i,j}$ is the marginal covariance between $Y(\boldsymbol{s_i},t)$ and $Y(\boldsymbol{s_j},t)$, the spatial covariance between $Z(\boldsymbol{s_i},t)$ and $Z(\boldsymbol{s_j},t)$ is

\begin{align}
	& \mbox{Cov}(Z(\boldsymbol{s_i},t),Z(\boldsymbol{s_j},t):\forall i \neq j) =  \frac{\exp(2\alpha)}{1-(\eta+\kappa)^2}\left[\exp(\Sigma_{i,i}+\Sigma_{i,j}) -\exp(\Sigma_{i,i})\right] \label{eq:SpatCov}.
\end{align}
A proof of this is is given in Appendix B.  From \eqref{eq:SpatCov} it is clear that the spatial covariance is zero if the marginal covariance between $Y(\boldsymbol{s_i},t)$ and $Y(\boldsymbol{s_j},t)$ is zero.  However, if there is a non-zero marginal covariance between the spatial locations, $\alpha,\eta$ and $\kappa$ influence the spatial correlation in the data. As $\Sigma_{i,j}$ can be either positive or negative, the spatial covariance, unlike the temporal covariance, can be either positive or negative.

The spatial correlation is therefore
\begin{align}
	& \mbox{Corr}(Z(\boldsymbol{s_i},t),Z(\boldsymbol{s_j},t))  = \frac{\exp(2\alpha)\left(\exp(\Sigma_{i,i}+\Sigma_{i,j}) -\exp(\Sigma_{i,i})\right)}{\mbox{Var} (\exp(Y(\boldsymbol{s_i},t)))+E[Z(\boldsymbol{s_i},t)]}\nonumber \\
	&=\mbox{Corr}(Z(\boldsymbol{s_i},t),Z(\boldsymbol{s_j},t))  = \frac{\left(\exp(\Sigma_{i,i}+\Sigma_{i,j}) -\exp(\Sigma_{i,i})\right)}{ \exp(2\Sigma_{i,i})-\exp(\Sigma_{i,i})+\exp(-\alpha+\frac{\Sigma_{i,i}}{2})\frac{1}{1-(\kappa+\eta)}}\label{eq:SpatCorr}.
\end{align}
The spatial correlation, as seen in \eqref{eq:SpatCorr}, only depends on $\eta$ and $\kappa$ through the expectation of $Z(\boldsymbol{s_i},t)$, however this is a potential limitation as this implies that the range of correlations depends on values of parameters other than the parameters of the CAR process.

\section{Bayesian Inference}\label{Sec:Bayes}
Bayesian analysis of the SPINGARCH model can be accomplished using off the shelf Markov Chain Monte Carlo software such as rStan introduced in \cite{carpenter2016stan} through application of the techniques suggested by \cite{joseph}.
Letting the prior distribution of $\pi(\boldsymbol{\theta})=\pi(\eta|\kappa)\pi(\kappa)\pi(\boldsymbol{\alpha})\pi(\sigma)\pi(\zeta)$., the full condition of $\boldsymbol{\boldsymbol{\theta}}=\left(\eta,\kappa,\alpha,\zeta,\sigma^2\right)^T$ is
\begin{align}
&\small\pi(\boldsymbol{\theta} | \boldsymbol{Z},\boldsymbol{Y})\propto  \prod_{t=1}^T\pi(\boldsymbol{Z}_t|\boldsymbol{\lambda}_t)\pi(\boldsymbol{\lambda}_t|\boldsymbol{\lambda}_{t-1},\boldsymbol{Z}_{t-1},\boldsymbol{\theta} ,\boldsymbol{Y}_t)\pi(\boldsymbol{Y}_t|\boldsymbol{\theta} )
\pi(\boldsymbol{\lambda_0}|\boldsymbol{\theta})\pi(\boldsymbol{Z_0}|\boldsymbol{\lambda_0})\pi(\boldsymbol{\theta})\label{eq:FullCondTheta},
\end{align}
and the full conditional of $\boldsymbol{Y}$ is
\begin{align}
&\small\pi(\boldsymbol{Y} | \boldsymbol{Z},\boldsymbol{\theta})\propto \prod_t\pi(\boldsymbol{Z}_t|\boldsymbol{\lambda}_t)\pi(\boldsymbol{\lambda}_t|\boldsymbol{\lambda}_{t-1},\boldsymbol{Z}_{t-1},\boldsymbol{\theta} ,\boldsymbol{Y}_t)\pi(\boldsymbol{Y}_t|\boldsymbol{\theta} )\pi(\boldsymbol{\lambda_0}|\boldsymbol{\theta})\pi(\boldsymbol{Z_0}|\boldsymbol{\lambda_0})
\end{align}
In general we assume independence in our priors except for $\eta$ and $\kappa$ due to the restriction that $\eta+\kappa <1$. Now note that in order to do any form of Markov Chain Monte Carlo inference we must sample from the density of the full latent state, $\boldsymbol{Y}$ which requires evaluations of
\begin{align}
\log(\boldsymbol{Y}|\boldsymbol{\alpha},\sigma,\zeta)) & \propto \frac{-t \times n_d}{2}\log(2 \pi) + \frac{1}{2} \log | \Sigma_f^{-1}(\boldsymbol{\theta})|\nonumber\\
& - \frac{1}{2}(Y-\alpha)^T\Sigma_f^{-1}(\boldsymbol{\theta})(Y-\alpha) \label{eq:log Y1},
\end{align}
where $\Sigma_f(\boldsymbol{\theta})$ is the full space-time covariance matrix $\left(I_{n_d \times T,n_d \times T}-I_{t,t}\otimes \boldsymbol{C}\right)^{-1}I_{t,t}\otimes \boldsymbol{M}$.   The sparsity of the covariance structure means that the only computations of $\frac{1}{2}(Y-\alpha)^T\Sigma_f^{-1}(\boldsymbol{\theta})(Y-\alpha)$ that need to occur are for spatial neighbors. Therefore, the most difficult part of the computation of the log-density is the computation of the determinant, $\log | \Sigma_f^{-1}(\boldsymbol{\theta})|$.  However, the complicated notation of the covariance structure belies the fact that the precision matrix is both block diagonal and extremely sparse that greatly simplify computations of \eqref{eq:log Y1}.  The specific structure for $\Sigma^{-1}(\boldsymbol{\theta})$ allows us to follow \cite{jin2005generalized}.  In particular, we have $\log|\Sigma^{-1}(\boldsymbol{\theta})|=\frac{n_d}{2\log\sigma^2}+\log|I_{n_d,n_d}-\zeta N|$ where $N$ is the neighborhood or adjacency matrix.  Letting $V \Lambda V^T$ be the spectral decomposition of $N$ we have $|I_{n_d,n_d}-\zeta N|=|V| |I_{n_d,n_d}-\zeta \Lambda| |V^T|=\prod_{j=1}^{n_d}\left(1-\zeta \chi_j\right)$ where $\chi_j$ are the eigenvalues of the neighborhood matrix.  Also, as $ \Sigma_f^{-1}(\boldsymbol{\theta})$ is block diagonal with each block being size $T \times T$ and having structure $\Sigma^{-1}$, it follows that $\log| \Sigma_f^{-1}(\boldsymbol{\theta})|=\frac{n_d \times T}{\log\sigma^2}+T\log| \Sigma^{-1}(\boldsymbol{\theta})|$

\begin{align}
	\log | \Sigma_f^{-1}(\boldsymbol{\theta})|&  = T \log | \Sigma^{-1}(\boldsymbol{\theta})|\\
	& \propto \frac{n_d \times T}{\log\sigma^2}+ T \sum_{j=1}^{n_d}(1-\zeta\chi_j) \label{eq:eig}
\end{align}

The greatest advantage of this approach is that the eigenvalues depend only on the neighborhood structure and do not depend on any parameters, therefore they can be computed ahead of time.  This means that we never need to deal with matrices of the size of  $\Sigma_f(\boldsymbol{\theta})$, rather we just need to find the eigenvalues for the neighborhood matrix.  This allows relatively quick fit for the fully Bayesian model using software such as Stan (\cite{carpenter2016stan}), which allows user defined log-densities and only requires proportional computations to the log-density.

To conduct model assessment under the above framework we rely on posterior predictive P values, see e.g. \cite{gelman1996posterior}.  This technique samples new data sets after sampling parameters from the posterior distribution.  Statistics are calculated on the new data and compared to the statistics from the original dataset.  For each data set we calculate $T_1(\boldsymbol{Z})$, the spatial Moran's I statistic, $T_2(\boldsymbol{Z})$, the log of the variance to mean ratio and $T_3(\boldsymbol{Z})$, the sample lag-1 auto-correlation.  We denote the percentage of times the new test statistic is greater than the statistic from the original data as $p_1$, $p_2$, and $p_3$.  Values that deviate significantly from .5 would suggest the fitted model can not accurately replicate the original data characteristic.

\section{Simulation}\label{Sec:Sim}

Here we demonstrate that the parameters from the SPINGARCH model can be recovered using the Bayesian inference methodology and that different models within the SPINGARCH class can practically be differentiated through their second order properties.  In order to demonstrate this we simulate from a SPINGARCH (0,1) and subsequently fit the simulated data to the SPINGARCH (0,1), the SPINGARCH (1,0) model, and the INGARCH(1,1) model.  We then simulate from the posterior distributions and calculate posterior predictive P-values for the mean to variance ratio, a spatial Moran's I statistic, and a lag-1 autocorrelation as described above.

The spatial domain we use in the simulations is a 1-D spatial domain wrapped on a cylinder, meaning that each spatial location had two neighbors.  This domain restricts $\zeta \in (-0.5,0.5)$ in order to ensure a joint density exists for $\boldsymbol{Y}$.  We use the following data generating process

\begin{align}
	& Z(\boldsymbol{s_i},t)  \sim\mbox{Pois}(\lambda(\boldsymbol{s_i},t))\label{eq:SEPoissonGen}\\
	& \lambda(\boldsymbol{s_i},t) =\exp \left[ Y(\boldsymbol{s_i},t) \right] + 0.66 Z(\boldsymbol{s_i},t-1)\nonumber\\
	& Y(\boldsymbol{s_i},t)|\boldsymbol{y_t}(N_i)  \sim \mbox{Gau}(\mu(s_i,t),0.5)\nonumber\\
	& \mu(\boldsymbol{s_i},t)  = 0+ 0.49 \sum_{\boldsymbol{s_j} \in N_i} \{y(\boldsymbol{s_j},t)\}\nonumber.
\end{align}

A depiction of one simulation from \eqref{eq:SEPoissonGen} is shown in figure \ref{fig:SEPoissonGen}.  The counts generated from \eqref{eq:SEPoissonGen} are overdispersed, temporally correlated, and spatially correlated  with a log variance to mean ratio of approximately 2.8, a lag-one correlation of 0.66, and a Moran's I statistic of 0.56.


\begin{figure}[!htp]
	\centering
	\includegraphics[width=0.5\linewidth, height=0.3\textheight]{SECARSim2}
	\isucaption[Simulated Data from SPINGARCH(1,1) mode]{Simulated Data from \eqref{eq:SEPoissonGen}.  The X axis is Time from 1-100, the Y axis is Space. Note that the spatial domain here is 2-D wrapped on a cylinder.  In the image horizontal streaking is indicative of high temporal autocorrelation while vertical streaking is indicative of spatial correlation. }\label{fig:SEPoissonGen}
\end{figure}

We fit each model using Hamiltonian Monte Carlo through the R package, Rstan of \cite{gelman2015stan} using vague proper priors for all parameters.  For each model 3 Markov Chains of 3000 iterations are used after which the chains exhibited no signs of non-convergence.  When using the inferential procedure outlined in Section \ref{Sec:Bayes}, the
95\% marginal credible intervals obtained from the data depicted in Figure \ref{fig:SEPoissonGen} are \ $\alpha \in (-0.24,0.1)$, $\sigma^2 \in (0.46,0.59)$, $\zeta \in (0.486,0.492)$, and $\eta \in (0.64,0.67)$ all clearly cover the generating parameters.  Posterior predictive P values, using 1000 random draws from each of the posterior densities, are given in Table \ref{tab:pval}.


\begin{table}[h]
	\isucaption[Posterior predictive checks for SPINGARCH(0,1) fit]{Posterior predictive P values from fitting the given models to data generated by a SPINGARCH(0,1) process.} \label{tab:pval}
	\begin{center}
		\begin{tabular}{ |c|c|c|c| }
			\hline
			& INGARCH(1,1) & SPINGARCH(1,0) & SPINGARCH(0,1)\\
			\hline
			$p_1$ - Moran's I Statistic& 0 & .05 & .46 \\
			$p_2$ - Var to Mean Ratio & 0 & .99 & .65 \\
			$p_3$ - Lag 1 Auto Correlation & 0 & .45 & .60\\
			\hline
		\end{tabular}
	\end{center}
\end{table}
The resultant posterior predictive P values, as shown in Table \ref{tab:pval}, demonstrate that clearly the SPINGARCH(0,1) model does the most adequate job of replicating the three statistics.  The reason that the SPINGARCH(1,0) model is able to accurately replicate the Lag 1 auto-correlation in the data is because of the high variance to mean ratio.  In particular, using the parameters in \eqref{eq:SEPoissonGen} the log variance to mean ratio is approximately 2.5.  Therefore, values of $\kappa$ that are near $\eta$ will be able to generate the same autocorrelation as shown in \eqref{eq:Autocov}.  However, the SPINGARCH(1,0) cannot capture the mean to variance ratio, nor the spatial correlation in the data.

In general, if we fit the SPINGARCH(1,0) to data that was generated from the SPINGARCH(0,1) model it is unable to replicate the second order properties.  The fit consistently overestimates the variance to mean ratio, potentially by a considerable amount and underestimates, on average, the Moran's I statistic.  As is expected the INGARCH(1,1) performs poorly on all three measures though we might have expected it to replicate one of the three measures, it fails to do so.  Practically, this means that violence or crime that arises from self-excitation only can be differentiated from violence that arises due to a natural decay in time by examining how well the model is able to replicate the second order statistics from the original dataset.


\section{Burglaries in Chicago}
As a case study we consider a statistical model for burglaries in the south side of Chicago during 2010-2015 using crime data from the city of Chicago. As Chicago is one of the most racially and socio-economically segregated cities in America, we consider only the southside, a relatively racial and socio-economic homogeneous region depicted in figure \ref{fig:SouthSide}.  We aggregated the number of burglaries by Census block group and by month.  Within the south side of Chicago there are 552 census block groups resulting in a spatial domain of $\boldsymbol{s_i} \in \{\boldsymbol{s_1},\cdots,\boldsymbol{s_{552}}\}$ and temporal domain of $t \in \{1,2,\cdots,72\}$.


\begin{figure}[!htp]
	\centering
	\includegraphics[width=.5\linewidth, height=0.3\textheight]{SouthSide}
	\isucaption[South Side of Chicago]{552 Census block groups in South Chicago.  This area of Chicago is relatively racially and economically homogeneous}\label{fig:SouthSide}
\end{figure}


While the racially homogeneity eliminates some sources of socio-economic variability in the data, it does not eliminate all of it.  To account for this, we further consider covariates that address unique socio-economic and population characteristics for each region. Unemployment, for example, has long been shown to have a relationship with crime, see e.g. \cite{britt1994crime} and \cite{raphael2001identifying}, the later showing property crime in particular has a strong relationship with unemployment.  All potential covariates were obtained from the U.S. Census Tiger data available at \href{https://www.census.gov/geo/maps-data/data/tiger-data.html}{https://www.census.gov/geo/maps-data/data/tiger-data.html}.  The maximum number of burglaries in a month in a census block for this subset of the city is 17.  The variance to mean ratio in the data is 1.8, suggesting there is some overdispersion in the data.  There is both temporal and spatial clustering as evident by the average lag-one autocorrelation, .32, and the Moran's I statistic of .20 using a four nearest neighbor structure for the weight matrix.  There is a clear seasonality trend in the data as well as a general downward trend from 2010-2015.  In order to account for this we preprocessed the data to remove the seasonality effect and the trend prior to estimating the impact of the spatial covariates and the process covariates.

One possible model that describes the spread of burglaries is similar to the model of \cite{short2008statistical}.  We might assume that the change in the rate of burglaries at a location is a function of a base attractiveness due to unique geographical features at that location, $\alpha_{\boldsymbol{s_i}}$, a natural decay over time, $\iota$, and repeat victimization, $\eta$.  These assumptions lead to the (stochastic) difference equation

\begin{equation}
	\frac{\lambda(\boldsymbol{s_i},t+1)-\lambda(\boldsymbol{s_i},t)}{\Delta t}=\alpha_{\boldsymbol{s_i}}-\iota \lambda(\boldsymbol{s_i},t)+\eta Z(\boldsymbol{s_i},t)\label{eq:DiscreteModel}.
\end{equation}
Upon assuming $\Delta t=1$, \eqref{eq:DiscreteModel} is clearly an INGARCH(1,1) model with $\kappa=(1-\iota)$.  The unique geographical features that can be viewed as exogenous factors that increase the expectation.  We assume they are structured as
\begin{equation}
	\alpha_{\boldsymbol{s_i}}=\exp\left(\beta_0+\beta_{pop} \log(\mbox{Pop}_{\boldsymbol{s_i}})+\beta_{ym}\mbox{Young Men}_{\boldsymbol{s_i}}+\beta_{wealth}\mbox{Wealth}_{\boldsymbol{s_i}}+\beta_{unemp}\mbox{Unemp}_{\boldsymbol{s_i}}\right) \label{eq:mean strucutre}.
\end{equation}

In comparison to this process, we also consider a SPINGARCH(1,1) process with the following structure,
\begin{align}
	& Z(\boldsymbol{s_i},t) \sim \mbox{Pois}(\lambda(\boldsymbol{s_i},t)) \label{eq:SPINGARCHCHI} \\
	& E[Z(\boldsymbol{s_i},t)]=\lambda(\boldsymbol{s_i},t)\nonumber\\
	& \boldsymbol{\lambda_t} = \exp(\boldsymbol{Y_t}+\boldsymbol{U})+\eta \boldsymbol{Z_{t-1}}+\kappa \boldsymbol{\lambda_{t-1}}\nonumber\\