HDS5106/iid_null.stan at main · SAHagen/HDS5106 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
//Independent and Indentically Distributed Fay-Herroit Stan Model with no covariates

// The input data is a vector 'y' of length 'N'.
data{

  // dimensions of the data

  int<lower=0> C; //number of clusters C should be greater than 0
  int<lower=0> D; //number of districts for ghana

  //outcome
  array[C] int<lower=0> y; //positive cases per cluster greater or equal to 0, never negative
  array[C] int<lower=0> n; // children tested per cluster

  //luster to district mapping
  array[C] int<lower=1, upper=D> district; // which district does cluster c belong to

  // aggregation
  matrix[D, C] A;             // D x C matrix
  vector[D] district_counts;  // number of cluster per districts
}

// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters{
  //grand mean (overall mean)
  real mu;   //overall log-odds of prevalence in ghana

  // random effects
  vector[D] u;         // district random effects (deviations from the grand mean IID)
  vector[C] epsilon;    // cluster deviations

  //variance components
  real<lower=0> sigma_u; // how much district vary around the grand mean
  real<lower=0> sigma_v; // how much cluster vary within districts

}


// The model to be estimated. We model the output
// 'y' to be normally distributed with mean 'mu'
// and standard deviation 'sigma'.
model{
//Priors
mu ~ normal(0, 1);

//iid district effects - each district drawn from a normal N(0, 1)
// something like u_d <- rnorm(216, mean = 0, sd=1) in R
u ~ normal(0, 1);

// cluster overdisperions
epsilon ~ normal(0, 1);

// variance components
sigma_u ~ exponential(2);
sigma_v ~ exponential(2);

// linear predictors
vector[C] eta = mu + sigma_u * u[district] + sigma_v * epsilon;

//likelihood
y ~ binomial_logit(n, eta);
}

// generated quantities {
//
//   vector[C] pi_c = inv_logit( # pi_c are the cluster level fitted prevelance
//     mu + sigma_u * u[district] + sigma_v * epsilon
//   );
//
// # vector pi_ci is the model's fitted probability of malaria for each cluster — the posterior estimate of true prevalence in cluster cc
// #c after accounting for all parameters. (smootheed estimates)
//
//   vector[D] pi_d = (A * pi_c) ./ district_counts; # pi_d is a vector of district level prevelence aggregated from the above cclutser level prevalance
//
//
//   real pi_ghana = inv_logit(mu);
//
//   vector[C] log_lik;
//   for (c in 1:C) {
//     log_lik[c] = binomial_logit_lpmf(y[c] | n[c],
//                    mu + sigma_u * u[district[c]] + sigma_v * epsilon[c]);
//   }
// }


generated quantities {
  vector[C] pi_c = inv_logit(
    mu + sigma_u * u[district] + sigma_v * epsilon
  );

  // district prevalence — only for sampled districts
  vector[D] pi_d;
  for (d in 1:D) {
    if (district_counts[d] > 0) {
      // sampled district — average over clusters
      pi_d[d] = dot_product(A[d], pi_c) / district_counts[d];
    } else {
      // unsampled district — prior prediction only
      // grand mean + district random effect, no cluster data
      pi_d[d] = inv_logit(mu + sigma_u * u[d]);
    }
  }

  real pi_ghana = inv_logit(mu);

  vector[C] log_lik;
  for (c in 1:C) {
    log_lik[c] = binomial_logit_lpmf(y[c] | n[c],
                   mu + sigma_u * u[district[c]]
                   + sigma_v * epsilon[c]);
  }
}