################################################################
# Carl 10 Dec 2016
#
# Monte Carlo study of TOPALS and indirect standardization estimators
# on simulated mortality data for which the true log(mx) schedule is known
#
# - Uses the male log(mx) schedule for Brazil as the "truth"
#
# - Generates simulated small-area samples for different population sizes n,
# where n=# of people in each single-year age group. (For ex., if n=10 then
# there are (10 infants, 10 1-yr-olds, ..., 10 99 yr-olds exposed to the
# true mortality rates, and the observed numbers fo deaths are Poisson
# random variables Dx ~ 10*(true mx)
#
# - Fits TOPALS and IS models for each sample using
# (1) the HMD male average logmx values as the standard
# (2) the true schedule as the standard
#
# - Calculates errors abs(estimated logmx - true logmx) for each (sample, method, std, age)
# - Calculates errors abs(estimated e0 - true e0) for each (sample, method, std)
#
# - Summarizes errors by (sample size, method)
################################################################
# HOUSEKEEPING
graphics.off()
if (.Platform$OS.type == 'windows') windows(width=12, height=8, record=TRUE)
rm(list=ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(splines)
set.seed(6447100)
# GET BRAZILIAN DATA
### Observed deaths and populations for UF, Meso, Micro and Municipalities in Brazil in 2010
### (deaths are totals over 2009-2011)
big.df = read.csv("../big2010.df.csv",skip=10, header=T, sep=",", dec=".", stringsAsFactors = FALSE)
### municipio-level geographic data (names and codes for state, meso, micro, etc.)
muni.df = read.csv("../muni.df.csv", header=T, sep=",", dec=".", stringsAsFactors = FALSE)
### build linear B-spline matrix (100 x 7)
age = 0:99
B = bs( age, knots=c(0,1,10,20,40,70), degree=1)
nalpha = ncol(B)
### calculate a national-level male logmx schedule for Brazil
BR <- left_join(big.df, muni.df, by='munid') %>%
filter(sex=='m') %>%
select(munid, municode, muniname, ufabb, sex,age,death,expos) %>%
group_by(sex, age) %>%
summarise(expos=sum(expos), death=sum(death)) %>%
mutate(mx=death/expos, logmx=log(mx))
head(BR)
## Source: local data frame [6 x 6]
## Groups: sex [1]
##
## sex age expos death mx logmx
## <chr> <int> <dbl> <int> <dbl> <dbl>
## 1 m 0 4133216 67803 0.0164044189 -4.110205
## 2 m 1 4142143 4921 0.0011880323 -6.735457
## 3 m 2 4189374 2809 0.0006705060 -7.307478
## 4 m 3 4280676 2040 0.0004765602 -7.648917
## 5 m 4 4376254 1726 0.0003944013 -7.838142
## 6 m 5 4432177 1554 0.0003506178 -7.955814
plot(0:99, BR$logmx, xlab='Age', ylab='log(mx)',pch=15, cex=.75,
main='True Schedule for Simulations\nLog Mortality, Brazil Males 2009-2011')
### standard rates by sex from HMD dataframe (calculated elsewhere)
HMDstd = dget('../HMDstd.dput') %>%
filter(age %in% 0:99)
# SIMULATED DATA: deaths by age (100), sample# (5000), sample size (3)
sample.sizes = c(1,100,10000)
nsamp = 5000
D = array(NA, c(100, nsamp, length(sample.sizes)),
dimnames=list(0:99,paste0('samp#',seq(nsamp)),paste(sample.sizes)))
for (n in sample.sizes) {
D[,,paste(n)] = sapply(1:nsamp, function(x) rpois(100, n*exp(BR$logmx)) )
}
## display a bit of the simulated death data (first 5 samples at each sample size)
D[,1:5,]
## , , 1
##
## samp#1 samp#2 samp#3 samp#4 samp#5
## 0 0 0 0 1 0
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 0 0 0 0 0
## 10 0 0 0 0 0
## 11 0 0 0 0 0
## 12 0 0 0 0 0
## 13 0 0 0 0 0
## 14 0 0 0 0 0
## 15 0 0 0 0 0
## 16 0 0 0 0 0
## 17 0 0 0 0 0
## 18 0 0 0 0 0
## 19 0 0 0 0 0
## 20 0 0 0 0 0
## 21 0 0 0 0 0
## 22 0 0 0 0 0
## 23 0 0 0 0 0
## 24 0 0 0 0 0
## 25 0 0 0 0 0
## 26 0 0 0 0 0
## 27 0 0 0 0 0
## 28 0 0 0 0 0
## 29 0 0 0 0 0
## 30 0 0 0 0 0
## 31 0 0 0 0 0
## 32 0 0 0 0 0
## 33 0 0 0 0 0
## 34 0 0 0 0 0
## 35 0 0 0 0 0
## 36 0 0 0 0 0
## 37 0 0 0 0 0
## 38 0 0 0 0 0
## 39 0 0 0 0 0
## 40 0 0 0 0 0
## 41 0 0 0 0 0
## 42 0 0 0 0 0
## 43 0 0 0 0 0
## 44 0 0 0 0 0
## 45 0 1 0 0 0
## 46 0 0 0 0 0
## 47 0 0 0 0 0
## 48 0 0 0 0 0
## 49 0 0 0 0 0
## 50 0 0 0 0 0
## 51 0 0 0 0 0
## 52 0 0 0 0 0
## 53 0 0 0 0 0
## 54 0 0 0 0 0
## 55 0 0 0 0 0
## 56 0 0 0 0 0
## 57 0 0 0 0 0
## 58 0 0 0 0 0
## 59 0 0 0 0 0
## 60 0 0 0 0 0
## 61 0 0 0 0 0
## 62 0 0 0 0 0
## 63 0 0 0 0 0
## 64 0 0 0 0 0
## 65 0 0 0 0 0
## 66 0 0 0 0 0
## 67 0 0 0 0 1
## 68 0 0 0 0 0
## 69 0 0 0 0 0
## 70 0 0 0 0 0
## 71 1 0 0 0 0
## 72 0 0 0 0 0
## 73 0 0 0 0 0
## 74 1 0 0 0 0
## 75 0 0 0 0 0
## 76 0 1 0 0 0
## 77 0 0 0 1 0
## 78 0 0 0 0 1
## 79 0 1 0 0 1
## 80 0 0 0 0 0
## 81 0 0 0 0 0
## 82 0 0 0 0 1
## 83 1 0 0 0 0
## 84 0 0 0 1 2
## 85 1 0 0 0 1
## 86 0 0 0 1 0
## 87 0 1 0 1 0
## 88 0 1 0 1 0
## 89 0 1 0 0 0
## 90 0 0 0 0 0
## 91 0 1 0 0 0
## 92 0 1 0 1 0
## 93 1 1 0 0 0
## 94 0 0 0 0 0
## 95 0 1 0 0 0
## 96 1 0 0 0 0
## 97 0 1 2 0 0
## 98 1 0 0 0 1
## 99 0 0 0 1 0
##
## , , 100
##
## samp#1 samp#2 samp#3 samp#4 samp#5
## 0 3 2 0 1 5
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 0 0 0 0 0
## 10 0 0 0 0 0
## 11 0 0 0 0 0
## 12 0 0 0 0 0
## 13 0 0 0 0 0
## 14 0 0 0 0 0
## 15 0 0 0 0 0
## 16 0 0 0 0 0
## 17 0 1 0 0 0
## 18 0 1 0 0 0
## 19 0 0 0 0 0
## 20 0 1 0 0 0
## 21 0 0 0 0 0
## 22 0 0 0 0 0
## 23 0 0 2 0 1
## 24 0 1 0 0 0
## 25 0 1 0 0 0
## 26 0 0 1 0 1
## 27 0 0 0 1 0
## 28 1 0 0 0 0
## 29 0 0 0 1 0
## 30 0 0 0 0 2
## 31 0 0 0 0 0
## 32 0 0 0 0 0
## 33 0 0 0 0 0
## 34 0 2 0 0 0
## 35 0 0 1 0 0
## 36 2 0 0 1 0
## 37 0 0 0 1 0
## 38 1 1 0 0 0
## 39 1 1 1 0 0
## 40 0 2 1 0 0
## 41 0 2 0 2 0
## 42 0 0 0 1 0
## 43 2 0 0 0 0
## 44 1 0 2 2 0
## 45 0 0 1 0 0
## 46 0 0 0 0 0
## 47 1 1 0 1 0
## 48 0 2 1 0 1
## 49 3 1 1 0 2
## 50 1 0 2 0 0
## 51 1 1 1 2 0
## 52 0 0 2 1 0
## 53 1 0 1 1 1
## 54 3 4 1 1 2
## 55 0 2 2 0 2
## 56 2 1 1 2 4
## 57 2 0 2 5 1
## 58 1 0 2 4 1
## 59 4 2 2 1 1
## 60 2 1 4 1 0
## 61 2 1 0 4 3
## 62 3 0 2 1 0
## 63 2 2 2 1 1
## 64 1 2 3 3 3
## 65 2 0 1 1 4
## 66 2 2 5 2 4
## 67 3 3 1 1 2
## 68 3 3 2 4 2
## 69 2 2 2 5 2
## 70 4 3 4 2 1
## 71 4 4 6 4 6
## 72 5 5 2 10 3
## 73 1 7 4 7 3
## 74 6 5 4 8 10
## 75 6 7 7 2 7
## 76 6 5 4 6 6
## 77 6 6 8 4 6
## 78 4 5 7 5 4
## 79 5 8 5 6 4
## 80 8 5 8 7 6
## 81 8 4 3 9 7
## 82 12 12 6 10 3
## 83 7 7 12 11 6
## 84 5 12 7 12 11
## 85 7 15 15 5 15
## 86 22 9 4 7 14
## 87 17 10 11 9 16
## 88 13 9 15 19 12
## 89 14 14 12 14 16
## 90 16 15 13 18 16
## 91 15 14 18 16 14
## 92 22 20 19 16 18
## 93 20 18 30 23 22
## 94 26 16 13 13 26
## 95 17 17 27 16 26
## 96 18 28 22 25 23
## 97 27 17 18 14 22
## 98 31 33 18 31 22
## 99 31 22 27 19 23
##
## , , 10000
##
## samp#1 samp#2 samp#3 samp#4 samp#5
## 0 158 170 151 156 172
## 1 14 4 12 14 10
## 2 10 8 10 7 8
## 3 2 5 5 3 3
## 4 6 4 3 5 8
## 5 3 5 4 3 5
## 6 2 2 0 3 4
## 7 2 2 3 3 0
## 8 1 3 4 1 4
## 9 3 5 2 1 2
## 10 3 3 1 3 3
## 11 2 5 2 2 3
## 12 5 5 4 5 4
## 13 9 5 6 8 5
## 14 5 9 6 12 9
## 15 12 5 9 12 8
## 16 17 17 14 15 8
## 17 15 20 29 21 19
## 18 24 30 24 22 19
## 19 24 20 31 14 17
## 20 24 27 26 24 25
## 21 17 35 34 19 30
## 22 37 22 28 30 17
## 23 24 21 29 16 32
## 24 20 24 28 35 23
## 25 24 20 30 33 22
## 26 20 20 21 26 24
## 27 21 24 23 28 23
## 28 26 31 23 30 17
## 29 31 29 30 27 25
## 30 29 30 30 35 18
## 31 29 27 29 30 20
## 32 39 27 23 26 38
## 33 34 27 24 19 32
## 34 34 29 32 31 22
## 35 38 36 30 28 28
## 36 31 44 32 30 34
## 37 36 48 31 37 28
## 38 28 31 42 25 34
## 39 28 21 37 30 34
## 40 35 45 30 32 38
## 41 35 42 57 43 27
## 42 54 48 52 42 38
## 43 51 59 39 58 33
## 44 50 45 39 54 41
## 45 58 42 66 49 63
## 46 51 62 40 60 65
## 47 44 57 64 77 70
## 48 67 39 58 64 75
## 49 65 61 57 73 65
## 50 73 57 71 84 77
## 51 84 83 71 72 74
## 52 73 82 85 91 82
## 53 101 112 102 109 103
## 54 87 96 99 100 103
## 55 95 120 96 115 109
## 56 130 105 113 128 100
## 57 116 104 129 105 150
## 58 130 133 131 137 145
## 59 152 153 128 148 131
## 60 144 172 141 146 133
## 61 184 163 168 171 161
## 62 186 172 182 169 176
## 63 172 173 198 204 192
## 64 188 189 195 183 173
## 65 220 203 221 186 214
## 66 252 212 214 237 258
## 67 274 250 254 260 243
## 68 272 283 236 324 308
## 69 276 311 278 295 296
## 70 339 356 308 280 327
## 71 384 338 388 419 321
## 72 386 389 387 395 362
## 73 424 395 405 378 416
## 74 437 466 445 411 421
## 75 467 433 515 511 471
## 76 530 573 503 510 499
## 77 553 565 572 597 554
## 78 634 584 665 634 655
## 79 668 700 683 729 651
## 80 728 779 747 739 730
## 81 799 794 820 787 828
## 82 883 833 926 919 921
## 83 913 918 954 975 935
## 84 976 1034 1016 967 1054
## 85 1084 1101 1074 1125 1150
## 86 1139 1235 1257 1214 1212
## 87 1335 1360 1230 1285 1322
## 88 1404 1414 1377 1362 1407
## 89 1412 1437 1441 1470 1422
## 90 1553 1505 1633 1505 1534
## 91 1773 1690 1663 1657 1651
## 92 1915 1900 1829 1920 1850
## 93 1948 2029 2051 2027 2016
## 94 2226 2187 2179 2140 2135
## 95 2149 2094 2188 2232 2171
## 96 2300 2273 2372 2369 2274
## 97 2228 2307 2315 2349 2331
## 98 2417 2408 2460 2327 2416
## 99 2558 2428 2584 2553 2525
# IS and TOPALS fits for logmx
logmx.hat = array(NA, c(100,3,nsamp,2,2),
dimnames=list(0:99,paste(sample.sizes),paste0('samp#',1:nsamp), c('TOPALS','IS'),c('BR','HMD')))
# calculate IS estimates of logmx
# for a column of observed deaths D0...D99, sample size n, and standard schedule
IS.fit = function(Dx, n, std) {
expected.deaths = sum(n * exp(std))
offset = log( sum(Dx) / expected.deaths)
return( std + offset)
}
for (n in sample.sizes) {
logmx.hat[,paste(n),,'IS','BR'] = apply(D[,,paste(n)],2, IS.fit, n=n, std=BR$logmx)
logmx.hat[,paste(n),,'IS','HMD'] = apply(D[,,paste(n)],2, IS.fit, n=n, std=HMDstd[,'male'])
}
# calculate TOPALS estimates of logmx
# for a column of observed deaths D0...D99, sample size n, and standard schedule
## log likelihood function L(alpha | D,N)
penalized.logL = function( alpha, D, N, std, Basis=B, k=1) {
logmx.hat = std + (Basis %*% alpha)
penalty = k * sum(diff(alpha)^2)
return( sum( -N * exp(logmx.hat) + D * logmx.hat) - penalty)
}
TOPALS.fit = function(Dx, n, std) {
# control parameters and initial values for nonlinear fit
this.control = list(maxit=1000,fnscale= -1, parscale=rep(.01,ncol(B)))
alpha0 = rep(0, ncol(B))
fit = optim( alpha0, penalized.logL, D=Dx, N=rep(n,100), std=std, Basis=B,
method='BFGS', control=this.control)
converged = (fit$convergence == 0)
if (converged) { result = std + B %*% fit$par }
if (!converged) { result = std + NA}
return( result)
}
for (n in sample.sizes) {
logmx.hat[,paste(n),,'TOPALS','BR'] = apply(D[,,paste(n)],2, TOPALS.fit, n=n, std=BR$logmx)
logmx.hat[,paste(n),,'TOPALS','HMD'] = apply(D[,,paste(n)],2, TOPALS.fit, n=n, std=HMDstd[,'male'])
}
## change any bad data (caused by samples with a TOTAL of zero deaths) to missing
logmx.hat[ !is.finite(logmx.hat)] = NA
##########################################################################
## display some randomly selected samples and fits (n=100)
op = par(no.readonly = TRUE) # save graphical settings
par(mfrow=c(1,2))
sel = sort( sample(nsamp, 15) )
for (i in sel) {
for (method in c('IS','TOPALS')) {
plot(0:99, BR$logmx, type='p', xlab='Age', ylab='log(mx)', pch=15,cex=.75,
main=paste('Sample',i,'with n=100 people per single-year age\n',
method,'(green solid=correct std, red dashed=wrong std)'),
cex.main = .80, ylim=c(-10,0) )
points(0:99, log( D[,i,'100'] / 100) )
lines(0:99, logmx.hat[,'100',i,method,'BR'], col='forestgreen', lwd=3)
lines(0:99, logmx.hat[,'100',i,method,'HMD'], col='red', lwd=3,lty=2)
zero.ages = (0:99)[ which(D[,i,'100'] == 0) ]
rug( zero.ages)
} # for method
} # for i
par(op)
##########################################################################
# ERROR SUMMARIES
##########################################################################
## calculate and print the mean abs(logmx.hat - logmx) for each
## (sample size, method, std)
logmx.errors= apply( abs( logmx.hat - BR$logmx), c(2,4,5), mean, na.rm=TRUE)
round(logmx.errors, 3)
## , , BR
##
## TOPALS IS
## 1 0.417 0.415
## 100 0.142 0.040
## 10000 0.028 0.004
##
## , , HMD
##
## TOPALS IS
## 1 0.692 0.733
## 100 0.187 0.539
## 10000 0.057 0.538
## calculate and print the mean abs(e0.hat - e0) for each
## (sample size, method, std)
## log mortality schedule -> life table {dx} column
dx = function(lambda) {
mx = exp(lambda)
Hx = cumsum(mx)
lx = c(1, exp(-Hx))
tmp = -diff(lx)
return(tmp/sum(tmp))
}
# ## life table e0
e0 = function(lambda) {
this.dx = dx(lambda)
## approx ages at death for those dying in each interval
death.age = 0:99+.5
death.age[1] = 0.10 # infants
return (sum( death.age * this.dx))
}
# ## true life expectancy used in all simulations
e0.BR = e0(BR$logmx)
e0.BR
## [1] 70.71041
e0.hat = apply(logmx.hat, 2:5, e0)
e0.errors= apply( abs( e0.hat - e0.BR), c(1,3,4), mean, na.rm=TRUE)
round(e0.errors, 2)
## , , BR
##
## TOPALS IS
## 1 4.94 4.79
## 100 1.29 0.52
## 10000 0.15 0.05
##
## , , HMD
##
## TOPALS IS
## 1 6.10 6.31
## 100 1.43 5.54
## 10000 0.15 5.54