# install.packages("haven")
# install.packages("reticulate")
Import Libraries
library(haven) # read .dta
source("../acro.R") # ACRO
INFO:acro:version: 0.4.0
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False
Load Data
data = read_dta("../data/test_data.dta")
data = as.data.frame(data)
data = zap_labels(data) # Stata data files include extra labels
head(data)
Tables
ACRO Crosstab
index = data[, c("year")]
columns = data[, c("grant_type")]
values = data[, c("inc_grants")]
aggfunc = "mean"
table = acro_crosstab(index, columns, values=values, aggfunc=aggfunc)
INFO:acro:get_summary(): fail; threshold: 6 cells may need suppressing; p-ratio: 1 cells may need suppressing; nk-rule: 1 cells may need suppressing;
INFO:acro:outcome_df:
col_0 G N R R/G
row_0
2010.0 ok ok ok threshold; p-ratio; nk-rule;
2011.0 ok ok ok threshold;
2012.0 ok ok ok threshold;
2013.0 ok ok ok threshold;
2014.0 ok ok ok threshold;
2015.0 ok ok ok threshold;
INFO:acro:records:add(): output_0
table
ACRO Pivot Table
index = "grant_type"
values = "inc_grants"
aggfunc = list("mean", "std")
table = acro_pivot_table(data, values=values, index=index, aggfunc=aggfunc)
INFO:acro:get_summary(): pass
INFO:acro:outcome_df:
mean std
inc_grants inc_grants
grant_type
G ok ok
N ok ok
R ok ok
R/G ok ok
INFO:acro:records:add(): output_1
table
Linear Models
# extract relevant columns
df = data[, c("inc_activity", "inc_grants", "inc_donations", "total_costs")]
# drop rows with missing values
df = df[complete.cases(df), ]
# formula to fit
formula = "inc_activity ~ inc_grants + inc_donations + total_costs"
Fit Linear Model
model = lm(formula=formula, data=df)
summary(model)
Call:
lm(formula = formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-74574494 -633828 -396927 -259424 277064227
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.994e+05 5.313e+05 0.752 0.452
inc_grants -8.856e-01 2.451e-02 -36.128 <2e-16 ***
inc_donations -6.659e-01 1.628e-02 -40.905 <2e-16 ***
total_costs 8.318e-01 1.054e-02 78.937 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 13990000 on 807 degrees of freedom
Multiple R-squared: 0.8943, Adjusted R-squared: 0.8939
F-statistic: 2276 on 3 and 807 DF, p-value: < 2.2e-16
ACRO Linear Model
acro_lm(formula=formula, data=df)
INFO:acro:olsr() outcome: pass; dof=807.0 >= 10
INFO:acro:records:add(): output_2
<class 'statsmodels.iolib.summary.Summary'>
"""
OLS Regression Results
==============================================================================
Dep. Variable: inc_activity R-squared: 0.894
Model: OLS Adj. R-squared: 0.894
Method: Least Squares F-statistic: 2276.
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.00
Time: 14:00:28 Log-Likelihood: -14493.
No. Observations: 811 AIC: 2.899e+04
Df Residuals: 807 BIC: 2.901e+04
Df Model: 3
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 3.994e+05 5.31e+05 0.752 0.452 -6.43e+05 1.44e+06
inc_grants -0.8856 0.025 -36.128 0.000 -0.934 -0.837
inc_donations -0.6659 0.016 -40.905 0.000 -0.698 -0.634
total_costs 0.8318 0.011 78.937 0.000 0.811 0.853
==============================================================================
Omnibus: 1348.637 Durbin-Watson: 1.424
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1298161.546
Skew: 10.026 Prob(JB): 0.00
Kurtosis: 197.973 Cond. No. 1.06e+08
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.06e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
"""
Logit/Probit Models
# extract relevant columns
df = data[, c("survivor", "inc_activity", "inc_grants", "inc_donations", "total_costs")]
# drop rows with missing values
df = df[complete.cases(df), ]
# convert survivor to numeric
df = transform(df, survivor = as.numeric(survivor))
# formula to fit
formula = "survivor ~ inc_activity + inc_grants + inc_donations + total_costs"
Fit Logit Model
model = glm(formula=formula, data=df, family=binomial(link="logit"))
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
Call:
glm(formula = formula, family = binomial(link = "logit"), data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.5701 -1.1953 0.0203 1.0948 1.3215
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.364e-02 9.091e-02 0.480 0.63123
inc_activity 5.790e-08 1.129e-07 0.513 0.60807
inc_grants -7.977e-08 1.045e-07 -0.764 0.44513
inc_donations 3.638e-07 1.312e-07 2.772 0.00557 **
total_costs 5.649e-08 1.002e-07 0.564 0.57273
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1019.00 on 810 degrees of freedom
Residual deviance: 800.92 on 806 degrees of freedom
AIC: 810.92
Number of Fisher Scoring iterations: 10
ACRO Logit Model
acro_glm(formula=formula, data=df, family="logit")
INFO:acro:logitr() outcome: pass; dof=806.0 >= 10
INFO:acro:records:add(): output_3
<class 'statsmodels.iolib.summary.Summary'>
"""
Logit Regression Results
==============================================================================
Dep. Variable: survivor No. Observations: 811
Model: Logit Df Residuals: 806
Method: MLE Df Model: 4
Date: Thu, 13 Jul 2023 Pseudo R-squ.: 0.2140
Time: 14:00:28 Log-Likelihood: -400.46
converged: True LL-Null: -509.50
Covariance Type: nonrobust LLR p-value: 4.862e-46
=================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 0.0436 0.091 0.480 0.631 -0.135 0.222
inc_activity 5.79e-08 1.13e-07 0.513 0.608 -1.63e-07 2.79e-07
inc_grants -7.977e-08 1.04e-07 -0.764 0.445 -2.85e-07 1.25e-07
inc_donations 3.638e-07 1.31e-07 2.772 0.006 1.07e-07 6.21e-07
total_costs 5.649e-08 1e-07 0.564 0.573 -1.4e-07 2.53e-07
=================================================================================
Possibly complete quasi-separation: A fraction 0.17 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
"""
Optimization terminated successfully.
Current function value: 0.493788
Iterations 12
Fit Probit Model
model = glm(formula=formula, data=df, family=binomial(link="probit"))
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
Call:
glm(formula = formula, family = binomial(link = "probit"), data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3475 -1.2076 0.0083 1.1017 1.2938
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.454e-02 5.628e-02 0.791 0.4288
inc_activity 4.114e-08 6.351e-08 0.648 0.5171
inc_grants -4.297e-08 5.884e-08 -0.730 0.4652
inc_donations 1.358e-07 6.392e-08 2.124 0.0337 *
total_costs 3.473e-08 5.657e-08 0.614 0.5393
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1019.00 on 810 degrees of freedom
Residual deviance: 806.49 on 806 degrees of freedom
AIC: 816.49
Number of Fisher Scoring iterations: 12
ACRO Probit Model
acro_glm(formula=formula, data=df, family="probit")
INFO:acro:probitr() outcome: pass; dof=806.0 >= 10
INFO:acro:records:add(): output_4
<class 'statsmodels.iolib.summary.Summary'>
"""
Probit Regression Results
==============================================================================
Dep. Variable: survivor No. Observations: 811
Model: Probit Df Residuals: 806
Method: MLE Df Model: 4
Date: Thu, 13 Jul 2023 Pseudo R-squ.: 0.2086
Time: 14:00:28 Log-Likelihood: -403.24
converged: True LL-Null: -509.50
Covariance Type: nonrobust LLR p-value: 7.648e-45
=================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 0.0445 0.056 0.791 0.429 -0.066 0.155
inc_activity 4.114e-08 6.54e-08 0.629 0.530 -8.71e-08 1.69e-07
inc_grants -4.297e-08 6.07e-08 -0.708 0.479 -1.62e-07 7.6e-08
inc_donations 1.357e-07 6.27e-08 2.163 0.031 1.28e-08 2.59e-07
total_costs 3.473e-08 5.84e-08 0.595 0.552 -7.96e-08 1.49e-07
=================================================================================
Possibly complete quasi-separation: A fraction 0.18 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
"""
Optimization terminated successfully.
Current function value: 0.497218
Iterations 10
Add Custom Output
acro_custom_output("XandY.jfif", "This output is an image showing the relationship between X and Y")
INFO:acro:records:add_custom(): output_5
Rename Output
acro_rename_output("output_5", "xy_plot")
INFO:acro:records:rename_output(): output_5 renamed to xy_plot
Remove Output
acro_remove_output("output_3")
INFO:acro:records:remove(): output_3 removed
Finalise
acro_finalise("RTEST", "json")
INFO:acro:records:
uid: output_0
status: fail
type: table
properties: {'method': 'crosstab'}
sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 6, 'p-ratio': 1, 'nk-rule': 1}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 3], [1, 3], [2, 3], [3, 3], [4, 3], [5, 3]], 'p-ratio': [[0, 3]], 'nk-rule': [[0, 3]]}}
command: crosstab()
summary: fail; threshold: 6 cells may need suppressing; p-ratio: 1 cells may need suppressing; nk-rule: 1 cells may need suppressing;
outcome: col_0 G N R R/G
row_0
2010.0 ok ok ok threshold; p-ratio; nk-rule;
2011.0 ok ok ok threshold;
2012.0 ok ok ok threshold;
2013.0 ok ok ok threshold;
2014.0 ok ok ok threshold;
2015.0 ok ok ok threshold;
output: [col_0 G N R R/G
row_0
2010.0 9.921906e+06 0.000000 8.402284e+06 11636000.0
2011.0 8.502247e+06 124013.862069 7.716880e+06 16047500.0
2012.0 1.145858e+07 131859.067797 6.958050e+06 16810000.0
2013.0 1.355715e+07 147937.796610 7.202274e+06 16765625.0
2014.0 1.374815e+07 133198.254237 8.277525e+06 17845750.0
2015.0 1.113343e+07 146572.189655 1.081289e+07 18278625.0]
timestamp: 2023-07-13T14:00:28.377038
comments: ['This is a crosstab on the nursery dataset.']
exception:
The status of the record above is: fail.
Please explain why an exception should be granted.
I really need this one.
INFO:acro:records:
uid: xy_plot
status: review
type: custom
properties: {}
sdc: {}
command: custom
summary: review
outcome: Empty DataFrame
Columns: []
Index: []
output: ['XandY.jfif']
timestamp: 2023-07-13T14:00:28.765323
comments: ['This output is an image showing the relationship between X and Y']
exception:
The status of the record above is: review.
Please explain why an exception should be granted.
It's just a plot of X and Y
INFO:acro:records:outputs written to: RTEST
<acro.record.Records object at 0x7fcf501b8d60>
LS0tCnRpdGxlOiAiQUNSTyBSIE5vdGVib29rIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7cn0KIyBpbnN0YWxsLnBhY2thZ2VzKCJoYXZlbiIpCiMgaW5zdGFsbC5wYWNrYWdlcygicmV0aWN1bGF0ZSIpCmBgYAoKIyMgSW1wb3J0IExpYnJhcmllcwoKYGBge3J9CmxpYnJhcnkoaGF2ZW4pICAgIyByZWFkIC5kdGEKc291cmNlKCIuLi9hY3JvLlIiKSAgIyBBQ1JPCmBgYAoKIyMgTG9hZCBEYXRhCgpgYGB7cn0KZGF0YSA9IHJlYWRfZHRhKCIuLi9kYXRhL3Rlc3RfZGF0YS5kdGEiKQpkYXRhID0gYXMuZGF0YS5mcmFtZShkYXRhKQpkYXRhID0gemFwX2xhYmVscyhkYXRhKSAgIyBTdGF0YSBkYXRhIGZpbGVzIGluY2x1ZGUgZXh0cmEgbGFiZWxzCgpoZWFkKGRhdGEpCmBgYAoKIyMgVGFibGVzCgojIyMgQUNSTyBDcm9zc3RhYgoKYGBge3J9CmluZGV4ID0gZGF0YVssIGMoInllYXIiKV0KY29sdW1ucyA9IGRhdGFbLCBjKCJncmFudF90eXBlIildCnZhbHVlcyA9IGRhdGFbLCBjKCJpbmNfZ3JhbnRzIildCmFnZ2Z1bmMgPSAibWVhbiIKCnRhYmxlID0gYWNyb19jcm9zc3RhYihpbmRleCwgY29sdW1ucywgdmFsdWVzPXZhbHVlcywgYWdnZnVuYz1hZ2dmdW5jKQpgYGAKCmBgYHtyfQp0YWJsZQpgYGAKCiMjIyBBZGQgQ29tbWVudHMgdG8gT3V0cHV0CgpgYGB7cn0KYWNyb19hZGRfY29tbWVudHMoIm91dHB1dF8wIiwgIlRoaXMgaXMgYSBjcm9zc3RhYiBvbiB0aGUgbnVyc2VyeSBkYXRhc2V0LiIpCmBgYAoKIyMjIEFDUk8gUGl2b3QgVGFibGUKCmBgYHtyfQppbmRleCA9ICJncmFudF90eXBlIgp2YWx1ZXMgPSAiaW5jX2dyYW50cyIKYWdnZnVuYyA9IGxpc3QoIm1lYW4iLCAic3RkIikKCnRhYmxlID0gYWNyb19waXZvdF90YWJsZShkYXRhLCB2YWx1ZXM9dmFsdWVzLCBpbmRleD1pbmRleCwgYWdnZnVuYz1hZ2dmdW5jKQpgYGAKCmBgYHtyfQp0YWJsZQpgYGAKCiMjIExpbmVhciBNb2RlbHMKCmBgYHtyfQojIGV4dHJhY3QgcmVsZXZhbnQgY29sdW1ucwpkZiA9IGRhdGFbLCBjKCJpbmNfYWN0aXZpdHkiLCAiaW5jX2dyYW50cyIsICJpbmNfZG9uYXRpb25zIiwgInRvdGFsX2Nvc3RzIildCiMgZHJvcCByb3dzIHdpdGggbWlzc2luZyB2YWx1ZXMKZGYgPSBkZltjb21wbGV0ZS5jYXNlcyhkZiksIF0KIyBmb3JtdWxhIHRvIGZpdApmb3JtdWxhID0gImluY19hY3Rpdml0eSB+IGluY19ncmFudHMgKyBpbmNfZG9uYXRpb25zICsgdG90YWxfY29zdHMiCmBgYAoKIyMjIEZpdCBMaW5lYXIgTW9kZWwKCmBgYHtyfQptb2RlbCA9IGxtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZikKc3VtbWFyeShtb2RlbCkKYGBgCgojIyMgQUNSTyBMaW5lYXIgTW9kZWwKCmBgYHtyfQphY3JvX2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZikKYGBgCgojIyBMb2dpdC9Qcm9iaXQgTW9kZWxzCgpgYGB7cn0KIyBleHRyYWN0IHJlbGV2YW50IGNvbHVtbnMKZGYgPSBkYXRhWywgYygic3Vydml2b3IiLCAiaW5jX2FjdGl2aXR5IiwgImluY19ncmFudHMiLCAiaW5jX2RvbmF0aW9ucyIsICJ0b3RhbF9jb3N0cyIpXQojIGRyb3Agcm93cyB3aXRoIG1pc3NpbmcgdmFsdWVzCmRmID0gZGZbY29tcGxldGUuY2FzZXMoZGYpLCBdCiMgY29udmVydCBzdXJ2aXZvciB0byBudW1lcmljCmRmID0gdHJhbnNmb3JtKGRmLCBzdXJ2aXZvciA9IGFzLm51bWVyaWMoc3Vydml2b3IpKQojIGZvcm11bGEgdG8gZml0CmZvcm11bGEgPSAic3Vydml2b3IgfiBpbmNfYWN0aXZpdHkgKyBpbmNfZ3JhbnRzICsgaW5jX2RvbmF0aW9ucyArIHRvdGFsX2Nvc3RzIgpgYGAKCiMjIyBGaXQgTG9naXQgTW9kZWwKCmBgYHtyfQptb2RlbCA9IGdsbShmb3JtdWxhPWZvcm11bGEsIGRhdGE9ZGYsIGZhbWlseT1iaW5vbWlhbChsaW5rPSJsb2dpdCIpKQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiMjIyBBQ1JPIExvZ2l0IE1vZGVsCgpgYGB7cn0KYWNyb19nbG0oZm9ybXVsYT1mb3JtdWxhLCBkYXRhPWRmLCBmYW1pbHk9ImxvZ2l0IikKYGBgCgojIyMgRml0IFByb2JpdCBNb2RlbAoKYGBge3J9Cm1vZGVsID0gZ2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZiwgZmFtaWx5PWJpbm9taWFsKGxpbms9InByb2JpdCIpKQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiMjIyBBQ1JPIFByb2JpdCBNb2RlbAoKYGBge3J9CmFjcm9fZ2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZiwgZmFtaWx5PSJwcm9iaXQiKQpgYGAKCiMjIyBBZGQgQ3VzdG9tIE91dHB1dAoKYGBge3J9CmFjcm9fY3VzdG9tX291dHB1dCgiWGFuZFkuamZpZiIsICJUaGlzIG91dHB1dCBpcyBhbiBpbWFnZSBzaG93aW5nIHRoZSByZWxhdGlvbnNoaXAgYmV0d2VlbiBYIGFuZCBZIikKYGBgCgojIyMgUmVuYW1lIE91dHB1dAoKYGBge3J9CmFjcm9fcmVuYW1lX291dHB1dCgib3V0cHV0XzUiLCAieHlfcGxvdCIpCmBgYAoKIyMjIFJlbW92ZSBPdXRwdXQKCmBgYHtyfQphY3JvX3JlbW92ZV9vdXRwdXQoIm91dHB1dF8zIikKYGBgCgojIyBGaW5hbGlzZQoKYGBge3J9CmFjcm9fZmluYWxpc2UoIlJURVNUIiwgImpzb24iKQpgYGAK