ACRO R Notebook

# install.packages("haven")
# install.packages("reticulate")

Import Libraries

library(haven)   # read .dta
source("../acro.R")  # ACRO

INFO:acro:version: 0.4.0
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False

Load Data

data = read_dta("../data/test_data.dta")
data = as.data.frame(data)
data = zap_labels(data)  # Stata data files include extra labels

head(data)

Tables

ACRO Crosstab

index = data[, c("year")]
columns = data[, c("grant_type")]
values = data[, c("inc_grants")]
aggfunc = "mean"

table = acro_crosstab(index, columns, values=values, aggfunc=aggfunc)

INFO:acro:get_summary(): fail; threshold: 6 cells may need suppressing; p-ratio: 1 cells may need suppressing; nk-rule: 1 cells may need suppressing;
INFO:acro:outcome_df:
col_0    G   N   R                            R/G
row_0
2010.0  ok  ok  ok  threshold; p-ratio; nk-rule;
2011.0  ok  ok  ok                    threshold;
2012.0  ok  ok  ok                    threshold;
2013.0  ok  ok  ok                    threshold;
2014.0  ok  ok  ok                    threshold;
2015.0  ok  ok  ok                    threshold;
INFO:acro:records:add(): output_0

table

Add Comments to Output

acro_add_comments("output_0", "This is a crosstab on the nursery dataset.")

INFO:acro:records:a comment was added to output_0

ACRO Pivot Table

index = "grant_type"
values = "inc_grants"
aggfunc = list("mean", "std")

table = acro_pivot_table(data, values=values, index=index, aggfunc=aggfunc)

INFO:acro:get_summary(): pass
INFO:acro:outcome_df:
                 mean        std
           inc_grants inc_grants
grant_type
G                  ok         ok
N                  ok         ok
R                  ok         ok
R/G                ok         ok
INFO:acro:records:add(): output_1

table

Linear Models

# extract relevant columns
df = data[, c("inc_activity", "inc_grants", "inc_donations", "total_costs")]
# drop rows with missing values
df = df[complete.cases(df), ]
# formula to fit
formula = "inc_activity ~ inc_grants + inc_donations + total_costs"

Fit Linear Model

model = lm(formula=formula, data=df)
summary(model)


Call:
lm(formula = formula, data = df)

Residuals:
      Min        1Q    Median        3Q       Max
-74574494   -633828   -396927   -259424 277064227

Coefficients:
                Estimate Std. Error t value Pr(>|t|)
(Intercept)    3.994e+05  5.313e+05   0.752    0.452
inc_grants    -8.856e-01  2.451e-02 -36.128   <2e-16 ***
inc_donations -6.659e-01  1.628e-02 -40.905   <2e-16 ***
total_costs    8.318e-01  1.054e-02  78.937   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 13990000 on 807 degrees of freedom
Multiple R-squared:  0.8943,    Adjusted R-squared:  0.8939
F-statistic:  2276 on 3 and 807 DF,  p-value: < 2.2e-16

ACRO Linear Model

acro_lm(formula=formula, data=df)

INFO:acro:olsr() outcome: pass; dof=807.0 >= 10
INFO:acro:records:add(): output_2

<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results
==============================================================================
Dep. Variable:           inc_activity   R-squared:                       0.894
Model:                            OLS   Adj. R-squared:                  0.894
Method:                 Least Squares   F-statistic:                     2276.
Date:                Thu, 13 Jul 2023   Prob (F-statistic):               0.00
Time:                        14:00:28   Log-Likelihood:                -14493.
No. Observations:                 811   AIC:                         2.899e+04
Df Residuals:                     807   BIC:                         2.901e+04
Df Model:                           3
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      3.994e+05   5.31e+05      0.752      0.452   -6.43e+05    1.44e+06
inc_grants       -0.8856      0.025    -36.128      0.000      -0.934      -0.837
inc_donations    -0.6659      0.016    -40.905      0.000      -0.698      -0.634
total_costs       0.8318      0.011     78.937      0.000       0.811       0.853
==============================================================================
Omnibus:                     1348.637   Durbin-Watson:                   1.424
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1298161.546
Skew:                          10.026   Prob(JB):                         0.00
Kurtosis:                     197.973   Cond. No.                     1.06e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.06e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
"""

Logit/Probit Models

# extract relevant columns
df = data[, c("survivor", "inc_activity", "inc_grants", "inc_donations", "total_costs")]
# drop rows with missing values
df = df[complete.cases(df), ]
# convert survivor to numeric
df = transform(df, survivor = as.numeric(survivor))
# formula to fit
formula = "survivor ~ inc_activity + inc_grants + inc_donations + total_costs"

Fit Logit Model

model = glm(formula=formula, data=df, family=binomial(link="logit"))

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)


Call:
glm(formula = formula, family = binomial(link = "logit"), data = df)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-3.5701  -1.1953   0.0203   1.0948   1.3215

Coefficients:
                Estimate Std. Error z value Pr(>|z|)
(Intercept)    4.364e-02  9.091e-02   0.480  0.63123
inc_activity   5.790e-08  1.129e-07   0.513  0.60807
inc_grants    -7.977e-08  1.045e-07  -0.764  0.44513
inc_donations  3.638e-07  1.312e-07   2.772  0.00557 **
total_costs    5.649e-08  1.002e-07   0.564  0.57273
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1019.00  on 810  degrees of freedom
Residual deviance:  800.92  on 806  degrees of freedom
AIC: 810.92

Number of Fisher Scoring iterations: 10

ACRO Logit Model

acro_glm(formula=formula, data=df, family="logit")

INFO:acro:logitr() outcome: pass; dof=806.0 >= 10
INFO:acro:records:add(): output_3

<class 'statsmodels.iolib.summary.Summary'>
"""
                           Logit Regression Results
==============================================================================
Dep. Variable:               survivor   No. Observations:                  811
Model:                          Logit   Df Residuals:                      806
Method:                           MLE   Df Model:                            4
Date:                Thu, 13 Jul 2023   Pseudo R-squ.:                  0.2140
Time:                        14:00:28   Log-Likelihood:                -400.46
converged:                       True   LL-Null:                       -509.50
Covariance Type:            nonrobust   LLR p-value:                 4.862e-46
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0436      0.091      0.480      0.631      -0.135       0.222
inc_activity    5.79e-08   1.13e-07      0.513      0.608   -1.63e-07    2.79e-07
inc_grants    -7.977e-08   1.04e-07     -0.764      0.445   -2.85e-07    1.25e-07
inc_donations  3.638e-07   1.31e-07      2.772      0.006    1.07e-07    6.21e-07
total_costs    5.649e-08      1e-07      0.564      0.573    -1.4e-07    2.53e-07
=================================================================================

Possibly complete quasi-separation: A fraction 0.17 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
"""
Optimization terminated successfully.
         Current function value: 0.493788
         Iterations 12

Fit Probit Model

model = glm(formula=formula, data=df, family=binomial(link="probit"))

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)


Call:
glm(formula = formula, family = binomial(link = "probit"), data = df)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-3.3475  -1.2076   0.0083   1.1017   1.2938

Coefficients:
                Estimate Std. Error z value Pr(>|z|)
(Intercept)    4.454e-02  5.628e-02   0.791   0.4288
inc_activity   4.114e-08  6.351e-08   0.648   0.5171
inc_grants    -4.297e-08  5.884e-08  -0.730   0.4652
inc_donations  1.358e-07  6.392e-08   2.124   0.0337 *
total_costs    3.473e-08  5.657e-08   0.614   0.5393
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1019.00  on 810  degrees of freedom
Residual deviance:  806.49  on 806  degrees of freedom
AIC: 816.49

Number of Fisher Scoring iterations: 12

ACRO Probit Model

acro_glm(formula=formula, data=df, family="probit")

INFO:acro:probitr() outcome: pass; dof=806.0 >= 10
INFO:acro:records:add(): output_4

<class 'statsmodels.iolib.summary.Summary'>
"""
                          Probit Regression Results
==============================================================================
Dep. Variable:               survivor   No. Observations:                  811
Model:                         Probit   Df Residuals:                      806
Method:                           MLE   Df Model:                            4
Date:                Thu, 13 Jul 2023   Pseudo R-squ.:                  0.2086
Time:                        14:00:28   Log-Likelihood:                -403.24
converged:                       True   LL-Null:                       -509.50
Covariance Type:            nonrobust   LLR p-value:                 7.648e-45
=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0445      0.056      0.791      0.429      -0.066       0.155
inc_activity   4.114e-08   6.54e-08      0.629      0.530   -8.71e-08    1.69e-07
inc_grants    -4.297e-08   6.07e-08     -0.708      0.479   -1.62e-07     7.6e-08
inc_donations  1.357e-07   6.27e-08      2.163      0.031    1.28e-08    2.59e-07
total_costs    3.473e-08   5.84e-08      0.595      0.552   -7.96e-08    1.49e-07
=================================================================================

Possibly complete quasi-separation: A fraction 0.18 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
"""
Optimization terminated successfully.
         Current function value: 0.497218
         Iterations 10

Add Custom Output

acro_custom_output("XandY.jfif", "This output is an image showing the relationship between X and Y")

INFO:acro:records:add_custom(): output_5

Rename Output

acro_rename_output("output_5", "xy_plot")

INFO:acro:records:rename_output(): output_5 renamed to xy_plot

Remove Output

acro_remove_output("output_3")

INFO:acro:records:remove(): output_3 removed

Finalise

acro_finalise("RTEST", "json")

INFO:acro:records:
uid: output_0
status: fail
type: table
properties: {'method': 'crosstab'}
sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 6, 'p-ratio': 1, 'nk-rule': 1}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 3], [1, 3], [2, 3], [3, 3], [4, 3], [5, 3]], 'p-ratio': [[0, 3]], 'nk-rule': [[0, 3]]}}
command: crosstab()
summary: fail; threshold: 6 cells may need suppressing; p-ratio: 1 cells may need suppressing; nk-rule: 1 cells may need suppressing;
outcome: col_0    G   N   R                            R/G
row_0
2010.0  ok  ok  ok  threshold; p-ratio; nk-rule;
2011.0  ok  ok  ok                    threshold;
2012.0  ok  ok  ok                    threshold;
2013.0  ok  ok  ok                    threshold;
2014.0  ok  ok  ok                    threshold;
2015.0  ok  ok  ok                    threshold;
output: [col_0              G              N             R         R/G
row_0
2010.0  9.921906e+06       0.000000  8.402284e+06  11636000.0
2011.0  8.502247e+06  124013.862069  7.716880e+06  16047500.0
2012.0  1.145858e+07  131859.067797  6.958050e+06  16810000.0
2013.0  1.355715e+07  147937.796610  7.202274e+06  16765625.0
2014.0  1.374815e+07  133198.254237  8.277525e+06  17845750.0
2015.0  1.113343e+07  146572.189655  1.081289e+07  18278625.0]
timestamp: 2023-07-13T14:00:28.377038
comments: ['This is a crosstab on the nursery dataset.']
exception:

The status of the record above is: fail.
Please explain why an exception should be granted.

I really need this one.

INFO:acro:records:
uid: xy_plot
status: review
type: custom
properties: {}
sdc: {}
command: custom
summary: review
outcome: Empty DataFrame
Columns: []
Index: []
output: ['XandY.jfif']
timestamp: 2023-07-13T14:00:28.765323
comments: ['This output is an image showing the relationship between X and Y']
exception:

The status of the record above is: review.
Please explain why an exception should be granted.

It's just a plot of X and Y

INFO:acro:records:outputs written to: RTEST

<acro.record.Records object at 0x7fcf501b8d60>

LS0tCnRpdGxlOiAiQUNSTyBSIE5vdGVib29rIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7cn0KIyBpbnN0YWxsLnBhY2thZ2VzKCJoYXZlbiIpCiMgaW5zdGFsbC5wYWNrYWdlcygicmV0aWN1bGF0ZSIpCmBgYAoKIyMgSW1wb3J0IExpYnJhcmllcwoKYGBge3J9CmxpYnJhcnkoaGF2ZW4pICAgIyByZWFkIC5kdGEKc291cmNlKCIuLi9hY3JvLlIiKSAgIyBBQ1JPCmBgYAoKIyMgTG9hZCBEYXRhCgpgYGB7cn0KZGF0YSA9IHJlYWRfZHRhKCIuLi9kYXRhL3Rlc3RfZGF0YS5kdGEiKQpkYXRhID0gYXMuZGF0YS5mcmFtZShkYXRhKQpkYXRhID0gemFwX2xhYmVscyhkYXRhKSAgIyBTdGF0YSBkYXRhIGZpbGVzIGluY2x1ZGUgZXh0cmEgbGFiZWxzCgpoZWFkKGRhdGEpCmBgYAoKIyMgVGFibGVzCgojIyMgQUNSTyBDcm9zc3RhYgoKYGBge3J9CmluZGV4ID0gZGF0YVssIGMoInllYXIiKV0KY29sdW1ucyA9IGRhdGFbLCBjKCJncmFudF90eXBlIildCnZhbHVlcyA9IGRhdGFbLCBjKCJpbmNfZ3JhbnRzIildCmFnZ2Z1bmMgPSAibWVhbiIKCnRhYmxlID0gYWNyb19jcm9zc3RhYihpbmRleCwgY29sdW1ucywgdmFsdWVzPXZhbHVlcywgYWdnZnVuYz1hZ2dmdW5jKQpgYGAKCmBgYHtyfQp0YWJsZQpgYGAKCiMjIyBBZGQgQ29tbWVudHMgdG8gT3V0cHV0CgpgYGB7cn0KYWNyb19hZGRfY29tbWVudHMoIm91dHB1dF8wIiwgIlRoaXMgaXMgYSBjcm9zc3RhYiBvbiB0aGUgbnVyc2VyeSBkYXRhc2V0LiIpCmBgYAoKIyMjIEFDUk8gUGl2b3QgVGFibGUKCmBgYHtyfQppbmRleCA9ICJncmFudF90eXBlIgp2YWx1ZXMgPSAiaW5jX2dyYW50cyIKYWdnZnVuYyA9IGxpc3QoIm1lYW4iLCAic3RkIikKCnRhYmxlID0gYWNyb19waXZvdF90YWJsZShkYXRhLCB2YWx1ZXM9dmFsdWVzLCBpbmRleD1pbmRleCwgYWdnZnVuYz1hZ2dmdW5jKQpgYGAKCmBgYHtyfQp0YWJsZQpgYGAKCiMjIExpbmVhciBNb2RlbHMKCmBgYHtyfQojIGV4dHJhY3QgcmVsZXZhbnQgY29sdW1ucwpkZiA9IGRhdGFbLCBjKCJpbmNfYWN0aXZpdHkiLCAiaW5jX2dyYW50cyIsICJpbmNfZG9uYXRpb25zIiwgInRvdGFsX2Nvc3RzIildCiMgZHJvcCByb3dzIHdpdGggbWlzc2luZyB2YWx1ZXMKZGYgPSBkZltjb21wbGV0ZS5jYXNlcyhkZiksIF0KIyBmb3JtdWxhIHRvIGZpdApmb3JtdWxhID0gImluY19hY3Rpdml0eSB+IGluY19ncmFudHMgKyBpbmNfZG9uYXRpb25zICsgdG90YWxfY29zdHMiCmBgYAoKIyMjIEZpdCBMaW5lYXIgTW9kZWwKCmBgYHtyfQptb2RlbCA9IGxtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZikKc3VtbWFyeShtb2RlbCkKYGBgCgojIyMgQUNSTyBMaW5lYXIgTW9kZWwKCmBgYHtyfQphY3JvX2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZikKYGBgCgojIyBMb2dpdC9Qcm9iaXQgTW9kZWxzCgpgYGB7cn0KIyBleHRyYWN0IHJlbGV2YW50IGNvbHVtbnMKZGYgPSBkYXRhWywgYygic3Vydml2b3IiLCAiaW5jX2FjdGl2aXR5IiwgImluY19ncmFudHMiLCAiaW5jX2RvbmF0aW9ucyIsICJ0b3RhbF9jb3N0cyIpXQojIGRyb3Agcm93cyB3aXRoIG1pc3NpbmcgdmFsdWVzCmRmID0gZGZbY29tcGxldGUuY2FzZXMoZGYpLCBdCiMgY29udmVydCBzdXJ2aXZvciB0byBudW1lcmljCmRmID0gdHJhbnNmb3JtKGRmLCBzdXJ2aXZvciA9IGFzLm51bWVyaWMoc3Vydml2b3IpKQojIGZvcm11bGEgdG8gZml0CmZvcm11bGEgPSAic3Vydml2b3IgfiBpbmNfYWN0aXZpdHkgKyBpbmNfZ3JhbnRzICsgaW5jX2RvbmF0aW9ucyArIHRvdGFsX2Nvc3RzIgpgYGAKCiMjIyBGaXQgTG9naXQgTW9kZWwKCmBgYHtyfQptb2RlbCA9IGdsbShmb3JtdWxhPWZvcm11bGEsIGRhdGE9ZGYsIGZhbWlseT1iaW5vbWlhbChsaW5rPSJsb2dpdCIpKQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiMjIyBBQ1JPIExvZ2l0IE1vZGVsCgpgYGB7cn0KYWNyb19nbG0oZm9ybXVsYT1mb3JtdWxhLCBkYXRhPWRmLCBmYW1pbHk9ImxvZ2l0IikKYGBgCgojIyMgRml0IFByb2JpdCBNb2RlbAoKYGBge3J9Cm1vZGVsID0gZ2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZiwgZmFtaWx5PWJpbm9taWFsKGxpbms9InByb2JpdCIpKQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiMjIyBBQ1JPIFByb2JpdCBNb2RlbAoKYGBge3J9CmFjcm9fZ2xtKGZvcm11bGE9Zm9ybXVsYSwgZGF0YT1kZiwgZmFtaWx5PSJwcm9iaXQiKQpgYGAKCiMjIyBBZGQgQ3VzdG9tIE91dHB1dAoKYGBge3J9CmFjcm9fY3VzdG9tX291dHB1dCgiWGFuZFkuamZpZiIsICJUaGlzIG91dHB1dCBpcyBhbiBpbWFnZSBzaG93aW5nIHRoZSByZWxhdGlvbnNoaXAgYmV0d2VlbiBYIGFuZCBZIikKYGBgCgojIyMgUmVuYW1lIE91dHB1dAoKYGBge3J9CmFjcm9fcmVuYW1lX291dHB1dCgib3V0cHV0XzUiLCAieHlfcGxvdCIpCmBgYAoKIyMjIFJlbW92ZSBPdXRwdXQKCmBgYHtyfQphY3JvX3JlbW92ZV9vdXRwdXQoIm91dHB1dF8zIikKYGBgCgojIyBGaW5hbGlzZQoKYGBge3J9CmFjcm9fZmluYWxpc2UoIlJURVNUIiwgImpzb24iKQpgYGAK