# install.packages("haven")
# install.packages("reticulate")
# install.packages("farff")
source("../acro.R") # ACRO
INFO:acro:version: 0.4.0
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False
# Load the nursery dataset from its ARFF file
data <- farff::readARFF("../data/nursery.arff")
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Parse with reader=readr : ../data/nursery.arff
Loading required package: readr
header: 0.005000; preproc: 0.002000; data: 0.069000; postproc: 0.001000; total: 0.077000
# Work on a plain data.frame and call the "class" column "recommend" instead
data <- as.data.frame(data)
colnames(data)[colnames(data) == "class"] <- "recommend"
# Inspect the distinct values of children
unique(data[["children"]])
[1] 1 2 3 more
Levels: 1 2 3 more
#data$children <- sub("more","6",data$children)
unique(data$children)
[1] 1 2 3 more
Levels: 1 2 3 more
# Turn children into numbers; the non-numeric level "more" cannot be parsed
# and becomes NA, which is what triggers the coercion warning.
data[["children"]] <- as.numeric(as.character(data[["children"]]))
Warning: NAs introduced by coercion
unique(data$children)
[1] 1 2 3 NA
# Replace the NAs left in `children` by the coercion of "more" with a random
# whole number between 4 and 10. Only the `children` column is modified, so a
# missing value in any other column is never overwritten with a random number.
# NOTE(review): no seed is set, so the imputed values differ between runs, and
# round(runif()) makes the end points 4 and 10 half as likely as 5-9.
data$children[is.na(data$children)] <- round(
  runif(sum(is.na(data$children)), min = 4, max = 10),
  0
)
unique(data$children)
[1] 1 2 3 9 10 8 6 7 5 4
head(data)
#sapply(data)
# Row variable, column variable and the values to aggregate in the crosstab
index <- data[["recommend"]]
columns <- data[["parents"]]
values <- data[["children"]]
aggfunc <- "mean"
# Mean number of children for each recommend x parents combination
table <- acro_crosstab(index, columns, values = values, aggfunc = aggfunc)
INFO:acro:get_summary(): fail; threshold: 1 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 4 cells may need suppressing;
INFO:acro:outcome_df:
col_0 great_pret ... usual
row_0 ...
not_recom ok ... ok
priority ok ... ok
recommend p-ratio; nk-rule; ... threshold; p-ratio; nk-rule;
spec_prior ok ... ok
very_recom p-ratio; nk-rule; ... ok
[5 rows x 3 columns]
INFO:acro:records:add(): output_0
# Display the crosstab returned by acro_crosstab()
table
# Attach a free-text comment to the record named output_0
acro_add_comments("output_0", "This is a crosstab on the nursery dataset.")
INFO:acro:records:a comment was added to output_0
# Pivot table: mean and standard deviation of children for each parents group
index <- "parents"
values <- "children"
aggfunc <- list("mean", "std")
table <- acro_pivot_table(
  data,
  values = values,
  index = index,
  aggfunc = aggfunc
)
INFO:acro:get_summary(): pass
INFO:acro:outcome_df:
mean std
children children
parents
usual ok ok
pretentious ok ok
great_pret ok ok
INFO:acro:records:add(): output_1
table
Again there is an industry-standard package in Python, this time called statsmodels.
The examples below illustrate the use of the ACRO wrappers around standard statsmodels functions.
Note that statsmodels can be called using an ‘R-like’ format (using an ‘r’ suffix on the command names).
Most statsmodels functions return a “results object” which has a “summary” function that produces printable/saveable outputs.
# Recode the recommendation labels as the numbers 0-4 so the column can serve
# as the numeric response in the regression below. The position of each label
# in the lookup vector, minus one, is its code; any other label becomes NA.
data$recommend <- as.numeric(
  match(
    as.character(data$recommend),
    c("not_recom", "recommend", "very_recom", "priority", "spec_prior")
  ) - 1L
)
# Keep only the response and the predictor
df <- data[c("recommend", "children")]
# Discard every row that contains an NA
df <- df[complete.cases(df), ]
# Model specification, held as a string and reused by the acro_lm() call
formula <- "recommend ~ children"
# Ordinary least squares fit with base R, followed by its summary
model <- lm(formula, data = df)
summary(model)
Call:
lm(formula = formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-2.4091 -2.2500 0.7181 1.6636 1.7545
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.227332 0.024582 90.609 < 2e-16 ***
children 0.018177 0.006045 3.007 0.00264 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.681 on 12958 degrees of freedom
Multiple R-squared: 0.0006973, Adjusted R-squared: 0.0006202
F-statistic: 9.042 on 1 and 12958 DF, p-value: 0.002643
# Run the same linear regression through the ACRO wrapper
acro_lm(formula = formula, data = df)
INFO:acro:olsr() outcome: pass; dof=12958.0 >= 10
INFO:acro:records:add(): output_2
<class 'statsmodels.iolib.summary.Summary'>
"""
OLS Regression Results
==============================================================================
Dep. Variable: recommend R-squared: 0.001
Model: OLS Adj. R-squared: 0.001
Method: Least Squares F-statistic: 9.042
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.00264
Time: 13:53:07 Log-Likelihood: -25123.
No. Observations: 12960 AIC: 5.025e+04
Df Residuals: 12958 BIC: 5.027e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.2273 0.025 90.609 0.000 2.179 2.276
children 0.0182 0.006 3.007 0.003 0.006 0.030
==============================================================================
Omnibus: 76861.436 Durbin-Watson: 2.883
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1742.771
Skew: -0.485 Prob(JB): 0.00
Kurtosis: 1.488 Cond. No. 7.04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
"""
This is an example of logit/probit regression using ACRO.
We use a different combination of variables from the original
dataset.
# Keep only the response and the predictor
df <- data[c("finance", "children")]
# Discard every row that contains an NA
df <- df[complete.cases(df), ]
# Convert finance to numeric and shift it down by one,
# so that the 1s and 2s become 0s and 1s
df$finance <- as.numeric(df$finance) - 1
# Model specification, held as a string and reused by the acro_glm() calls
formula <- "finance ~ children"
# Logistic regression with base R, followed by its summary
model <- glm(formula, family = binomial(link = "logit"), data = df)
summary(model)
Call:
glm(formula = formula, family = binomial(link = "logit"), data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.17823 -1.17729 -0.00027 1.17747 1.17768
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0009243 0.0292390 -0.032 0.975
children 0.0002843 0.0071900 0.040 0.968
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 17966 on 12959 degrees of freedom
Residual deviance: 17966 on 12958 degrees of freedom
AIC: 17970
Number of Fisher Scoring iterations: 2
# Run the same logit regression through the ACRO wrapper
acro_glm(formula = formula, data = df, family = "logit")
INFO:acro:logitr() outcome: pass; dof=12958.0 >= 10
INFO:acro:records:add(): output_3
<class 'statsmodels.iolib.summary.Summary'>
"""
Logit Regression Results
==============================================================================
Dep. Variable: finance No. Observations: 12960
Model: Logit Df Residuals: 12958
Method: MLE Df Model: 1
Date: Thu, 13 Jul 2023 Pseudo R-squ.: 8.704e-08
Time: 13:53:07 Log-Likelihood: -8983.2
converged: True LL-Null: -8983.2
Covariance Type: nonrobust LLR p-value: 0.9685
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.0009 0.029 -0.032 0.975 -0.058 0.056
children 0.0003 0.007 0.040 0.968 -0.014 0.014
==============================================================================
"""
Optimization terminated successfully.
Current function value: 0.693147
Iterations 2
# Refit the same formula with a probit link in place of the logit link
model <- glm(formula, family = binomial(link = "probit"), data = df)
summary(model)
Call:
glm(formula = formula, family = binomial(link = "probit"), data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.17823 -1.17729 -0.00027 1.17747 1.17768
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0005792 0.0183228 -0.032 0.975
children 0.0001782 0.0045057 0.040 0.968
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 17966 on 12959 degrees of freedom
Residual deviance: 17966 on 12958 degrees of freedom
AIC: 17970
Number of Fisher Scoring iterations: 2
# Run the same probit regression through the ACRO wrapper
acro_glm(formula = formula, data = df, family = "probit")
INFO:acro:probitr() outcome: pass; dof=12958.0 >= 10
INFO:acro:records:add(): output_4
<class 'statsmodels.iolib.summary.Summary'>
"""
Probit Regression Results
==============================================================================
Dep. Variable: finance No. Observations: 12960
Model: Probit Df Residuals: 12958
Method: MLE Df Model: 1
Date: Thu, 13 Jul 2023 Pseudo R-squ.: 8.704e-08
Time: 13:53:07 Log-Likelihood: -8983.2
converged: True LL-Null: -8983.2
Covariance Type: nonrobust LLR p-value: 0.9685
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.0006 0.018 -0.032 0.975 -0.036 0.035
children 0.0002 0.005 0.040 0.968 -0.009 0.009
==============================================================================
"""
Optimization terminated successfully.
Current function value: 0.693147
Iterations 2
# Register an external file as a custom output, with a describing comment;
# the log line below shows it is recorded as output_5.
acro_custom_output("XandY.jfif", "This output is an image showing the relationship between X and Y")
INFO:acro:records:add_custom(): output_5
# Give that record a more meaningful name
acro_rename_output("output_5", "xy_plot")
INFO:acro:records:rename_output(): output_5 renamed to xy_plot
# Drop output_3 (the logit regression) from the records
acro_remove_output("output_3")
INFO:acro:records:remove(): output_3 removed
# Print every output that is still recorded in the session
acro_print_outputs()
[1] "uid: output_0\nstatus: fail\ntype: table\nproperties: {'method': 'crosstab'}\nsdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 1, 'p-ratio': 4, 'nk-rule': 4}, 'cells': {'negative': [], 'missing': [], 'threshold': [[2, 2]], 'p-ratio': [[2, 0], [2, 1], [2, 2], [4, 0]], 'nk-rule': [[2, 0], [2, 1], [2, 2], [4, 0]]}}\ncommand: crosstab()\nsummary: fail; threshold: 1 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 4 cells may need suppressing; \noutcome: col_0 great_pret ... usual\nrow_0 ... \nnot_recom ok ... ok\npriority ok ... ok\nrecommend p-ratio; nk-rule; ... threshold; p-ratio; nk-rule; \nspec_prior ok ... ok\nvery_recom p-ratio; nk-rule; ... ok\n\n[5 rows x 3 columns]\noutput: [col_0 great_pret pretentious usual\nrow_0 \nnot_recom 3.227083 3.270139 3.237500\npriority 2.677156 3.109164 3.268191\nrecommend NaN NaN 1.000000\nspec_prior 3.492582 3.501582 3.519789\nvery_recom NaN 2.303030 2.295918]\ntimestamp: 2023-07-13T13:53:06.960935\ncomments: ['This is a crosstab on the nursery dataset.']\nexception: \n\nuid: output_1\nstatus: pass\ntype: table\nproperties: {'method': 'pivot_table'}\nsdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 0, 'p-ratio': 0, 'nk-rule': 0}, 'cells': {'negative': [], 'missing': [], 'threshold': [], 'p-ratio': [], 'nk-rule': []}}\ncommand: pivot_table()\nsummary: pass\noutcome: mean std\n children children\nparents \nusual ok ok\npretentious ok ok\ngreat_pret ok ok\noutput: [ mean std\n children children\nparents \nusual 3.256944 2.449811\npretentious 3.253009 2.449417\ngreat_pret 3.242130 2.431784]\ntimestamp: 2023-07-13T13:53:07.105779\ncomments: []\nexception: \n\nuid: output_2\nstatus: pass\ntype: regression\nproperties: {'method': 'olsr', 'dof': 12958.0}\nsdc: {}\ncommand: olsr()\nsummary: pass; dof=12958.0 >= 10\noutcome: Empty DataFrame\nColumns: []\nIndex: []\noutput: [ recommend R-squared: 0.001\nDep. Variable: \nModel: OLS Adj. 
R-squared: 0.00100\nMethod: Least Squares F-statistic: 9.04200\nDate: Thu, 13 Jul 2023 Prob (F-statistic): 0.00264\nTime: 13:53:07 Log-Likelihood: -25123.00000\nNo. Observations: 12960 AIC: 50250.00000\nDf Residuals: 12958 BIC: 50270.00000\nDf Model: 1 NaN NaN\nCovariance Type: nonrobust NaN NaN, coef std err t P>|t| [0.025 0.975]\nIntercept 2.2273 0.025 90.609 0.000 2.179 2.276\nchildren 0.0182 0.006 3.007 0.003 0.006 0.030, 76861.436 Durbin-Watson: 2.883\nOmnibus: \nProb(Omnibus): 0.000 Jarque-Bera (JB): 1742.771\nSkew: -0.485 Prob(JB): 0.000\nKurtosis: 1.488 Cond. No. 7.040]\ntimestamp: 2023-07-13T13:53:07.232131\ncomments: []\nexception: \n\nuid: output_4\nstatus: pass\ntype: regression\nproperties: {'method': 'probitr', 'dof': 12958.0}\nsdc: {}\ncommand: probitr()\nsummary: pass; dof=12958.0 >= 10\noutcome: Empty DataFrame\nColumns: []\nIndex: []\noutput: [ finance No. Observations: 12960\nDep. Variable: \nModel: Probit Df Residuals: 1.295800e+04\nMethod: MLE Df Model: 1.000000e+00\nDate: Thu, 13 Jul 2023 Pseudo R-squ.: 8.704000e-08\nTime: 13:53:07 Log-Likelihood: -8.983200e+03\nconverged: True LL-Null: -8.983200e+03\nCovariance Type: nonrobust LLR p-value: 9.685000e-01, coef std err z P>|z| [0.025 0.975]\nIntercept -0.0006 0.018 -0.032 0.975 -0.036 0.035\nchildren 0.0002 0.005 0.040 0.968 -0.009 0.009]\ntimestamp: 2023-07-13T13:53:07.538802\ncomments: []\nexception: \n\nuid: xy_plot\nstatus: review\ntype: custom\nproperties: {}\nsdc: {}\ncommand: custom\nsummary: review\noutcome: Empty DataFrame\nColumns: []\nIndex: []\noutput: ['XandY.jfif']\ntimestamp: 2023-07-13T13:53:07.560586\ncomments: ['This output is an image showing the relationship between X and Y']\nexception: \n\n"
uid: output_0
status: fail
type: table
properties: {'method': 'crosstab'}
sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 1, 'p-ratio': 4, 'nk-rule': 4}, 'cells': {'negative': [], 'missing': [], 'threshold': [[2, 2]], 'p-ratio': [[2, 0], [2, 1], [2, 2], [4, 0]], 'nk-rule': [[2, 0], [2, 1], [2, 2], [4, 0]]}}
command: crosstab()
summary: fail; threshold: 1 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 4 cells may need suppressing;
outcome: col_0 great_pret ... usual
row_0 ...
not_recom ok ... ok
priority ok ... ok
recommend p-ratio; nk-rule; ... threshold; p-ratio; nk-rule;
spec_prior ok ... ok
very_recom p-ratio; nk-rule; ... ok
[5 rows x 3 columns]
output: [col_0 great_pret pretentious usual
row_0
not_recom 3.227083 3.270139 3.237500
priority 2.677156 3.109164 3.268191
recommend NaN NaN 1.000000
spec_prior 3.492582 3.501582 3.519789
very_recom NaN 2.303030 2.295918]
timestamp: 2023-07-13T13:53:06.960935
comments: ['This is a crosstab on the nursery dataset.']
exception:
uid: output_1
status: pass
type: table
properties: {'method': 'pivot_table'}
sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 0, 'p-ratio': 0, 'nk-rule': 0}, 'cells': {'negative': [], 'missing': [], 'threshold': [], 'p-ratio': [], 'nk-rule': []}}
command: pivot_table()
summary: pass
outcome: mean std
children children
parents
usual ok ok
pretentious ok ok
great_pret ok ok
output: [ mean std
children children
parents
usual 3.256944 2.449811
pretentious 3.253009 2.449417
great_pret 3.242130 2.431784]
timestamp: 2023-07-13T13:53:07.105779
comments: []
exception:
uid: output_2
status: pass
type: regression
properties: {'method': 'olsr', 'dof': 12958.0}
sdc: {}
command: olsr()
summary: pass; dof=12958.0 >= 10
outcome: Empty DataFrame
Columns: []
Index: []
output: [ recommend R-squared: 0.001
Dep. Variable:
Model: OLS Adj. R-squared: 0.00100
Method: Least Squares F-statistic: 9.04200
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.00264
Time: 13:53:07 Log-Likelihood: -25123.00000
No. Observations: 12960 AIC: 50250.00000
Df Residuals: 12958 BIC: 50270.00000
Df Model: 1 NaN NaN
Covariance Type: nonrobust NaN NaN, coef std err t P>|t| [0.025 0.975]
Intercept 2.2273 0.025 90.609 0.000 2.179 2.276
children 0.0182 0.006 3.007 0.003 0.006 0.030, 76861.436 Durbin-Watson: 2.883
Omnibus:
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1742.771
Skew: -0.485 Prob(JB): 0.000
Kurtosis: 1.488 Cond. No. 7.040]
timestamp: 2023-07-13T13:53:07.232131
comments: []
exception:
uid: output_4
status: pass
type: regression
properties: {'method': 'probitr', 'dof': 12958.0}
sdc: {}
command: probitr()
summary: pass; dof=12958.0 >= 10
outcome: Empty DataFrame
Columns: []
Index: []
output: [ finance No. Observations: 12960
Dep. Variable:
Model: Probit Df Residuals: 1.295800e+04
Method: MLE Df Model: 1.000000e+00
Date: Thu, 13 Jul 2023 Pseudo R-squ.: 8.704000e-08
Time: 13:53:07 Log-Likelihood: -8.983200e+03
converged: True LL-Null: -8.983200e+03
Covariance Type: nonrobust LLR p-value: 9.685000e-01, coef std err z P>|z| [0.025 0.975]
Intercept -0.0006 0.018 -0.032 0.975 -0.036 0.035
children 0.0002 0.005 0.040 0.968 -0.009 0.009]
timestamp: 2023-07-13T13:53:07.538802
comments: []
exception:
uid: xy_plot
status: review
type: custom
properties: {}
sdc: {}
command: custom
summary: review
outcome: Empty DataFrame
Columns: []
Index: []
output: ['XandY.jfif']
timestamp: 2023-07-13T13:53:07.560586
comments: ['This output is an image showing the relationship between X and Y']
exception:
#acro_finalise("RTEST", "xlsx")
# Finalise the session and write the recorded outputs to "RTEST", here with
# the "json" option. As the log that follows shows, each record whose status
# is "fail" or "review" prompts for an explanation of why an exception should
# be granted.
acro_finalise("RTEST", "json")
INFO:acro:records:
uid: output_0
status: fail
type: table
properties: {'method': 'crosstab'}
sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 1, 'p-ratio': 4, 'nk-rule': 4}, 'cells': {'negative': [], 'missing': [], 'threshold': [[2, 2]], 'p-ratio': [[2, 0], [2, 1], [2, 2], [4, 0]], 'nk-rule': [[2, 0], [2, 1], [2, 2], [4, 0]]}}
command: crosstab()
summary: fail; threshold: 1 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 4 cells may need suppressing;
outcome: col_0 great_pret ... usual
row_0 ...
not_recom ok ... ok
priority ok ... ok
recommend p-ratio; nk-rule; ... threshold; p-ratio; nk-rule;
spec_prior ok ... ok
very_recom p-ratio; nk-rule; ... ok
[5 rows x 3 columns]
output: [col_0 great_pret pretentious usual
row_0
not_recom 3.227083 3.270139 3.237500
priority 2.677156 3.109164 3.268191
recommend NaN NaN 1.000000
spec_prior 3.492582 3.501582 3.519789
very_recom NaN 2.303030 2.295918]
timestamp: 2023-07-13T13:53:06.960935
comments: ['This is a crosstab on the nursery dataset.']
exception:
The status of the record above is: fail.
Please explain why an exception should be granted.
I really want this output
INFO:acro:records:
uid: xy_plot
status: review
type: custom
properties: {}
sdc: {}
command: custom
summary: review
outcome: Empty DataFrame
Columns: []
Index: []
output: ['XandY.jfif']
timestamp: 2023-07-13T13:53:07.560586
comments: ['This output is an image showing the relationship between X and Y']
exception:
The status of the record above is: review.
Please explain why an exception should be granted.
It's just a plot of x and y
INFO:acro:records:outputs written to: RTEST
<acro.record.Records object at 0x7fec6efb4d60>