Code
# Move to the project root (the project environment, with CSV, DataFrames,
# MLJBase, Plots, Serialization etc. loaded, is assumed to be active):
projectdir = splitpath(pwd()) |>
    ss -> joinpath(ss[1:findall([s == "endogenous-macrodynamics-in-algorithmic-recourse" for s in ss])[1]]...)
cd(projectdir)

# Synthetic data (data_path is assumed to point to the synthetic data folder):
n = 1000
p = 2
# Linearly separable:
X, y = make_blobs(n, p; centers=2, center_box=(-2 => 2), cluster_std=0.1)
df = DataFrame(X)
df.target .= ifelse.(y.==1,0,1)
CSV.write(joinpath(data_path, "linearly_separable.csv"),df)
# Overlapping:
X, y = make_blobs(n, p; centers=2, center_box=(-2 => 2), cluster_std=0.5)
df = DataFrame(X)
df.target .= ifelse.(y.==1,0,1)
CSV.write(joinpath(data_path, "overlapping.csv"),df)
# Circles:
X, y = make_circles(n; noise=0.15, factor=0.01)
df = DataFrame(X)
df.target = y
CSV.write(joinpath(data_path, "circles.csv"),df)
# Moon:
X, y = make_moons(n)
df = DataFrame(X)
df.target = y
CSV.write(joinpath(data_path, "moons.csv"),df)generate_artifacts(data_path)catalogue = load_synthetic()
function plot_data(data,title)
    plt = plot(title=uppercasefirst(replace(string(title),"_" => " ")))
    scatter!(data)
    return plt
end
plts = [plot_data(data,name) for (name, data) in catalogue]
plt = plot(plts..., layout=(1,4), size=(850,200))
savefig(plt, "paper/www/synthetic_data.png")
display(plt)
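The CSVs written above can also be read back in and wrapped as CounterfactualData, mirroring the treatment of the real-world datasets below; a minimal sketch for one of them (assuming the same data_path and packages as above):

# Read one synthetic dataset back in and wrap it as CounterfactualData:
df = CSV.read(joinpath(data_path, "linearly_separable.csv"), DataFrame)
X = permutedims(Matrix(df[:, Not(:target)]))  # features as p-by-n matrix
y = permutedims(df.target)                    # labels as 1-by-n row vector
data = CounterfactualData(X, y)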
projectdir = splitpath(pwd()) |>
    ss -> joinpath(ss[1:findall([s == "endogenous-macrodynamics-in-algorithmic-recourse" for s in ss])[1]]...)
cd(projectdir)

Fetching the California Housing data using Python's sklearn (run this in the Python REPL):
from sklearn.datasets import fetch_california_housing
df, y = fetch_california_housing(return_X_y=True, as_frame=True)
df["target"] = y.values
data_path = "dev/artifacts/upload/data/real_world"
import os
if not os.path.isdir(os.path.join(data_path,"raw")):
    os.makedirs(os.path.join(data_path,"raw"))
df.to_csv(os.path.join(data_path,"raw/cal_housing.csv"), index=False)

Loading the data into the Julia session (with data_path set to the same directory as above):
df = CSV.read(joinpath(data_path, "raw/cal_housing.csv"), DataFrame)
# Target:
y = df.target
y = Float64.(y .>= median(y)); # binary target (positive outcome)
# Data:
df.target = y

Random undersampling to balance the data:
df_balanced = getobs(undersample(df, df.target; shuffle=true))[1]
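A quick sanity check that the two classes are now balanced (a sketch using DataFrames' groupby and combine):

# Count rows per outcome class; both counts should be equal after undersampling:
combine(groupby(df_balanced, :target), nrow)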
All features are continuous:

schema(df_balanced)

Feature transformation:
transformer = Standardizer(count=true)
mach = MLJBase.fit!(machine(transformer, df_balanced[:,Not(:target)]))
X = MLJBase.transform(mach, df_balanced[:,Not(:target)])
schema(X)

Turning the data into CounterfactualData:
X = Matrix(X)
X = permutedims(X)
y = permutedims(df_balanced.target)
data = CounterfactualData(X,y)

Saving the data:
CSV.write(joinpath(data_path, "cal_housing.csv"), df_balanced) # binary file
Serialization.serialize(joinpath(data_path,"cal_housing.jls"), data) # CounterfactualDataLoading and basic preprocessing:
df = CSV.read(joinpath(data_path, "raw/cs-training.csv"), DataFrame)
select!(df, Not([:Column1]))
rename!(df, :SeriousDlqin2yrs => :target)
mapcols!(x -> [ifelse(x_=="NA", missing, x_) for x_ in x], df)
dropmissing!(df)
mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
df.target .= map(y -> y == 0 ? 1 : 0, df.target) # positive outcome = no delinquency
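After the NA handling and parsing above, no string columns should remain; a quick check (a sketch using DataFrames' describe):

# All columns should now have numeric element types:
describe(df, :eltype)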
Balancing:

df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]

All features are continuous:
schema(df_balanced)

Feature transformation:
transformer = Standardizer(count=true)
mach = MLJBase.fit!(machine(transformer, df_balanced[:,Not(:target)]))
X = MLJBase.transform(mach, df_balanced[:,Not(:target)])
schema(X)

Turning the data into CounterfactualData:
X = Matrix(X)
X = permutedims(X)
y = permutedims(df_balanced.target)
data = CounterfactualData(X,y)

Saving:
CSV.write(joinpath(data_path, "gmsc.csv"), df_balanced) # binary file
Serialization.serialize(joinpath(data_path,"gmsc.jls"), data) # CounterfactualDataLoading and basic preprocessing:
df = CSV.read(joinpath(data_path, "raw/UCI_Credit_Card.csv"), DataFrame)
select!(df, Not([:ID]))
rename!(df, "default.payment.next.month" => :target)
dropmissing!(df)
df.SEX = categorical(df.SEX)
df.EDUCATION = categorical(df.EDUCATION)
df.MARRIAGE = categorical(df.MARRIAGE)
mapcols!(x -> eltype(x) <: AbstractString ? parse.(Int, x) : x, df)
df.target .= map(y -> y == 0 ? 1 : 0, df.target) # positive outcome = no default

Balancing:
df_balanced = getobs(undersample(df, df.target;shuffle=true))[1]

Not all features are continuous:
schema(df_balanced)

Feature transformation:
transformer = Standardizer(count=true) |> ContinuousEncoder()
mach = MLJBase.fit!(machine(transformer, df_balanced[:,Not(:target)]))
X = MLJBase.transform(mach, df_balanced[:,Not(:target)])
schema(X)

Categorical indices:
features_categorical = [
    [2,3],
    collect(4:10),
    collect(11:14)
]
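These index groups refer to columns of the transformed feature table and are meant to line up with the one-hot encoded levels of SEX, EDUCATION and MARRIAGE produced by the ContinuousEncoder. A small sanity check (a sketch, run before X is converted to a matrix below):

# Print the column names covered by each categorical group:
for idx in features_categorical
    println(collect(schema(X).names)[idx])
end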
Preparing for use with CounterfactualExplanations.jl:

X = Matrix(X)
X = permutedims(X)
y = permutedims(df_balanced.target)
data = CounterfactualData(
    X, y;
    features_categorical = features_categorical
)

Saving:
CSV.write(joinpath(data_path, "credit_default.csv"), df_balanced) # binary file
Serialization.serialize(joinpath(data_path,"credit_default.jls"), data) # CounterfactualDatagenerate_artifacts(data_path)