Packages Used: Functions from the following packages are used in this notebook for the first time: MLJ, DataFrames, Random, Plots, CSV, and Statistics.
Classification in relation to other aspects of machine learning:
using MLJ, DataFrames, Random
Random.seed!(95117)
n = 8
df = DataFrame(target=rand(0:1,n).>.5, feature1=rand(n), feature2=rand(n))
8 rows × 3 columns

| | target | feature1 | feature2 |
|---|---|---|---|
| | Bool | Float64 | Float64 |
| 1 | 0 | 0.0761072 | 0.987106 |
| 2 | 0 | 0.420093 | 0.741078 |
| 3 | 0 | 0.666973 | 0.0194752 |
| 4 | 1 | 0.0972444 | 0.230083 |
| 5 | 1 | 0.86052 | 0.580439 |
| 6 | 0 | 0.447504 | 0.563352 |
| 7 | 1 | 0.29995 | 0.346412 |
| 8 | 0 | 0.629998 | 0.687563 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ target   │ Bool    │ Count      │
│ feature1 │ Float64 │ Continuous │
│ feature2 │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 8
using Plots
scatter(df.feature1[.!df.target], df.feature2[.!df.target], label="target = false")
scatter!(df.feature1[df.target], df.feature2[df.target], label="target = true")
coerce!(df, :target=>OrderedFactor)
schema(df)
┌──────────┬────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                        │ _.scitypes       │
├──────────┼────────────────────────────────┼──────────────────┤
│ target   │ CategoricalValue{Bool, UInt32} │ OrderedFactor{2} │
│ feature1 │ Float64                        │ Continuous       │
│ feature2 │ Float64                        │ Continuous       │
└──────────┴────────────────────────────────┴──────────────────┘
_.nrows = 8
y = df.target
X = select(df, Not(:target))
first(X, 5)
5 rows × 2 columns

| | feature1 | feature2 |
|---|---|---|
| | Float64 | Float64 |
| 1 | 0.0761072 | 0.987106 |
| 2 | 0.420093 | 0.741078 |
| 3 | 0.666973 | 0.0194752 |
| 4 | 0.0972444 | 0.230083 |
| 5 | 0.86052 | 0.580439 |
models(matching(X, y))
48-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
 (name = AdaBoostClassifier, package_name = ScikitLearn, ... )
 (name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
 (name = BaggingClassifier, package_name = ScikitLearn, ... )
 (name = BayesianLDA, package_name = MultivariateStats, ... )
 (name = BayesianLDA, package_name = ScikitLearn, ... )
 (name = BayesianQDA, package_name = ScikitLearn, ... )
 (name = BayesianSubspaceLDA, package_name = MultivariateStats, ... )
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 (name = DecisionTreeClassifier, package_name = DecisionTree, ... )
 (name = DeterministicConstantClassifier, package_name = MLJModels, ... )
 (name = DummyClassifier, package_name = ScikitLearn, ... )
 (name = EvoTreeClassifier, package_name = EvoTrees, ... )
 ⋮
 (name = RandomForestClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = DecisionTree, ... )
 (name = RandomForestClassifier, package_name = ScikitLearn, ... )
 (name = RidgeCVClassifier, package_name = ScikitLearn, ... )
 (name = RidgeClassifier, package_name = ScikitLearn, ... )
 (name = SGDClassifier, package_name = ScikitLearn, ... )
 (name = SVC, package_name = LIBSVM, ... )
 (name = SVMClassifier, package_name = ScikitLearn, ... )
 (name = SVMLinearClassifier, package_name = ScikitLearn, ... )
 (name = SVMNuClassifier, package_name = ScikitLearn, ... )
 (name = SubspaceLDA, package_name = MultivariateStats, ... )
 (name = XGBoostClassifier, package_name = XGBoost, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
DecisionTreeClassifier(
max_depth = -1,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @577
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @233 trained 0 times; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
fit!(mach)
┌ Info: Training Machine{DecisionTreeClassifier,…} @233.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
Machine{DecisionTreeClassifier,…} @233 trained 1 time; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree Leaves: 4 Depth: 3, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
ŷ = predict_mode(mach, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false false false true true false true false
[ŷ y]
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
 false  false
 false  false
 false  false
 true   true
 true   true
 false  false
 true   true
 false  false
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │    false    │    true     │
├─────────────┼─────────────┼─────────────┤
│    false    │      5      │      0      │
├─────────────┼─────────────┼─────────────┤
│    true     │      0      │      3      │
└─────────────┴─────────────┴─────────────┘
predict_mode(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false
predict(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
report(mach).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R->
R-> 1 : 3/3
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 2)
DecisionTreeClassifier(
max_depth = 2,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @790
mach2 = machine(model2, X, y)
fit!(mach2)
mach2.fitresult
┌ Info: Training Machine{DecisionTreeClassifier,…} @711.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
(Decision Tree Leaves: 3 Depth: 2, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach2).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> 1 : 2/3
R-> 1 : 3/3
ŷ = predict_mode(mach2, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false false false true false false true false
[ŷ y]
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
 false  false
 false  false
 false  false
 true   true
 false  true
 false  false
 true   true
 false  false
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │    false    │    true     │
├─────────────┼─────────────┼─────────────┤
│    false    │      5      │      1      │
├─────────────┼─────────────┼─────────────┤
│    true     │      0      │      2      │
└─────────────┴─────────────┴─────────────┘
7/8, accuracy(ŷ, y)
(0.875, 0.875)
predict(mach2, X)
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}:
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
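Because the model is probabilistic, predict_mode amounts to thresholding these distributions at probability 0.5. The probabilities can also be extracted and thresholded explicitly; a minimal sketch using MLJ's pdf (the 0.5 cutoff here is an arbitrary choice for illustration):

# Probability assigned to class `true` for each sample, then a manual cutoff.
p_true = pdf.(predict(mach2, X), true)
ŷ_manual = p_true .>= 0.5   # agrees with predict_mode up to tie-breaking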
predict_mode(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false
predict(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
info("DecisionTreeClassifier", pkg="DecisionTree")
CART decision tree classifier.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeClassifier pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeClassifier` for documentation.
(name = "DecisionTreeClassifier",
 package_name = "DecisionTree",
 is_supervised = true,
 abstract_type = Probabilistic,
 deep_properties = (),
 docstring = "CART decision tree classifier.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeClassifier pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeClassifier` for documentation.",
 fit_data_scitype = Tuple{Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous}, AbstractVector{_s41} where _s41<:Finite},
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing),
 hyperparameter_types = ("Int64", "Int64", "Int64", "Float64", "Int64", "Bool", "Float64", "Float64", "Int64", "Union{Integer, Random.AbstractRNG}"),
 hyperparameters = (:max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune, :merge_purity_threshold, :pdf_smoothing, :display_depth, :rng),
 implemented_methods = [:clean!, :fit, :fitted_params, :predict],
 inverse_transform_scitype = Unknown,
 is_pure_julia = true,
 is_wrapper = false,
 iteration_parameter = nothing,
 load_path = "MLJDecisionTreeInterface.DecisionTreeClassifier",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 predict_scitype = AbstractVector{ScientificTypesBase.Density{_s25} where _s25<:Finite},
 prediction_type = :probabilistic,
 supports_class_weights = false,
 supports_online = false,
 supports_training_losses = false,
 supports_weights = false,
 transform_scitype = Unknown,
 input_scitype = Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous},
 target_scitype = AbstractVector{_s41} where _s41<:Finite,
 output_scitype = Unknown,)
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
t, f2 = df.target, df.feature2
idx = sortperm(f2)
t, f2 = t[idx], f2[idx]
s = 0.63
t[f2 .<= s]
5-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false true true false true
1 - (2/5)^2 - (3/5)^2
0.48
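This is the Gini impurity: with $p_k$ the fraction of samples of class $k$ in a node $T$,

$$G(T) = 1 - \sum_k p_k^2 .$$

For the five samples left of the split (2 false, 3 true), $G = 1 - (2/5)^2 - (3/5)^2 = 0.48$, matching the value above. The helper below computes this for any label vector.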
impurity(t) = 1 - sum(([sum(t .== i) for i in unique(t)]./length(t)).^2)   # Gini impurity of label vector t
impurity(t[f2 .<= s])
0.48
impurity(t[f2 .> s])
0.0
f2[1:end-1] .+ diff(f2)/2
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
split_cont(f) = f[1:end-1] .+ diff(f)/2   # candidate thresholds: midpoints of consecutive sorted values
split_cont(f2)
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
impurity(t)
0.46875
# IG = impurity(T) - impurity(TL)*nTL/nT - impurity(TR)*nTR/nT
IG = 0.46875 - 0.48*5/8 - 0.0*3/8
0.16875
IG = impurity(t) - impurity(t[f2 .<= s])*5/8 - impurity(t[f2 .> s])*3/8
0.16875
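Equivalently, for a split $s$ that sends $n_L$ of the node's $n$ samples to the left child $T_L$ and $n_R$ to the right child $T_R$, the information gain is

$$IG(s) = G(T) - \frac{n_L}{n}\,G(T_L) - \frac{n_R}{n}\,G(T_R) = 0.46875 - \frac{5}{8}\cdot 0.48 - \frac{3}{8}\cdot 0.0 = 0.16875 .$$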
function infogain(s, f, t)
    is = f .<= s   # samples sent to the left child
    IG = impurity(t) - impurity(t[is])*length(f[is])/length(f) -
         impurity(t[.!is])*length(f[.!is])/length(f)
end
infogain (generic function with 1 method)
df
8 rows × 3 columns

| | target | feature1 | feature2 |
|---|---|---|---|
| | Cat… | Float64 | Float64 |
| 1 | false | 0.0761072 | 0.987106 |
| 2 | false | 0.420093 | 0.741078 |
| 3 | false | 0.666973 | 0.0194752 |
| 4 | true | 0.0972444 | 0.230083 |
| 5 | true | 0.86052 | 0.580439 |
| 6 | false | 0.447504 | 0.563352 |
| 7 | true | 0.29995 | 0.346412 |
| 8 | false | 0.629998 | 0.687563 |
T = Matrix(df[!,[2,3,1]])   # feature1, feature2, target (as 0.0/1.0) in the last column
8×3 Matrix{Float64}:
 0.0761072  0.987106   0.0
 0.420093   0.741078   0.0
 0.666973   0.0194752  0.0
 0.0972444  0.230083   1.0
 0.86052    0.580439   1.0
 0.447504   0.563352   0.0
 0.29995    0.346412   1.0
 0.629998   0.687563   0.0
idx1 = sortperm(T[:,1])
8-element Vector{Int64}: 1 4 7 2 6 8 3 5
s1 = split_cont(T[idx1,1])
7-element Vector{Float64}: 0.08667578253830954 0.19859718643090984 0.3600215998051657 0.4337986380036255 0.5387510783253423 0.6484857689281089 0.7637465112420186
[infogain(i,T[idx1,1],T[idx1,end]) for i in s1]
7-element Vector{Float64}: 0.04017857142857134 0.010416666666666685 0.10208333333333347 0.03125 0.0020833333333333537 0.010416666666666685 0.11160714285714285
idx2 = sortperm(T[:,2])
s2 = split_cont(T[idx2,2])
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
[infogain(i,T[idx2,2],T[idx2,end]) for i in s2]
7-element Vector{Float64}: 0.04017857142857134 0.010416666666666685 0.10208333333333347 0.03125 0.16875 0.09375 0.04017857142857134
argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])
5
s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
0.6340010854678377
isL = T[idx2,2] .<= s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
8-element BitVector: 1 1 1 1 1 0 0 0
TL = T[idx2[isL],:]
5×3 Matrix{Float64}:
 0.666973   0.0194752  0.0
 0.0972444  0.230083   1.0
 0.29995    0.346412   1.0
 0.447504   0.563352   0.0
 0.86052    0.580439   1.0
impurity(TL[:,end])
0.48
TR = T[idx2[.!isL],:]
3×3 Matrix{Float64}:
 0.629998   0.687563  0.0
 0.420093   0.741078  0.0
 0.0761072  0.987106  0.0
impurity(TR[:,end])
0.0
idx1 = sortperm(TL[:,1])
s1 = split_cont(TL[idx1,1])
[infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1]
4-element Vector{Float64}: 0.07999999999999996 0.21333333333333332 0.013333333333333308 0.07999999999999996
idx2 = sortperm(TL[:,2])
s2 = split_cont(TL[idx2,2])
[infogain(i,TL[idx2,2],TL[idx2,end]) for i in s2]
4-element Vector{Float64}: 0.18 0.013333333333333308 0.013333333333333308 0.07999999999999996
s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
0.3737270065061601
isLL = TL[idx1,1] .<= s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
5-element BitVector: 1 1 0 0 0
TLL = TL[idx1[isLL],:]
2×3 Matrix{Float64}:
 0.0972444  0.230083  1.0
 0.29995    0.346412  1.0
impurity(TLL[:,end])
0.0
TLR = TL[idx1[.!isLL],:]
3×3 Matrix{Float64}:
 0.447504  0.563352   0.0
 0.666973  0.0194752  0.0
 0.86052   0.580439   1.0
impurity(TLR[:,end])
0.4444444444444444
idx1 = sortperm(TLR[:,1])
s1 = split_cont(TLR[idx1,1])
[infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1]
2-element Vector{Float64}: 0.1111111111111111 0.4444444444444444
idx2 = sortperm(TLR[:,2])
s2 = split_cont(TLR[idx2,2])
[infogain(i,TLR[idx2,2],TLR[idx2,end]) for i in s2]
2-element Vector{Float64}: 0.1111111111111111 0.4444444444444444
s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
0.7637465112420186
isLLL = TLR[idx1,1] .<= s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
3-element BitVector: 1 1 0
TLLL = TLR[idx1[isLLL],:]
2×3 Matrix{Float64}:
 0.447504  0.563352   0.0
 0.666973  0.0194752  0.0
impurity(TLLL[:,end])
0.0
TLRR = TLR[idx1[.!isLLL],:]
1×3 Matrix{Float64}: 0.86052 0.580439 1.0
impurity(TLRR[:,end])
0.0
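The repeated sort, split, and argmax steps above can be wrapped into a single helper. The following is a minimal sketch (the name best_split is not from this notebook), assuming the impurity, split_cont, and infogain functions defined earlier and a matrix whose last column holds the target:

# Scan each feature column for the threshold with the highest information
# gain; the last column of T is assumed to be the 0/1 target.
function best_split(T)
    best = (gain = -Inf, feature = 0, threshold = NaN)
    for j in 1:size(T, 2)-1
        idx = sortperm(T[:, j])
        for s in split_cont(T[idx, j])
            g = infogain(s, T[idx, j], T[idx, end])
            if g > best.gain
                best = (gain = g, feature = j, threshold = s)
            end
        end
    end
    return best
end

best_split(T)   # should recover feature 2, threshold ≈ 0.634, as in the first split above

Applying best_split recursively to TL and TR, stopping once a node's impurity reaches zero, reproduces the tree that print_tree displayed.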
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-train.csv"))
first(df, 5)
5 rows × 8 columns (omitted printing of 2 columns)

| | Survived | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | String | Float64 | Int64 |
| 1 | 1 | 1 | Mr. Dickinson H Bishop | male | 25.0 | 1 |
| 2 | 0 | 3 | Miss. Aloisia Haas | female | 24.0 | 0 |
| 3 | 0 | 3 | Mr. William Alfred Brocklebank | male | 35.0 | 0 |
| 4 | 0 | 3 | Mr. Samuel Beard Risien | male | 69.0 | 0 |
| 5 | 1 | 1 | Mr. Harry Anderson | male | 48.0 | 0 |
describe(df)
8 rows × 7 columns (omitted printing of 2 columns)

| | variable | mean | min | median | max |
|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 |
| 3 | Name | | Capt. Edward Gifford Crosby | | Rev. Thomas Roussel Davids Byles |
| 4 | Sex | | female | | male |
| 5 | Age | 29.4592 | 0.42 | 28.0 | 74.0 |
| 6 | Siblings/Spouses Aboard | 0.512676 | 0 | 0.0 | 8 |
| 7 | Parents/Children Aboard | 0.394366 | 0 | 0.0 | 6 |
| 8 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 7 columns

| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | male | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | female | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | male | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | male | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | male | 48.0 | 0 | 0 | 26.55 |
describe(df)
7 rows × 7 columns

| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any | Int64 | DataType |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 | 0 | Int64 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 3 | Sex | | female | | male | 0 | String |
| 4 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 5 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 6 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 7 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ Survived │ Int64   │ Count      │
│ Pclass   │ Int64   │ Count      │
│ Sex      │ String  │ Textual    │
│ Age      │ Float64 │ Continuous │
│ Sibsp    │ Int64   │ Count      │
│ Parch    │ Int64   │ Count      │
│ Fare     │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 710
subtypes(Finite)
2-element Vector{Any}: Multiclass OrderedFactor
subtypes(Infinite)
2-element Vector{Any}: Continuous Count
df.Survived = coerce(df.Survived, OrderedFactor)
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                         │ _.scitypes       │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass   │ Int64                           │ Count            │
│ Sex      │ String                          │ Textual          │
│ Age      │ Float64                         │ Continuous       │
│ Sibsp    │ Int64                           │ Count            │
│ Parch    │ Int64                           │ Count            │
│ Fare     │ Float64                         │ Continuous       │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
df.Sex = coerce(df.Sex, OrderedFactor)  # strings → ordered categorical
df.Sex = coerce(df.Sex, Count)          # categorical level codes → integers (female = 1, male = 2)
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                         │ _.scitypes       │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass   │ Int64                           │ Count            │
│ Sex      │ Int64                           │ Count            │
│ Age      │ Float64                         │ Continuous       │
│ Sibsp    │ Int64                           │ Count            │
│ Parch    │ Int64                           │ Count            │
│ Fare     │ Float64                         │ Continuous       │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
first(df, 5)
5 rows × 7 columns

| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Cat… | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
y = df.Survived
X = select(df, Not(:Survived))
first(X, 5)
5 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
scitype(y)
AbstractVector{OrderedFactor{2}} (alias for AbstractArray{OrderedFactor{2}, 1})
models(matching(X, y))
11-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
 (name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 (name = DecisionTreeClassifier, package_name = DecisionTree, ... )
 (name = DeterministicConstantClassifier, package_name = MLJModels, ... )
 (name = KernelPerceptronClassifier, package_name = BetaML, ... )
 (name = PegasosClassifier, package_name = BetaML, ... )
 (name = PerceptronClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = DecisionTree, ... )
 (name = RandomForestClassifier, package_name = ScikitLearn, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @647 trained 0 times; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=7245)
([101, 70, 656, 623, 103, 575, 576, 115, 506, 188 … 328, 123, 321, 318, 297, 201, 41, 184, 22, 592], [521, 77, 180, 290, 265, 86, 198, 31, 134, 209 … 79, 176, 531, 603, 606, 509, 447, 490, 478, 266])
fit!(mach, rows=train, verbosity=0)
Machine{DecisionTreeClassifier,…} @647 trained 1 time; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree Leaves: 104 Depth: 18, CategoricalArrays.CategoricalValue{Int64, UInt32}[0, 1], UInt32[0x00000001, 0x00000002])
names(X)
6-element Vector{String}: "Pclass" "Sex" "Age" "Sibsp" "Parch" "Fare"
report(mach).print_tree(5)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
    L-> Feature 3, Threshold 3.0
        L-> 1 : 1/1
        R-> Feature 6, Threshold 28.85625
            L-> Feature 3, Threshold 43.0
                L->
                R->
            R-> Feature 5, Threshold 1.5
                L-> 2 : 58/58
                R->
    R-> Feature 6, Threshold 22.90415
        L-> Feature 6, Threshold 18.62915
            L-> Feature 3, Threshold 36.5
                L->
                R->
            R-> 2 : 7/7
        R-> Feature 3, Threshold 5.5
            L-> Feature 3, Threshold 3.5
                L-> 1 : 2/2
                R-> 2 : 1/1
            R-> 1 : 14/14
R-> Feature 3, Threshold 13.0
    L-> Feature 4, Threshold 2.5
        L-> 2 : 15/15
        R-> 1 : 9/9
    R-> Feature 6, Threshold 26.26875
        L-> Feature 3, Threshold 32.5
            L-> Feature 3, Threshold 28.75
                L->
                R->
            R-> Feature 6, Threshold 7.987500000000001
                L->
                R-> 1 : 43/43
        R-> Feature 3, Threshold 58.0
            L-> Feature 5, Threshold 0.5
                L->
                R->
            R-> 1 : 7/7
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     112     │     28      │
├─────────────┼─────────────┼─────────────┤
│      1      │     23      │     50      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7605633802816901
sum(y .== 0), sum(y .== 1), length(y)
(430, 280, 710)
sum(y .== 0)/length(y)
0.6056338028169014

Always predicting 0 would therefore already achieve about 61% accuracy, so the tree's 76% is only a modest improvement over this majority-class baseline.
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=93544)
fit!(mach, rows=train, verbosity=0)
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     102     │     18      │
├─────────────┼─────────────┼─────────────┤
│      1      │     31      │     62      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7699530516431925
names(X)
6-element Vector{String}: "Pclass" "Sex" "Age" "Sibsp" "Parch" "Fare"
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 6)
DecisionTreeClassifier(
max_depth = 6,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @216
mach2 = machine(model2, X, y)
fit!(mach2, rows=train, verbosity=0)
report(mach2).print_tree(6)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
    L-> Feature 6, Threshold 28.85625
        L-> Feature 6, Threshold 28.23125
            L-> Feature 3, Threshold 22.5
                L-> 2 : 7/7
                R-> Feature 3, Threshold 27.5
                    L-> 2 : 6/9
                    R-> 2 : 21/24
            R-> 1 : 1/1
        R-> Feature 6, Threshold 149.0354
            L-> 2 : 51/51
            R-> Feature 6, Threshold 152.50625000000002
                L-> 1 : 1/1
                R-> 2 : 7/7
    R-> Feature 3, Threshold 38.5
        L-> Feature 6, Threshold 23.25415
            L-> Feature 3, Threshold 32.5
                L-> Feature 5, Threshold 1.5
                    L-> 2 : 27/50
                    R-> 2 : 6/6
                R-> 2 : 6/6
            R-> Feature 5, Threshold 3.5
                L-> Feature 3, Threshold 5.5
                    L-> 1 : 3/4
                    R-> 1 : 9/9
                R-> 2 : 1/1
        R-> 1 : 10/10
R-> Feature 3, Threshold 13.0
    L-> Feature 4, Threshold 2.5
        L-> Feature 5, Threshold 0.5
            L-> Feature 6, Threshold 15.014600000000002
                L-> 2 : 1/1
                R-> 1 : 1/1
            R-> 2 : 17/17
        R-> 1 : 8/8
    R-> Feature 6, Threshold 26.26875
        L-> Feature 3, Threshold 32.5
            L-> Feature 3, Threshold 28.75
                L-> Feature 6, Threshold 7.239599999999999
                    L-> 1 : 12/16
                    R-> 1 : 88/95
                R-> Feature 6, Threshold 7.7625
                    L-> 1 : 4/4
                    R-> 1 : 17/24
            R-> Feature 3, Threshold 43.5
                L-> 1 : 43/43
                R-> Feature 3, Threshold 45.25
                    L-> 2 : 2/4
                    R-> 1 : 25/25
        R-> Feature 3, Threshold 60.5
            L-> Feature 4, Threshold 0.5
                L-> Feature 1, Threshold 2.5
                    L-> 1 : 19/37
                    R-> 2 : 5/5
                R-> Feature 1, Threshold 1.5
                    L-> 1 : 8/14
                    R-> 1 : 8/8
            R-> 1 : 9/9
X[val[1:8],:]
8 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 2 | 1 | 34.0 | 0 | 0 | 13.0 |
| 2 | 3 | 2 | 28.0 | 0 | 0 | 8.05 |
| 3 | 1 | 2 | 41.0 | 0 | 0 | 26.55 |
| 4 | 3 | 2 | 27.0 | 0 | 0 | 7.3125 |
| 5 | 1 | 1 | 2.0 | 1 | 2 | 151.55 |
| 6 | 3 | 1 | 14.0 | 1 | 0 | 11.2417 |
| 7 | 3 | 2 | 18.0 | 0 | 0 | 7.7958 |
| 8 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
predict(mach2, X[val[1:8],:])
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
 UnivariateFinite{OrderedFactor{2}}(0=>0.125, 1=>0.875)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
 UnivariateFinite{OrderedFactor{2}}(0=>0.46, 1=>0.54)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
predict_mode(mach2, X[val[1:8],:])
8-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}: 1 0 0 0 0 1 0 0
ŷ = predict_mode(mach2, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     127     │     20      │
├─────────────┼─────────────┼─────────────┤
│      1      │      6      │     60      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.8779342723004695
describe(X)
6 rows × 7 columns

| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Float64 | Real | Float64 | Real | Int64 | DataType |
| 1 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 2 | Sex | 1.63662 | 1 | 2.0 | 2 | 0 | Int64 |
| 3 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 4 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 5 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 6 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
Use the model to predict survival for people who were not actual passengers.
predict_mode(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.))
1-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}: 0
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.)) # Jack
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
predict(mach2, DataFrame(Pclass=1, Sex=1, Age=20., Sibsp=0, Parch=2, Fare=414.)) # Rose
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=1, Age=2., Sibsp=1, Parch=2, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=62., Sibsp=1, Parch=0, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
One can also consider new features derived from the original ones (a sketch follows the correlation plots below).
using Plots, Statistics
display(scatter(df.Pclass, df.Fare, xlabel=:Pclass, ylabel=:Fare, legend=false))
cor(df.Pclass,df.Fare)
-0.5453318579475805
display(scatter(df.Age, df.Fare, xlabel=:Age, ylabel=:Fare, legend=false))
cor(df.Age,df.Fare)
0.11127730982128829
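For example, the siblings/spouses and parents/children counts could be combined into a single family-size feature. A minimal sketch (the column name FamilySize and the machine mach4 are illustrative, not part of this notebook):

# Derived feature: total number of family members aboard.
X2 = copy(X)
X2.FamilySize = X2.Sibsp .+ X2.Parch
mach4 = machine(model2, X2, y)
fit!(mach4, rows=train, verbosity=0)
accuracy(predict_mode(mach4, X2[val, :]), y[val])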
MLJ.save("my_machine.jlso", mach2)
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-test.csv"))
first(df, 5)
5 rows × 7 columns (omitted printing of 2 columns)

| | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|
| | Int64 | String | String | Int64 | Int64 |
| 1 | 2 | Mr. Moses Aaron Troupiansky | male | 23 | 0 |
| 2 | 2 | Rev. Juozas Montvila | male | 27 | 0 |
| 3 | 3 | Miss. Marguerite Rut Sandstrom | female | 4 | 1 |
| 4 | 1 | Mrs. Thomas Jr (Lily Alexenia Wilson) Potter | female | 56 | 0 |
| 5 | 3 | Mr. Farred Chehab Emir | male | 26 | 0 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | String | Int64 | Int64 | Int64 | Float64 |
| 1 | 2 | male | 23 | 0 | 0 | 13.0 |
| 2 | 2 | male | 27 | 0 | 0 | 13.0 |
| 3 | 3 | female | 4 | 1 | 1 | 16.7 |
| 4 | 1 | female | 56 | 0 | 1 | 83.1583 |
| 5 | 3 | male | 26 | 0 | 0 | 7.225 |
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ String  │ Textual    │
│ Age     │ Int64   │ Count      │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
df.Sex = coerce(df.Sex, OrderedFactor)
df.Sex = coerce(df.Sex, Count)
df.Age = coerce(df.Age, Continuous)
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ Int64   │ Count      │
│ Age     │ Float64 │ Continuous │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
mach3 = machine("my_machine.jlso")
X = df
ŷ = predict_mode(mach3, X)
y = [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1]
y = coerce(y, OrderedFactor)
accuracy(ŷ, y)
0.8
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │      9      │      3      │
├─────────────┼─────────────┼─────────────┤
│      1      │      0      │      3      │
└─────────────┴─────────────┴─────────────┘