Classification in relation to other aspects of machine learning (diagram not shown).

Packages Used: Functions from the following packages are used in this notebook for the first time:
using MLJ, DataFrames, Random
Random.seed!(95117)
n = 8
df = DataFrame(target=rand(0:1,n).>.5, feature1=rand(n), feature2=rand(n))
8 rows × 3 columns
| | target | feature1 | feature2 |
|---|---|---|---|
| | Bool | Float64 | Float64 |
| 1 | 0 | 0.0761072 | 0.987106 |
| 2 | 0 | 0.420093 | 0.741078 |
| 3 | 0 | 0.666973 | 0.0194752 |
| 4 | 1 | 0.0972444 | 0.230083 |
| 5 | 1 | 0.86052 | 0.580439 |
| 6 | 0 | 0.447504 | 0.563352 |
| 7 | 1 | 0.29995 | 0.346412 |
| 8 | 0 | 0.629998 | 0.687563 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ target   │ Bool    │ Count      │
│ feature1 │ Float64 │ Continuous │
│ feature2 │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 8
using Plots
scatter(df.feature1[.!df.target],df.feature2[.!df.target],label="target = false")
scatter!(df.feature1[df.target],df.feature2[df.target],label="target = true")
coerce!(df, :target=>OrderedFactor)
schema(df)
┌──────────┬────────────────────────────────┬──────────────────┐
│ _.names │ _.types │ _.scitypes │
├──────────┼────────────────────────────────┼──────────────────┤
│ target │ CategoricalValue{Bool, UInt32} │ OrderedFactor{2} │
│ feature1 │ Float64 │ Continuous │
│ feature2 │ Float64 │ Continuous │
└──────────┴────────────────────────────────┴──────────────────┘
_.nrows = 8
y = df.target
X = select(df, Not(:target))
first(X, 5)
5 rows × 2 columns
| | feature1 | feature2 |
|---|---|---|
| | Float64 | Float64 |
| 1 | 0.0761072 | 0.987106 |
| 2 | 0.420093 | 0.741078 |
| 3 | 0.666973 | 0.0194752 |
| 4 | 0.0972444 | 0.230083 |
| 5 | 0.86052 | 0.580439 |
models(matching(X, y))
48-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
(name = AdaBoostClassifier, package_name = ScikitLearn, ... )
(name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
(name = BaggingClassifier, package_name = ScikitLearn, ... )
(name = BayesianLDA, package_name = MultivariateStats, ... )
(name = BayesianLDA, package_name = ScikitLearn, ... )
(name = BayesianQDA, package_name = ScikitLearn, ... )
(name = BayesianSubspaceLDA, package_name = MultivariateStats, ... )
(name = ConstantClassifier, package_name = MLJModels, ... )
(name = DecisionTreeClassifier, package_name = BetaML, ... )
(name = DecisionTreeClassifier, package_name = DecisionTree, ... )
(name = DeterministicConstantClassifier, package_name = MLJModels, ... )
(name = DummyClassifier, package_name = ScikitLearn, ... )
(name = EvoTreeClassifier, package_name = EvoTrees, ... )
⋮
(name = RandomForestClassifier, package_name = BetaML, ... )
(name = RandomForestClassifier, package_name = DecisionTree, ... )
(name = RandomForestClassifier, package_name = ScikitLearn, ... )
(name = RidgeCVClassifier, package_name = ScikitLearn, ... )
(name = RidgeClassifier, package_name = ScikitLearn, ... )
(name = SGDClassifier, package_name = ScikitLearn, ... )
(name = SVC, package_name = LIBSVM, ... )
(name = SVMClassifier, package_name = ScikitLearn, ... )
(name = SVMLinearClassifier, package_name = ScikitLearn, ... )
(name = SVMNuClassifier, package_name = ScikitLearn, ... )
(name = SubspaceLDA, package_name = MultivariateStats, ... )
(name = XGBoostClassifier, package_name = XGBoost, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
DecisionTreeClassifier(
max_depth = -1,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @577
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @233 trained 0 times; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
fit!(mach)
┌ Info: Training Machine{DecisionTreeClassifier,…} @233.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
Machine{DecisionTreeClassifier,…} @233 trained 1 time; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree
Leaves: 4
Depth: 3, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
L-> 2 : 2/2
R-> Feature 1, Threshold 0.7637465112420186
L-> 1 : 2/2
R-> 2 : 1/1
R-> 1 : 3/3
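In this printout "Feature 1" and "Feature 2" refer to the columns of X in order (feature1, feature2), and the leaf labels 1 and 2 are indices into the target's levels. A quick check (levels comes from CategoricalArrays and is re-exported by MLJ):

levels(y)  # 2-element vector [false, true], so class 1 = false, class 2 = true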
ŷ = predict_mode(mach, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}:
false
false
false
true
true
false
true
false
[ŷ y]  # predictions alongside ground truth
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
false false
false false
false false
true true
true true
false false
true true
false false
confusion_matrix(ŷ, y)
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ false │ true │
├─────────────┼─────────────┼─────────────┤
│ false │ 5 │ 0 │
├─────────────┼─────────────┼─────────────┤
│ true │ 0 │ 3 │
└─────────────┴─────────────┴─────────────┘
predict_mode(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}:
false
predict(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
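A plain probability can be pulled out of a UnivariateFinite distribution by broadcasting pdf over the predictions (standard MLJ API); a minimal sketch with the same single-row query:

pdf.(predict(mach, DataFrame(feature1=[.5], feature2=[.5])), true)  # P(target = true), here 0.0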
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
L-> 2 : 2/2
R-> Feature 1, Threshold 0.7637465112420186
L-> 1 : 2/2
R-> 2 : 1/1
R-> 1 : 3/3
report(mach).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
L-> 2 : 2/2
R->
R-> 1 : 3/3
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 2)
DecisionTreeClassifier(
max_depth = 2,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @790
mach2 = machine(model2, X, y)
fit!(mach2)
mach2.fitresult
┌ Info: Training Machine{DecisionTreeClassifier,…} @711.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
(Decision Tree
Leaves: 3
Depth: 2, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach2).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
L-> 2 : 2/2
R-> 1 : 2/3
R-> 1 : 3/3
ŷ = predict_mode(mach2, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}:
false
false
false
true
false
false
true
false
[ŷ y]
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
false false
false false
false false
true true
false true
false false
true true
false false
confusion_matrix(ŷ, y)
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ false │ true │
├─────────────┼─────────────┼─────────────┤
│ false │ 5 │ 1 │
├─────────────┼─────────────┼─────────────┤
│ true │ 0 │ 2 │
└─────────────┴─────────────┴─────────────┘
7/8, accuracy(ŷ, y)  # manual count vs built-in measure
(0.875, 0.875)
predict(mach2, X)
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
predict_mode(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}:
false
predict(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
info("DecisionTreeClassifier", pkg="DecisionTree")
CART decision tree classifier.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeClassifier pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeClassifier` for documentation.
(name = "DecisionTreeClassifier",
 package_name = "DecisionTree",
 is_supervised = true,
 abstract_type = Probabilistic,
 deep_properties = (),
 docstring = "CART decision tree classifier.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeClassifier pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeClassifier` for documentation.",
 fit_data_scitype = Tuple{Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous}, AbstractVector{_s41} where _s41<:Finite},
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing),
 hyperparameter_types = ("Int64", "Int64", "Int64", "Float64", "Int64", "Bool", "Float64", "Float64", "Int64", "Union{Integer, Random.AbstractRNG}"),
 hyperparameters = (:max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune, :merge_purity_threshold, :pdf_smoothing, :display_depth, :rng),
 implemented_methods = [:clean!, :fit, :fitted_params, :predict],
 inverse_transform_scitype = Unknown,
 is_pure_julia = true,
 is_wrapper = false,
 iteration_parameter = nothing,
 load_path = "MLJDecisionTreeInterface.DecisionTreeClassifier",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 predict_scitype = AbstractVector{ScientificTypesBase.Density{_s25} where _s25<:Finite},
 prediction_type = :probabilistic,
 supports_class_weights = false,
 supports_online = false,
 supports_training_losses = false,
 supports_weights = false,
 transform_scitype = Unknown,
 input_scitype = Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous},
 target_scitype = AbstractVector{_s41} where _s41<:Finite,
 output_scitype = Unknown,)
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
L-> 2 : 2/2
R-> Feature 1, Threshold 0.7637465112420186
L-> 1 : 2/2
R-> 2 : 1/1
R-> 1 : 3/3
t, f2 = df.target, df.feature2
idx = sortperm(f2)
t, f2 = t[idx], f2[idx]
s = 0.63  # trial split threshold on feature2
t[f2 .<= s]
5-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}:
false
true
true
false
true
1 - (2/5)^2 - (3/5)^2
0.48
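This is the Gini impurity of the left partition: for class proportions $p_k$ in a node $T$,

$$G(T) = 1 - \sum_k p_k^2,$$

which is 0 for a pure node and largest when the classes are evenly mixed. The function below implements exactly this.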
impurity(t) = 1 - sum(([sum(t .== i) for i in unique(t)]./length(t)).^2)  # Gini impurity
impurity(t[f2 .<= s])
0.48
impurity(t[f2 .> s])
0.0
f2[1:end-1] .+ diff(f2)/2
7-element Vector{Float64}:
0.12477885945422329
0.2882471857031962
0.45488184432864687
0.5718955812012008
0.6340010854678377
0.7143205625010004
0.8640919332503664
split_cont(f) = f[1:end-1] .+ diff(f)/2  # candidate thresholds: midpoints between consecutive sorted values
split_cont(f2)
7-element Vector{Float64}:
0.12477885945422329
0.2882471857031962
0.45488184432864687
0.5718955812012008
0.6340010854678377
0.7143205625010004
0.8640919332503664
impurity(t)
0.46875
# IG = impurity(T) - impurity(TL)*nTL/nT - impurity(TR)*nTR/nT
IG = 0.46875 - 0.48*5/8 - 0.0*3/8
0.16875
IG = impurity(t) - impurity(t[f2 .<= s])*5/8 - impurity(t[f2 .> s])*3/8
0.16875
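In the same notation, if a split $s$ sends $n_L$ of the node's $n$ samples left and $n_R$ right,

$$IG(T, s) = G(T) - \frac{n_L}{n}\,G(T_L) - \frac{n_R}{n}\,G(T_R),$$

which is what the comment above spells out with $n = 8$, $n_L = 5$, $n_R = 3$.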
function infogain(s,f,t)
    is = f .<= s  # samples with feature value ≤ s go to the left branch
    # gain = parent impurity minus size-weighted impurities of the two children
    IG = impurity(t) - impurity(t[is])*length(f[is])/length(f) -
         impurity(t[.!is])*length(f[.!is])/length(f)
end
infogain (generic function with 1 method)
df
8 rows × 3 columns
| | target | feature1 | feature2 |
|---|---|---|---|
| | Cat… | Float64 | Float64 |
| 1 | false | 0.0761072 | 0.987106 |
| 2 | false | 0.420093 | 0.741078 |
| 3 | false | 0.666973 | 0.0194752 |
| 4 | true | 0.0972444 | 0.230083 |
| 5 | true | 0.86052 | 0.580439 |
| 6 | false | 0.447504 | 0.563352 |
| 7 | true | 0.29995 | 0.346412 |
| 8 | false | 0.629998 | 0.687563 |
T = Matrix(df[!,[2,3,1]])  # columns reordered to feature1, feature2, target (coerced to Float64)
8×3 Matrix{Float64}:
0.0761072 0.987106 0.0
0.420093 0.741078 0.0
0.666973 0.0194752 0.0
0.0972444 0.230083 1.0
0.86052 0.580439 1.0
0.447504 0.563352 0.0
0.29995 0.346412 1.0
0.629998 0.687563 0.0
idx1 = sortperm(T[:,1])
8-element Vector{Int64}:
1
4
7
2
6
8
3
5
s1 = split_cont(T[idx1,1])
7-element Vector{Float64}:
0.08667578253830954
0.19859718643090984
0.3600215998051657
0.4337986380036255
0.5387510783253423
0.6484857689281089
0.7637465112420186
[infogain(i,T[idx1,1],T[idx1,end]) for i in s1]
7-element Vector{Float64}:
0.04017857142857134
0.010416666666666685
0.10208333333333347
0.03125
0.0020833333333333537
0.010416666666666685
0.11160714285714285
idx2 = sortperm(T[:,2])
s2 = split_cont(T[idx2,2])
7-element Vector{Float64}:
0.12477885945422329
0.2882471857031962
0.45488184432864687
0.5718955812012008
0.6340010854678377
0.7143205625010004
0.8640919332503664
[infogain(i,T[idx2,2],T[idx2,end]) for i in s2]
7-element Vector{Float64}:
0.04017857142857134
0.010416666666666685
0.10208333333333347
0.03125
0.16875
0.09375
0.04017857142857134
argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])
5
s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
0.6340010854678377
isL = T[idx2,2] .<= s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
8-element BitVector:
 1
 1
 1
 1
 1
 0
 0
 0
TL = T[idx2[isL],:]
5×3 Matrix{Float64}:
0.666973 0.0194752 0.0
0.0972444 0.230083 1.0
0.29995 0.346412 1.0
0.447504 0.563352 0.0
0.86052 0.580439 1.0
impurity(TL[:,end])
0.48
TR = T[idx2[.!isL],:]
3×3 Matrix{Float64}:
0.629998 0.687563 0.0
0.420093 0.741078 0.0
0.0761072 0.987106 0.0
impurity(TR[:,end])
0.0
idx1 = sortperm(TL[:,1])
s1 = split_cont(TL[idx1,1])
[infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1]
4-element Vector{Float64}:
0.07999999999999996
0.21333333333333332
0.013333333333333308
0.07999999999999996
idx2 = sortperm(TL[:,2])
s2 = split_cont(TL[idx2,2])
[infogain(i,TL[idx2,2],TL[idx2,end]) for i in s2]
4-element Vector{Float64}:
0.18
0.013333333333333308
0.013333333333333308
0.07999999999999996
s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
0.3737270065061601
isLL = TL[idx1,1] .<= s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
5-element BitVector:
 1
 1
 0
 0
 0
TLL = TL[idx1[isLL],:]
2×3 Matrix{Float64}:
0.0972444 0.230083 1.0
0.29995 0.346412 1.0
impurity(TLL[:,end])
0.0
TLR = TL[idx1[.!isLL],:]
3×3 Matrix{Float64}:
0.447504 0.563352 0.0
0.666973 0.0194752 0.0
0.86052 0.580439 1.0
impurity(TLR[:,end])
0.4444444444444444
idx1 = sortperm(TLR[:,1])
s1 = split_cont(TLR[idx1,1])
[infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1]
2-element Vector{Float64}:
0.1111111111111111
0.4444444444444444
idx2 = sortperm(TLR[:,2])
s2 = split_cont(TLR[idx2,2])
[infogain(i,TLR[idx2,2],TLR[idx2,end]) for i in s2]
2-element Vector{Float64}:
0.1111111111111111
0.4444444444444444
s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
0.7637465112420186
isLLL = TLR[idx1,1] .<= s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
3-element BitVector:
 1
 1
 0
TLLL = TLR[idx1[isLLL],:]
2×3 Matrix{Float64}:
0.447504 0.563352 0.0
0.666973 0.0194752 0.0
impurity(TLLL[:,end])
0.0
TLRR = TLR[idx1[.!isLLL],:]
1×3 Matrix{Float64}:
0.86052 0.580439 1.0
impurity(TLRR[:,end])
0.0
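Every node in the walkthrough above repeats the same search: sort by a feature, form the candidate thresholds, and keep the one with maximal information gain. A small helper (a sketch, not part of the original notebook, reusing impurity, split_cont and infogain from above) condenses the pattern:

# Best threshold for one feature: scan all candidate splits, return the winner.
function best_split(f, t)
    idx = sortperm(f)                         # sort rows by the feature
    fs, ts = f[idx], t[idx]
    ss = split_cont(fs)                       # candidate thresholds (midpoints)
    gains = [infogain(s, fs, ts) for s in ss]
    k = argmax(gains)
    (threshold = ss[k], gain = gains[k])
end

best_split(T[:,2], T[:,end])  # (threshold = 0.6340010854678377, gain = 0.16875)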
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-train.csv"))
first(df, 5)
5 rows × 8 columns (omitted printing of 2 columns)
| | Survived | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | String | Float64 | Int64 |
| 1 | 1 | 1 | Mr. Dickinson H Bishop | male | 25.0 | 1 |
| 2 | 0 | 3 | Miss. Aloisia Haas | female | 24.0 | 0 |
| 3 | 0 | 3 | Mr. William Alfred Brocklebank | male | 35.0 | 0 |
| 4 | 0 | 3 | Mr. Samuel Beard Risien | male | 69.0 | 0 |
| 5 | 1 | 1 | Mr. Harry Anderson | male | 48.0 | 0 |
describe(df)
8 rows × 7 columns (omitted printing of 2 columns)
| | variable | mean | min | median | max |
|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 |
| 3 | Name | | Capt. Edward Gifford Crosby | | Rev. Thomas Roussel Davids Byles |
| 4 | Sex | | female | | male |
| 5 | Age | 29.4592 | 0.42 | 28.0 | 74.0 |
| 6 | Siblings/Spouses Aboard | 0.512676 | 0 | 0.0 | 8 |
| 7 | Parents/Children Aboard | 0.394366 | 0 | 0.0 | 6 |
| 8 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 7 columns
| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | male | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | female | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | male | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | male | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | male | 48.0 | 0 | 0 | 26.55 |
describe(df)
7 rows × 7 columns
| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any | Int64 | DataType |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 | 0 | Int64 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 3 | Sex | | female | | male | 0 | String |
| 4 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 5 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 6 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 7 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ Survived │ Int64   │ Count      │
│ Pclass   │ Int64   │ Count      │
│ Sex      │ String  │ Textual    │
│ Age      │ Float64 │ Continuous │
│ Sibsp    │ Int64   │ Count      │
│ Parch    │ Int64   │ Count      │
│ Fare     │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 710
subtypes(Finite)
2-element Vector{Any}:
Multiclass
OrderedFactor
subtypes(Infinite)
2-element Vector{Any}:
Continuous
Count
df.Survived = coerce(df.Survived, OrderedFactor)
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names │ _.types │ _.scitypes │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass │ Int64 │ Count │
│ Sex │ String │ Textual │
│ Age │ Float64 │ Continuous │
│ Sibsp │ Int64 │ Count │
│ Parch │ Int64 │ Count │
│ Fare │ Float64 │ Continuous │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
df.Sex = coerce(df.Sex, OrderedFactor)  # "female" < "male" → ordered levels 1 and 2
df.Sex = coerce(df.Sex, Count)          # then to integers: female = 1, male = 2
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names │ _.types │ _.scitypes │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass │ Int64 │ Count │
│ Sex │ Int64 │ Count │
│ Age │ Float64 │ Continuous │
│ Sibsp │ Int64 │ Count │
│ Parch │ Int64 │ Count │
│ Fare │ Float64 │ Continuous │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
first(df, 5)
5 rows × 7 columns
| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Cat… | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
y = df.Survived
X = select(df, Not(:Survived))
first(X, 5)
5 rows × 6 columns
| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
scitype(y)
AbstractVector{OrderedFactor{2}} (alias for AbstractArray{OrderedFactor{2}, 1})
models(matching(X, y))
11-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
(name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
(name = ConstantClassifier, package_name = MLJModels, ... )
(name = DecisionTreeClassifier, package_name = BetaML, ... )
(name = DecisionTreeClassifier, package_name = DecisionTree, ... )
(name = DeterministicConstantClassifier, package_name = MLJModels, ... )
(name = KernelPerceptronClassifier, package_name = BetaML, ... )
(name = PegasosClassifier, package_name = BetaML, ... )
(name = PerceptronClassifier, package_name = BetaML, ... )
(name = RandomForestClassifier, package_name = BetaML, ... )
(name = RandomForestClassifier, package_name = DecisionTree, ... )
(name = RandomForestClassifier, package_name = ScikitLearn, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @647 trained 0 times; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=7245)  # 70/30 split of row indices
([101, 70, 656, 623, 103, 575, 576, 115, 506, 188 … 328, 123, 321, 318, 297, 201, 41, 184, 22, 592], [521, 77, 180, 290, 265, 86, 198, 31, 134, 209 … 79, 176, 531, 603, 606, 509, 447, 490, 478, 266])
fit!(mach, rows=train, verbosity=0)
Machine{DecisionTreeClassifier,…} @647 trained 1 time; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree
Leaves: 104
Depth: 18, CategoricalArrays.CategoricalValue{Int64, UInt32}[0, 1], UInt32[0x00000001, 0x00000002])
names(X)
6-element Vector{String}:
"Pclass"
"Sex"
"Age"
"Sibsp"
"Parch"
"Fare"
report(mach).print_tree(5)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
L-> Feature 3, Threshold 3.0
L-> 1 : 1/1
R-> Feature 6, Threshold 28.85625
L-> Feature 3, Threshold 43.0
L->
R->
R-> Feature 5, Threshold 1.5
L-> 2 : 58/58
R->
R-> Feature 6, Threshold 22.90415
L-> Feature 6, Threshold 18.62915
L-> Feature 3, Threshold 36.5
L->
R->
R-> 2 : 7/7
R-> Feature 3, Threshold 5.5
L-> Feature 3, Threshold 3.5
L-> 1 : 2/2
R-> 2 : 1/1
R-> 1 : 14/14
R-> Feature 3, Threshold 13.0
L-> Feature 4, Threshold 2.5
L-> 2 : 15/15
R-> 1 : 9/9
R-> Feature 6, Threshold 26.26875
L-> Feature 3, Threshold 32.5
L-> Feature 3, Threshold 28.75
L->
R->
R-> Feature 6, Threshold 7.987500000000001
L->
R-> 1 : 43/43
R-> Feature 3, Threshold 58.0
L-> Feature 5, Threshold 0.5
L->
R->
R-> 1 : 7/7
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ 0 │ 1 │
├─────────────┼─────────────┼─────────────┤
│ 0 │ 112 │ 28 │
├─────────────┼─────────────┼─────────────┤
│ 1 │ 23 │ 50 │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7605633802816901
sum(y .== 0), sum(y .== 1), length(y)
(430, 280, 710)
sum(y .== 0)/length(y)
0.6056338028169014
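So about 61% of passengers did not survive; any useful classifier should beat this majority-class rate. For reference, MLJ's ConstantClassifier (listed among the matching models above) always predicts the majority class; a minimal sketch on the same split:

ConstantClf = @load ConstantClassifier pkg=MLJModels verbosity=0
baseline = machine(ConstantClf(), X, y)
fit!(baseline, rows=train, verbosity=0)
accuracy(predict_mode(baseline, X[val,:]), y[val])  # ≈ 0.6, the majority-class rate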
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=93544)
fit!(mach, rows=train, verbosity=0)
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ 0 │ 1 │
├─────────────┼─────────────┼─────────────┤
│ 0 │ 102 │ 18 │
├─────────────┼─────────────┼─────────────┤
│ 1 │ 31 │ 62 │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7699530516431925
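The score clearly moves with the random split. Rather than re-shuffling holdout sets by hand, MLJ can cross-validate with evaluate!; a minimal sketch (the rng value here is arbitrary):

evaluate!(mach, resampling=CV(nfolds=6, shuffle=true, rng=42),
          measure=accuracy, verbosity=0)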
names(X)
6-element Vector{String}:
"Pclass"
"Sex"
"Age"
"Sibsp"
"Parch"
"Fare"
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 6)
DecisionTreeClassifier(
max_depth = 6,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @216
mach2 = machine(model2, X, y)
fit!(mach2, rows=train, verbosity=0)
report(mach2).print_tree(6)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
L-> Feature 6, Threshold 28.85625
L-> Feature 6, Threshold 28.23125
L-> Feature 3, Threshold 22.5
L-> 2 : 7/7
R-> Feature 3, Threshold 27.5
L-> 2 : 6/9
R-> 2 : 21/24
R-> 1 : 1/1
R-> Feature 6, Threshold 149.0354
L-> 2 : 51/51
R-> Feature 6, Threshold 152.50625000000002
L-> 1 : 1/1
R-> 2 : 7/7
R-> Feature 3, Threshold 38.5
L-> Feature 6, Threshold 23.25415
L-> Feature 3, Threshold 32.5
L-> Feature 5, Threshold 1.5
L-> 2 : 27/50
R-> 2 : 6/6
R-> 2 : 6/6
R-> Feature 5, Threshold 3.5
L-> Feature 3, Threshold 5.5
L-> 1 : 3/4
R-> 1 : 9/9
R-> 2 : 1/1
R-> 1 : 10/10
R-> Feature 3, Threshold 13.0
L-> Feature 4, Threshold 2.5
L-> Feature 5, Threshold 0.5
L-> Feature 6, Threshold 15.014600000000002
L-> 2 : 1/1
R-> 1 : 1/1
R-> 2 : 17/17
R-> 1 : 8/8
R-> Feature 6, Threshold 26.26875
L-> Feature 3, Threshold 32.5
L-> Feature 3, Threshold 28.75
L-> Feature 6, Threshold 7.239599999999999
L-> 1 : 12/16
R-> 1 : 88/95
R-> Feature 6, Threshold 7.7625
L-> 1 : 4/4
R-> 1 : 17/24
R-> Feature 3, Threshold 43.5
L-> 1 : 43/43
R-> Feature 3, Threshold 45.25
L-> 2 : 2/4
R-> 1 : 25/25
R-> Feature 3, Threshold 60.5
L-> Feature 4, Threshold 0.5
L-> Feature 1, Threshold 2.5
L-> 1 : 19/37
R-> 2 : 5/5
R-> Feature 1, Threshold 1.5
L-> 1 : 8/14
R-> 1 : 8/8
R-> 1 : 9/9
X[val[1:8],:]
8 rows × 6 columns
| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 2 | 1 | 34.0 | 0 | 0 | 13.0 |
| 2 | 3 | 2 | 28.0 | 0 | 0 | 8.05 |
| 3 | 1 | 2 | 41.0 | 0 | 0 | 26.55 |
| 4 | 3 | 2 | 27.0 | 0 | 0 | 7.3125 |
| 5 | 1 | 1 | 2.0 | 1 | 2 | 151.55 |
| 6 | 3 | 1 | 14.0 | 1 | 0 | 11.2417 |
| 7 | 3 | 2 | 18.0 | 0 | 0 | 7.7958 |
| 8 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
predict(mach2, X[val[1:8],:])
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(0=>0.125, 1=>0.875)
UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
UnivariateFinite{OrderedFactor{2}}(0=>0.46, 1=>0.54)
UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
predict_mode(mach2, X[val[1:8],:])
8-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}:
1
0
0
0
0
1
0
0
ŷ = predict_mode(mach2, X[val,:])
confusion_matrix(ŷ, y[val])
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ 0 │ 1 │
├─────────────┼─────────────┼─────────────┤
│ 0 │ 127 │ 20 │
├─────────────┼─────────────┼─────────────┤
│ 1 │ 6 │ 60 │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.8779342723004695
describe(X)
6 rows × 7 columns
| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Float64 | Real | Float64 | Real | Int64 | DataType |
| 1 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 2 | Sex | 1.63662 | 1 | 2.0 | 2 | 0 | Int64 |
| 3 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 4 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 5 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 6 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
Use the model to predict survival for people who were not actual passengers.
predict_mode(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.))
1-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}:
0
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.)) # Jack
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
predict(mach2, DataFrame(Pclass=1, Sex=1, Age=20., Sibsp=0, Parch=2, Fare=414.)) # Rose
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=1, Age=2., Sibsp=1, Parch=2, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=62., Sibsp=1, Parch=0, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
We can also consider adding new features derived from the original ones, as sketched below.
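For example, a family-size feature (hypothetical, not part of this notebook) could combine Sibsp and Parch into a single count:

X2 = copy(X)
X2.FamilySize = X2.Sibsp .+ X2.Parch .+ 1  # the passenger plus all accompanying relatives
first(X2, 3)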
using Plots, Statistics
display(scatter(df.Pclass, df.Fare, xlabel=:Pclass, ylabel=:Fare, legend=false))
cor(df.Pclass,df.Fare)
-0.5453318579475805
display(scatter(df.Age, df.Fare, xlabel=:Age, ylabel=:Fare, legend=false))
cor(df.Age,df.Fare)
0.11127730982128829
MLJ.save("my_machine.jlso", mach2)
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-test.csv"))
first(df, 5)
5 rows × 7 columns (omitted printing of 2 columns)
| | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|
| | Int64 | String | String | Int64 | Int64 |
| 1 | 2 | Mr. Moses Aaron Troupiansky | male | 23 | 0 |
| 2 | 2 | Rev. Juozas Montvila | male | 27 | 0 |
| 3 | 3 | Miss. Marguerite Rut Sandstrom | female | 4 | 1 |
| 4 | 1 | Mrs. Thomas Jr (Lily Alexenia Wilson) Potter | female | 56 | 0 |
| 5 | 3 | Mr. Farred Chehab Emir | male | 26 | 0 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 6 columns
| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | String | Int64 | Int64 | Int64 | Float64 |
| 1 | 2 | male | 23 | 0 | 0 | 13.0 |
| 2 | 2 | male | 27 | 0 | 0 | 13.0 |
| 3 | 3 | female | 4 | 1 | 1 | 16.7 |
| 4 | 1 | female | 56 | 0 | 1 | 83.1583 |
| 5 | 3 | male | 26 | 0 | 0 | 7.225 |
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ String  │ Textual    │
│ Age     │ Int64   │ Count      │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
df.Sex = coerce(df.Sex, OrderedFactor)  # same encoding as in training: female = 1, male = 2
df.Sex = coerce(df.Sex, Count)
df.Age = coerce(df.Age, Continuous)
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ Int64   │ Count      │
│ Age     │ Float64 │ Continuous │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
mach3 = machine("my_machine.jlso")
X = df
ŷ = predict_mode(mach3, X)
y = [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1]
y = coerce(y, OrderedFactor)
accuracy(ŷ, y)
0.8
confusion_matrix(ŷ, y)
┌───────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┤
│ Predicted │ 0 │ 1 │
├─────────────┼─────────────┼─────────────┤
│ 0 │ 9 │ 3 │
├─────────────┼─────────────┼─────────────┤
│ 1 │ 0 │ 3 │
└─────────────┴─────────────┴─────────────┘
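Beyond accuracy, other standard measures read off the same predictions; with an OrderedFactor{2} target MLJ treats the last level (here 1 = survived) as the positive class:

recall(ŷ, y), f1score(ŷ, y)  # sensitivity and F1 score for the "survived" class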