Packages Used: Functions from the following packages are used in this notebook for the first time: MLJ, DataFrames, Random, Plots, CSV, and Statistics.
Classification in relation to other aspects of machine learning:
using MLJ, DataFrames, Random
Random.seed!(95117)
n = 8
df = DataFrame(target=rand(0:1,n).>.5, feature1=rand(n), feature2=rand(n))
8 rows × 3 columns

| | target | feature1 | feature2 |
|---|---|---|---|
| | Bool | Float64 | Float64 |
| 1 | 0 | 0.0761072 | 0.987106 |
| 2 | 0 | 0.420093 | 0.741078 |
| 3 | 0 | 0.666973 | 0.0194752 |
| 4 | 1 | 0.0972444 | 0.230083 |
| 5 | 1 | 0.86052 | 0.580439 |
| 6 | 0 | 0.447504 | 0.563352 |
| 7 | 1 | 0.29995 | 0.346412 |
| 8 | 0 | 0.629998 | 0.687563 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ target   │ Bool    │ Count      │
│ feature1 │ Float64 │ Continuous │
│ feature2 │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 8
using Plots
scatter(df.feature1[.!df.target], df.feature2[.!df.target], label="target = false")
scatter!(df.feature1[df.target], df.feature2[df.target], label="target = true")
coerce!(df, :target=>OrderedFactor)
schema(df)
┌──────────┬────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                        │ _.scitypes       │
├──────────┼────────────────────────────────┼──────────────────┤
│ target   │ CategoricalValue{Bool, UInt32} │ OrderedFactor{2} │
│ feature1 │ Float64                        │ Continuous       │
│ feature2 │ Float64                        │ Continuous       │
└──────────┴────────────────────────────────┴──────────────────┘
_.nrows = 8
y = df.target
X = select(df, Not(:target))
first(X, 5)
5 rows × 2 columns

| | feature1 | feature2 |
|---|---|---|
| | Float64 | Float64 |
| 1 | 0.0761072 | 0.987106 |
| 2 | 0.420093 | 0.741078 |
| 3 | 0.666973 | 0.0194752 |
| 4 | 0.0972444 | 0.230083 |
| 5 | 0.86052 | 0.580439 |
models(matching(X, y))
48-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
 (name = AdaBoostClassifier, package_name = ScikitLearn, ... )
 (name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
 (name = BaggingClassifier, package_name = ScikitLearn, ... )
 (name = BayesianLDA, package_name = MultivariateStats, ... )
 (name = BayesianLDA, package_name = ScikitLearn, ... )
 (name = BayesianQDA, package_name = ScikitLearn, ... )
 (name = BayesianSubspaceLDA, package_name = MultivariateStats, ... )
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 (name = DecisionTreeClassifier, package_name = DecisionTree, ... )
 (name = DeterministicConstantClassifier, package_name = MLJModels, ... )
 (name = DummyClassifier, package_name = ScikitLearn, ... )
 (name = EvoTreeClassifier, package_name = EvoTrees, ... )
 ⋮
 (name = RandomForestClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = DecisionTree, ... )
 (name = RandomForestClassifier, package_name = ScikitLearn, ... )
 (name = RidgeCVClassifier, package_name = ScikitLearn, ... )
 (name = RidgeClassifier, package_name = ScikitLearn, ... )
 (name = SGDClassifier, package_name = ScikitLearn, ... )
 (name = SVC, package_name = LIBSVM, ... )
 (name = SVMClassifier, package_name = ScikitLearn, ... )
 (name = SVMLinearClassifier, package_name = ScikitLearn, ... )
 (name = SVMNuClassifier, package_name = ScikitLearn, ... )
 (name = SubspaceLDA, package_name = MultivariateStats, ... )
 (name = XGBoostClassifier, package_name = XGBoost, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
DecisionTreeClassifier(
max_depth = -1,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @577
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @233 trained 0 times; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
fit!(mach)
┌ Info: Training Machine{DecisionTreeClassifier,…} @233.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
Machine{DecisionTreeClassifier,…} @233 trained 1 time; caches data
  args:
    1: Source @499 ⏎ `Table{AbstractVector{Continuous}}`
    2: Source @806 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree Leaves: 4 Depth: 3, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
ŷ = predict_mode(mach, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false false false true true false true false
[ŷ y]
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
 false  false
 false  false
 false  false
 true   true
 true   true
 false  false
 true   true
 false  false
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │    false    │    true     │
├─────────────┼─────────────┼─────────────┤
│    false    │      5      │      0      │
├─────────────┼─────────────┼─────────────┤
│    true     │      0      │      3      │
└─────────────┴─────────────┴─────────────┘
predict_mode(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false
predict(mach, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
report(mach).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R->
R-> 1 : 3/3
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 2)
DecisionTreeClassifier(
max_depth = 2,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @790
mach2 = machine(model2, X, y)
fit!(mach2)
mach2.fitresult
┌ Info: Training Machine{DecisionTreeClassifier,…} @711.
└ @ MLJBase C:\Users\kay\.julia\packages\MLJBase\xlh6G\src\machines.jl:390
(Decision Tree Leaves: 3 Depth: 2, CategoricalArrays.CategoricalValue{Bool, UInt32}[false, true], UInt32[0x00000001, 0x00000002])
report(mach2).print_tree(2)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> 1 : 2/3
R-> 1 : 3/3
ŷ = predict_mode(mach2, X)
8-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false false false true false false true false
[ŷ y]
8×2 CategoricalArrays.CategoricalArray{Bool,2,UInt32}:
 false  false
 false  false
 false  false
 true   true
 false  true
 false  false
 true   true
 false  false
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │    false    │    true     │
├─────────────┼─────────────┼─────────────┤
│    false    │      5      │      1      │
├─────────────┼─────────────┼─────────────┤
│    true     │      0      │      2      │
└─────────────┴─────────────┴─────────────┘
7/8, accuracy(ŷ, y)
(0.875, 0.875)
predict(mach2, X)
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}:
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
 UnivariateFinite{OrderedFactor{2}}(false=>0.0, true=>1.0)
 UnivariateFinite{OrderedFactor{2}}(false=>1.0, true=>0.0)
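Because the model is probabilistic, predict_mode amounts to thresholding these distributions at probability 0.5. The probabilities can also be extracted and thresholded explicitly; a minimal sketch using MLJ's pdf (the 0.5 cutoff here is an arbitrary choice for illustration):

# Probability assigned to class `true` for each sample, then a manual cutoff.
p_true = pdf.(predict(mach2, X), true)
ŷ_manual = p_true .>= 0.5   # agrees with predict_mode up to tie-breaking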
predict_mode(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false
predict(mach2, DataFrame(feature1=[.5], feature2=[.5]))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Bool, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(false=>0.667, true=>0.333)
info("DecisionTreeClassifier", pkg="DecisionTree")
CART decision tree classifier.
→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).
→ do `@load DecisionTreeClassifier pkg="DecisionTree"` to use the model.
→ do `?DecisionTreeClassifier` for documentation.
(name = "DecisionTreeClassifier",
 package_name = "DecisionTree",
 is_supervised = true,
 abstract_type = Probabilistic,
 deep_properties = (),
 docstring = "CART decision tree classifier.\n→ based on [DecisionTree](https://github.com/bensadeghi/DecisionTree.jl).\n→ do `@load DecisionTreeClassifier pkg=\"DecisionTree\"` to use the model.\n→ do `?DecisionTreeClassifier` for documentation.",
 fit_data_scitype = Tuple{Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous}, AbstractVector{_s41} where _s41<:Finite},
 hyperparameter_ranges = (nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing),
 hyperparameter_types = ("Int64", "Int64", "Int64", "Float64", "Int64", "Bool", "Float64", "Float64", "Int64", "Union{Integer, Random.AbstractRNG}"),
 hyperparameters = (:max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :n_subfeatures, :post_prune, :merge_purity_threshold, :pdf_smoothing, :display_depth, :rng),
 implemented_methods = [:clean!, :fit, :fitted_params, :predict],
 inverse_transform_scitype = Unknown,
 is_pure_julia = true,
 is_wrapper = false,
 iteration_parameter = nothing,
 load_path = "MLJDecisionTreeInterface.DecisionTreeClassifier",
 package_license = "MIT",
 package_url = "https://github.com/bensadeghi/DecisionTree.jl",
 package_uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb",
 predict_scitype = AbstractVector{ScientificTypesBase.Density{_s25} where _s25<:Finite},
 prediction_type = :probabilistic,
 supports_class_weights = false,
 supports_online = false,
 supports_training_losses = false,
 supports_weights = false,
 transform_scitype = Unknown,
 input_scitype = Table{_s48} where _s48<:Union{AbstractVector{_s47} where _s47<:Count, AbstractVector{_s47} where _s47<:OrderedFactor, AbstractVector{_s47} where _s47<:Continuous},
 target_scitype = AbstractVector{_s41} where _s41<:Finite,
 output_scitype = Unknown,)
report(mach).print_tree(3)
Feature 2, Threshold 0.6340010854678377
L-> Feature 1, Threshold 0.3737270065061601
    L-> 2 : 2/2
    R-> Feature 1, Threshold 0.7637465112420186
        L-> 1 : 2/2
        R-> 2 : 1/1
R-> 1 : 3/3
t, f2 = df.target, df.feature2
idx = sortperm(f2)
t, f2 = t[idx], f2[idx]
s = 0.63
t[f2 .<= s]
5-element CategoricalArrays.CategoricalArray{Bool,1,UInt32}: false true true false true
1 - (2/5)^2 - (3/5)^2
0.48
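This is the Gini impurity: with $p_k$ the fraction of samples of class $k$ in a node $T$,

$$G(T) = 1 - \sum_k p_k^2 .$$

For the five samples left of the split (2 false, 3 true), $G = 1 - (2/5)^2 - (3/5)^2 = 0.48$, matching the value above. The helper below computes this for any label vector.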
impurity(t) = 1 - sum(([sum(t .== i) for i in unique(t)]./length(t)).^2)   # Gini impurity of label vector t
impurity(t[f2 .<= s])
0.48
impurity(t[f2 .> s])
0.0
f2[1:end-1] .+ diff(f2)/2
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
split_cont(f) = f[1:end-1] .+ diff(f)/2   # candidate thresholds: midpoints of consecutive sorted values
split_cont(f2)
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
impurity(t)
0.46875
# IG = impurity(T) - impurity(TL)*nTL/nT - impurity(TR)*nTR/nT
IG = 0.46875 - 0.48*5/8 - 0.0*3/8
0.16875
IG = impurity(t) - impurity(t[f2 .<= s])*5/8 - impurity(t[f2 .> s])*3/8
0.16875
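Equivalently, for a split $s$ that sends $n_L$ of the node's $n$ samples to the left child $T_L$ and $n_R$ to the right child $T_R$, the information gain is

$$IG(s) = G(T) - \frac{n_L}{n}\,G(T_L) - \frac{n_R}{n}\,G(T_R) = 0.46875 - \frac{5}{8}\cdot 0.48 - \frac{3}{8}\cdot 0.0 = 0.16875 .$$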
function infogain(s, f, t)
    is = f .<= s   # samples sent to the left child
    IG = impurity(t) - impurity(t[is])*length(f[is])/length(f) -
         impurity(t[.!is])*length(f[.!is])/length(f)
end
infogain (generic function with 1 method)
df
8 rows × 3 columns

| | target | feature1 | feature2 |
|---|---|---|---|
| | Cat… | Float64 | Float64 |
| 1 | false | 0.0761072 | 0.987106 |
| 2 | false | 0.420093 | 0.741078 |
| 3 | false | 0.666973 | 0.0194752 |
| 4 | true | 0.0972444 | 0.230083 |
| 5 | true | 0.86052 | 0.580439 |
| 6 | false | 0.447504 | 0.563352 |
| 7 | true | 0.29995 | 0.346412 |
| 8 | false | 0.629998 | 0.687563 |
T = Matrix(df[!,[2,3,1]])   # feature1, feature2, target (as 0.0/1.0) in the last column
8×3 Matrix{Float64}:
 0.0761072  0.987106   0.0
 0.420093   0.741078   0.0
 0.666973   0.0194752  0.0
 0.0972444  0.230083   1.0
 0.86052    0.580439   1.0
 0.447504   0.563352   0.0
 0.29995    0.346412   1.0
 0.629998   0.687563   0.0
idx1 = sortperm(T[:,1])
8-element Vector{Int64}: 1 4 7 2 6 8 3 5
s1 = split_cont(T[idx1,1])
7-element Vector{Float64}: 0.08667578253830954 0.19859718643090984 0.3600215998051657 0.4337986380036255 0.5387510783253423 0.6484857689281089 0.7637465112420186
[infogain(i,T[idx1,1],T[idx1,end]) for i in s1]
7-element Vector{Float64}: 0.04017857142857134 0.010416666666666685 0.10208333333333347 0.03125 0.0020833333333333537 0.010416666666666685 0.11160714285714285
idx2 = sortperm(T[:,2])
s2 = split_cont(T[idx2,2])
7-element Vector{Float64}: 0.12477885945422329 0.2882471857031962 0.45488184432864687 0.5718955812012008 0.6340010854678377 0.7143205625010004 0.8640919332503664
[infogain(i,T[idx2,2],T[idx2,end]) for i in s2]
7-element Vector{Float64}: 0.04017857142857134 0.010416666666666685 0.10208333333333347 0.03125 0.16875 0.09375 0.04017857142857134
argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])
5
s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
0.6340010854678377
isL = T[idx2,2] .<= s2[argmax([infogain(i,T[idx2,2],T[idx2,end]) for i in s2])]
8-element BitVector: 1 1 1 1 1 0 0 0
TL = T[idx2[isL],:]
5×3 Matrix{Float64}:
 0.666973   0.0194752  0.0
 0.0972444  0.230083   1.0
 0.29995    0.346412   1.0
 0.447504   0.563352   0.0
 0.86052    0.580439   1.0
impurity(TL[:,end])
0.48
TR = T[idx2[.!isL],:]
3×3 Matrix{Float64}:
 0.629998   0.687563  0.0
 0.420093   0.741078  0.0
 0.0761072  0.987106  0.0
impurity(TR[:,end])
0.0
idx1 = sortperm(TL[:,1])
s1 = split_cont(TL[idx1,1])
[infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1]
4-element Vector{Float64}: 0.07999999999999996 0.21333333333333332 0.013333333333333308 0.07999999999999996
idx2 = sortperm(TL[:,2])
s2 = split_cont(TL[idx2,2])
[infogain(i,TL[idx2,2],TL[idx2,end]) for i in s2]
4-element Vector{Float64}: 0.18 0.013333333333333308 0.013333333333333308 0.07999999999999996
s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
0.3737270065061601
isLL = TL[idx1,1] .<= s1[argmax([infogain(i,TL[idx1,1],TL[idx1,end]) for i in s1])]
5-element BitVector: 1 1 0 0 0
TLL = TL[idx1[isLL],:]
2×3 Matrix{Float64}:
 0.0972444  0.230083  1.0
 0.29995    0.346412  1.0
impurity(TLL[:,end])
0.0
TLR = TL[idx1[.!isLL],:]
3×3 Matrix{Float64}:
 0.447504  0.563352   0.0
 0.666973  0.0194752  0.0
 0.86052   0.580439   1.0
impurity(TLR[:,end])
0.4444444444444444
idx1 = sortperm(TLR[:,1])
s1 = split_cont(TLR[idx1,1])
[infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1]
2-element Vector{Float64}: 0.1111111111111111 0.4444444444444444
idx2 = sortperm(TLR[:,2])
s2 = split_cont(TLR[idx2,2])
[infogain(i,TLR[idx2,2],TLR[idx2,end]) for i in s2]
2-element Vector{Float64}: 0.1111111111111111 0.4444444444444444
s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
0.7637465112420186
isLLL = TLR[idx1,1] .<= s1[argmax([infogain(i,TLR[idx1,1],TLR[idx1,end]) for i in s1])]
3-element BitVector: 1 1 0
TLLL = TLR[idx1[isLLL],:]
2×3 Matrix{Float64}:
 0.447504  0.563352   0.0
 0.666973  0.0194752  0.0
impurity(TLLL[:,end])
0.0
TLRR = TLR[idx1[.!isLLL],:]
1×3 Matrix{Float64}: 0.86052 0.580439 1.0
impurity(TLRR[:,end])
0.0
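The repeated sort, split, and argmax steps above can be wrapped into a single helper. The following is a minimal sketch (the name best_split is not from this notebook), assuming the impurity, split_cont, and infogain functions defined earlier and a matrix whose last column holds the target:

# Scan each feature column for the threshold with the highest information
# gain; the last column of T is assumed to be the 0/1 target.
function best_split(T)
    best = (gain = -Inf, feature = 0, threshold = NaN)
    for j in 1:size(T, 2)-1
        idx = sortperm(T[:, j])
        for s in split_cont(T[idx, j])
            g = infogain(s, T[idx, j], T[idx, end])
            if g > best.gain
                best = (gain = g, feature = j, threshold = s)
            end
        end
    end
    return best
end

best_split(T)   # should recover feature 2, threshold ≈ 0.634, as in the first split above

Applying best_split recursively to TL and TR, stopping once a node's impurity reaches zero, reproduces the tree that print_tree displayed.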
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-train.csv"))
first(df, 5)
5 rows × 8 columns (omitted printing of 2 columns)

| | Survived | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | String | Float64 | Int64 |
| 1 | 1 | 1 | Mr. Dickinson H Bishop | male | 25.0 | 1 |
| 2 | 0 | 3 | Miss. Aloisia Haas | female | 24.0 | 0 |
| 3 | 0 | 3 | Mr. William Alfred Brocklebank | male | 35.0 | 0 |
| 4 | 0 | 3 | Mr. Samuel Beard Risien | male | 69.0 | 0 |
| 5 | 1 | 1 | Mr. Harry Anderson | male | 48.0 | 0 |
describe(df)
8 rows × 7 columns (omitted printing of 2 columns)

| | variable | mean | min | median | max |
|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 |
| 3 | Name | | Capt. Edward Gifford Crosby | | Rev. Thomas Roussel Davids Byles |
| 4 | Sex | | female | | male |
| 5 | Age | 29.4592 | 0.42 | 28.0 | 74.0 |
| 6 | Siblings/Spouses Aboard | 0.512676 | 0 | 0.0 | 8 |
| 7 | Parents/Children Aboard | 0.394366 | 0 | 0.0 | 6 |
| 8 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 7 columns

| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Int64 | Int64 | String | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | male | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | female | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | male | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | male | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | male | 48.0 | 0 | 0 | 26.55 |
describe(df)
7 rows × 7 columns

| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Union… | Any | Union… | Any | Int64 | DataType |
| 1 | Survived | 0.394366 | 0 | 0.0 | 1 | 0 | Int64 |
| 2 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 3 | Sex | | female | | male | 0 | String |
| 4 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 5 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 6 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 7 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
schema(df)
┌──────────┬─────────┬────────────┐
│ _.names  │ _.types │ _.scitypes │
├──────────┼─────────┼────────────┤
│ Survived │ Int64   │ Count      │
│ Pclass   │ Int64   │ Count      │
│ Sex      │ String  │ Textual    │
│ Age      │ Float64 │ Continuous │
│ Sibsp    │ Int64   │ Count      │
│ Parch    │ Int64   │ Count      │
│ Fare     │ Float64 │ Continuous │
└──────────┴─────────┴────────────┘
_.nrows = 710
subtypes(Finite)
2-element Vector{Any}: Multiclass OrderedFactor
subtypes(Infinite)
2-element Vector{Any}: Continuous Count
df.Survived = coerce(df.Survived, OrderedFactor)
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                         │ _.scitypes       │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass   │ Int64                           │ Count            │
│ Sex      │ String                          │ Textual          │
│ Age      │ Float64                         │ Continuous       │
│ Sibsp    │ Int64                           │ Count            │
│ Parch    │ Int64                           │ Count            │
│ Fare     │ Float64                         │ Continuous       │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
df.Sex = coerce(df.Sex, OrderedFactor)  # strings → ordered categorical
df.Sex = coerce(df.Sex, Count)          # categorical level codes → integers (female = 1, male = 2)
schema(df)
┌──────────┬─────────────────────────────────┬──────────────────┐
│ _.names  │ _.types                         │ _.scitypes       │
├──────────┼─────────────────────────────────┼──────────────────┤
│ Survived │ CategoricalValue{Int64, UInt32} │ OrderedFactor{2} │
│ Pclass   │ Int64                           │ Count            │
│ Sex      │ Int64                           │ Count            │
│ Age      │ Float64                         │ Continuous       │
│ Sibsp    │ Int64                           │ Count            │
│ Parch    │ Int64                           │ Count            │
│ Fare     │ Float64                         │ Continuous       │
└──────────┴─────────────────────────────────┴──────────────────┘
_.nrows = 710
first(df, 5)
5 rows × 7 columns

| | Survived | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| | Cat… | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 0 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 0 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 0 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
y = df.Survived
X = select(df, Not(:Survived))
first(X, 5)
5 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 1 | 2 | 25.0 | 1 | 0 | 91.0792 |
| 2 | 3 | 1 | 24.0 | 0 | 0 | 8.85 |
| 3 | 3 | 2 | 35.0 | 0 | 0 | 8.05 |
| 4 | 3 | 2 | 69.0 | 0 | 0 | 14.5 |
| 5 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
scitype(y)
AbstractVector{OrderedFactor{2}} (alias for AbstractArray{OrderedFactor{2}, 1})
models(matching(X, y))
11-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple}:
 (name = AdaBoostStumpClassifier, package_name = DecisionTree, ... )
 (name = ConstantClassifier, package_name = MLJModels, ... )
 (name = DecisionTreeClassifier, package_name = BetaML, ... )
 (name = DecisionTreeClassifier, package_name = DecisionTree, ... )
 (name = DeterministicConstantClassifier, package_name = MLJModels, ... )
 (name = KernelPerceptronClassifier, package_name = BetaML, ... )
 (name = PegasosClassifier, package_name = BetaML, ... )
 (name = PerceptronClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = BetaML, ... )
 (name = RandomForestClassifier, package_name = DecisionTree, ... )
 (name = RandomForestClassifier, package_name = ScikitLearn, ... )
@load DecisionTreeClassifier pkg=DecisionTree verbosity=0
model = MLJDecisionTreeInterface.DecisionTreeClassifier()
mach = machine(model, X, y)
Machine{DecisionTreeClassifier,…} @647 trained 0 times; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=7245)
([101, 70, 656, 623, 103, 575, 576, 115, 506, 188 … 328, 123, 321, 318, 297, 201, 41, 184, 22, 592], [521, 77, 180, 290, 265, 86, 198, 31, 134, 209 … 79, 176, 531, 603, 606, 509, 447, 490, 478, 266])
fit!(mach, rows=train, verbosity=0)
Machine{DecisionTreeClassifier,…} @647 trained 1 time; caches data
  args:
    1: Source @114 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2: Source @304 ⏎ `AbstractVector{OrderedFactor{2}}`
mach.fitresult
(Decision Tree Leaves: 104 Depth: 18, CategoricalArrays.CategoricalValue{Int64, UInt32}[0, 1], UInt32[0x00000001, 0x00000002])
names(X)
6-element Vector{String}: "Pclass" "Sex" "Age" "Sibsp" "Parch" "Fare"
report(mach).print_tree(5)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
    L-> Feature 3, Threshold 3.0
        L-> 1 : 1/1
        R-> Feature 6, Threshold 28.85625
            L-> Feature 3, Threshold 43.0
                L->
                R->
            R-> Feature 5, Threshold 1.5
                L-> 2 : 58/58
                R->
    R-> Feature 6, Threshold 22.90415
        L-> Feature 6, Threshold 18.62915
            L-> Feature 3, Threshold 36.5
                L->
                R->
            R-> 2 : 7/7
        R-> Feature 3, Threshold 5.5
            L-> Feature 3, Threshold 3.5
                L-> 1 : 2/2
                R-> 2 : 1/1
            R-> 1 : 14/14
R-> Feature 3, Threshold 13.0
    L-> Feature 4, Threshold 2.5
        L-> 2 : 15/15
        R-> 1 : 9/9
    R-> Feature 6, Threshold 26.26875
        L-> Feature 3, Threshold 32.5
            L-> Feature 3, Threshold 28.75
                L->
                R->
            R-> Feature 6, Threshold 7.987500000000001
                L->
                R-> 1 : 43/43
        R-> Feature 3, Threshold 58.0
            L-> Feature 5, Threshold 0.5
                L->
                R->
            R-> 1 : 7/7
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     112     │     28      │
├─────────────┼─────────────┼─────────────┤
│      1      │     23      │     50      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7605633802816901
sum(y .== 0), sum(y .== 1), length(y)
(430, 280, 710)
sum(y .== 0)/length(y)
0.6056338028169014

Always predicting 0 would therefore already achieve about 61% accuracy, so the tree's 76% is only a modest improvement over this majority-class baseline.
train, val = partition(eachindex(y), 0.7, shuffle=true, rng=93544)
fit!(mach, rows=train, verbosity=0)
ŷ = predict_mode(mach, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     102     │     18      │
├─────────────┼─────────────┼─────────────┤
│      1      │     31      │     62      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.7699530516431925
names(X)
6-element Vector{String}: "Pclass" "Sex" "Age" "Sibsp" "Parch" "Fare"
model2 = MLJDecisionTreeInterface.DecisionTreeClassifier(max_depth = 6)
DecisionTreeClassifier(
max_depth = 6,
min_samples_leaf = 1,
min_samples_split = 2,
min_purity_increase = 0.0,
n_subfeatures = 0,
post_prune = false,
merge_purity_threshold = 1.0,
pdf_smoothing = 0.0,
display_depth = 5,
rng = Random._GLOBAL_RNG()) @216
mach2 = machine(model2, X, y)
fit!(mach2, rows=train, verbosity=0)
report(mach2).print_tree(6)
Feature 2, Threshold 1.5
L-> Feature 1, Threshold 2.5
    L-> Feature 6, Threshold 28.85625
        L-> Feature 6, Threshold 28.23125
            L-> Feature 3, Threshold 22.5
                L-> 2 : 7/7
                R-> Feature 3, Threshold 27.5
                    L-> 2 : 6/9
                    R-> 2 : 21/24
            R-> 1 : 1/1
        R-> Feature 6, Threshold 149.0354
            L-> 2 : 51/51
            R-> Feature 6, Threshold 152.50625000000002
                L-> 1 : 1/1
                R-> 2 : 7/7
    R-> Feature 3, Threshold 38.5
        L-> Feature 6, Threshold 23.25415
            L-> Feature 3, Threshold 32.5
                L-> Feature 5, Threshold 1.5
                    L-> 2 : 27/50
                    R-> 2 : 6/6
                R-> 2 : 6/6
            R-> Feature 5, Threshold 3.5
                L-> Feature 3, Threshold 5.5
                    L-> 1 : 3/4
                    R-> 1 : 9/9
                R-> 2 : 1/1
        R-> 1 : 10/10
R-> Feature 3, Threshold 13.0
    L-> Feature 4, Threshold 2.5
        L-> Feature 5, Threshold 0.5
            L-> Feature 6, Threshold 15.014600000000002
                L-> 2 : 1/1
                R-> 1 : 1/1
            R-> 2 : 17/17
        R-> 1 : 8/8
    R-> Feature 6, Threshold 26.26875
        L-> Feature 3, Threshold 32.5
            L-> Feature 3, Threshold 28.75
                L-> Feature 6, Threshold 7.239599999999999
                    L-> 1 : 12/16
                    R-> 1 : 88/95
                R-> Feature 6, Threshold 7.7625
                    L-> 1 : 4/4
                    R-> 1 : 17/24
            R-> Feature 3, Threshold 43.5
                L-> 1 : 43/43
                R-> Feature 3, Threshold 45.25
                    L-> 2 : 2/4
                    R-> 1 : 25/25
        R-> Feature 3, Threshold 60.5
            L-> Feature 4, Threshold 0.5
                L-> Feature 1, Threshold 2.5
                    L-> 1 : 19/37
                    R-> 2 : 5/5
                R-> Feature 1, Threshold 1.5
                    L-> 1 : 8/14
                    R-> 1 : 8/8
            R-> 1 : 9/9
X[val[1:8],:]
8 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | Int64 | Float64 | Int64 | Int64 | Float64 |
| 1 | 2 | 1 | 34.0 | 0 | 0 | 13.0 |
| 2 | 3 | 2 | 28.0 | 0 | 0 | 8.05 |
| 3 | 1 | 2 | 41.0 | 0 | 0 | 26.55 |
| 4 | 3 | 2 | 27.0 | 0 | 0 | 7.3125 |
| 5 | 1 | 1 | 2.0 | 1 | 2 | 151.55 |
| 6 | 3 | 1 | 14.0 | 1 | 0 | 11.2417 |
| 7 | 3 | 2 | 18.0 | 0 | 0 | 7.7958 |
| 8 | 1 | 2 | 48.0 | 0 | 0 | 26.55 |
predict(mach2, X[val[1:8],:])
8-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}:
 UnivariateFinite{OrderedFactor{2}}(0=>0.125, 1=>0.875)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
 UnivariateFinite{OrderedFactor{2}}(0=>0.46, 1=>0.54)
 UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
 UnivariateFinite{OrderedFactor{2}}(0=>0.514, 1=>0.486)
predict_mode(mach2, X[val[1:8],:])
8-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}: 1 0 0 0 0 1 0 0
ŷ = predict_mode(mach2, X[val,:])
confusion_matrix(ŷ, y[val])
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     127     │     20      │
├─────────────┼─────────────┼─────────────┤
│      1      │      6      │     60      │
└─────────────┴─────────────┴─────────────┘
accuracy(ŷ, y[val])
0.8779342723004695
describe(X)
6 rows × 7 columns

| | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| | Symbol | Float64 | Real | Float64 | Real | Int64 | DataType |
| 1 | Pclass | 2.30704 | 1 | 3.0 | 3 | 0 | Int64 |
| 2 | Sex | 1.63662 | 1 | 2.0 | 2 | 0 | Int64 |
| 3 | Age | 29.4592 | 0.42 | 28.0 | 74.0 | 0 | Float64 |
| 4 | Sibsp | 0.512676 | 0 | 0.0 | 8 | 0 | Int64 |
| 5 | Parch | 0.394366 | 0 | 0.0 | 6 | 0 | Int64 |
| 6 | Fare | 32.3272 | 0.0 | 14.4563 | 512.329 | 0 | Float64 |
Use the model to predict survival for people who were not actual passengers.
predict_mode(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.))
1-element CategoricalArrays.CategoricalArray{Int64,1,UInt32}: 0
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=22., Sibsp=0, Parch=0, Fare=14.)) # Jack
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.926, 1=>0.0737)
predict(mach2, DataFrame(Pclass=1, Sex=1, Age=20., Sibsp=0, Parch=2, Fare=414.)) # Rose
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=1, Age=2., Sibsp=1, Parch=2, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>0.0, 1=>1.0)
predict(mach2, DataFrame(Pclass=3, Sex=2, Age=62., Sibsp=1, Parch=0, Fare=14.))
1-element MLJBase.UnivariateFiniteVector{OrderedFactor{2}, Int64, UInt32, Float64}: UnivariateFinite{OrderedFactor{2}}(0=>1.0, 1=>0.0)
One can also consider new features derived from the original ones (a sketch follows the correlation plots below).
using Plots, Statistics
display(scatter(df.Pclass, df.Fare, xlabel=:Pclass, ylabel=:Fare, legend=false))
cor(df.Pclass,df.Fare)
-0.5453318579475805
display(scatter(df.Age, df.Fare, xlabel=:Age, ylabel=:Fare, legend=false))
cor(df.Age,df.Fare)
0.11127730982128829
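For example, the siblings/spouses and parents/children counts could be combined into a single family-size feature. A minimal sketch (the column name FamilySize and the machine mach4 are illustrative, not part of this notebook):

# Derived feature: total number of family members aboard.
X2 = copy(X)
X2.FamilySize = X2.Sibsp .+ X2.Parch
mach4 = machine(model2, X2, y)
fit!(mach4, rows=train, verbosity=0)
accuracy(predict_mode(mach4, X2[val, :]), y[val])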
MLJ.save("my_machine.jlso", mach2)
using MLJ, DataFrames, CSV
df = DataFrame(CSV.File("ML-3-Titanic-test.csv"))
first(df, 5)
5 rows × 7 columns (omitted printing of 2 columns)

| | Pclass | Name | Sex | Age | Siblings/Spouses Aboard |
|---|---|---|---|---|---|
| | Int64 | String | String | Int64 | Int64 |
| 1 | 2 | Mr. Moses Aaron Troupiansky | male | 23 | 0 |
| 2 | 2 | Rev. Juozas Montvila | male | 27 | 0 |
| 3 | 3 | Miss. Marguerite Rut Sandstrom | female | 4 | 1 |
| 4 | 1 | Mrs. Thomas Jr (Lily Alexenia Wilson) Potter | female | 56 | 0 |
| 5 | 3 | Mr. Farred Chehab Emir | male | 26 | 0 |
rename!(df, "Siblings/Spouses Aboard"=>:Sibsp, "Parents/Children Aboard"=>:Parch)
df = select(df, Not(:Name))
first(df, 5)
5 rows × 6 columns

| | Pclass | Sex | Age | Sibsp | Parch | Fare |
|---|---|---|---|---|---|---|
| | Int64 | String | Int64 | Int64 | Int64 | Float64 |
| 1 | 2 | male | 23 | 0 | 0 | 13.0 |
| 2 | 2 | male | 27 | 0 | 0 | 13.0 |
| 3 | 3 | female | 4 | 1 | 1 | 16.7 |
| 4 | 1 | female | 56 | 0 | 1 | 83.1583 |
| 5 | 3 | male | 26 | 0 | 0 | 7.225 |
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ String  │ Textual    │
│ Age     │ Int64   │ Count      │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
df.Sex = coerce(df.Sex, OrderedFactor)
df.Sex = coerce(df.Sex, Count)
df.Age = coerce(df.Age, Continuous)
schema(df)
┌─────────┬─────────┬────────────┐
│ _.names │ _.types │ _.scitypes │
├─────────┼─────────┼────────────┤
│ Pclass  │ Int64   │ Count      │
│ Sex     │ Int64   │ Count      │
│ Age     │ Float64 │ Continuous │
│ Sibsp   │ Int64   │ Count      │
│ Parch   │ Int64   │ Count      │
│ Fare    │ Float64 │ Continuous │
└─────────┴─────────┴────────────┘
_.nrows = 15
mach3 = machine("my_machine.jlso")
X = df
ŷ = predict_mode(mach3, X)
y = [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1]
y = coerce(y, OrderedFactor)
accuracy(ŷ, y)
0.8
confusion_matrix(ŷ, y)
              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │      9      │      3      │
├─────────────┼─────────────┼─────────────┤
│      1      │      0      │      3      │
└─────────────┴─────────────┴─────────────┘