using Statistics
using DataFrames, CSV, Query
using RDatasets
using Pkg
Pkg.status(["DataFrames", "CSV", "Query", "RDatasets"])
Status `~/.julia/environments/v1.4/Project.toml` [336ed68f] CSV v0.6.2 [a93c6f00] DataFrames v0.21.1 [1a8c2f83] Query v0.12.2 [ce6b1742] RDatasets v0.6.8
missing is used to represent a missing value in Julia, and it is a built-in object.missing is the singleton of Missing type.missing are undetermined will return missing.true && missing
missing
1 + missing
missing
false && missing
false
true || missing
true
ismissing(missing)
true
DataFrame is a table-like data structure containing a series of vectors.df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"])
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 3 | F |
| 4 | 4 | M |
df = DataFrame() # 先初始化一個空的,再放入資料
df[!, :A] = 1:8
df[!, :B] = ["M", "F", "F", "M", "F", "M", "M", "F"]
df
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 3 | F |
| 4 | 4 | M |
| 5 | 5 | F |
| 6 | 6 | M |
| 7 | 7 | M |
| 8 | 8 | F |
df = DataFrame(A=Int64[], B=String[])
| A | B | |
|---|---|---|
| Int64 | String |
push!(df, (1, "M"))
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
push!(df, [2, "F"])
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
push!(df, Dict(:B => "M", :A => 4))
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 4 | M |
mat = [1 "M"; 2 "F"; 3 "F"]
DataFrame(mat, [:A, :B])
| A | B | |
|---|---|---|
| Any | Any | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 3 | F |
d = Dict("A" => [1, 2, 3], "B" => ["M", "F", "F"])
DataFrame(d)
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 3 | F |
df[!, :A]
3-element Array{Int64,1}:
1
2
4
df[!, 1]
3-element Array{Int64,1}:
1
2
4
df.A
3-element Array{Int64,1}:
1
2
4
df[1, 1]
1
df[1, :A]
1
size(df)
(3, 2)
nrow(df)
3
ncol(df)
2
names(df)
2-element Array{String,1}:
"A"
"B"
df[1:3, :]
| A | B | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | M |
| 2 | 2 | F |
| 3 | 4 | M |
df[1:3, [:B, :A]]
| B | A | |
|---|---|---|
| String | Int64 | |
| 1 | M | 1 |
| 2 | F | 2 |
| 3 | M | 4 |
df[!, :C] = ['a', 'b', 'c']
df
| A | B | C | |
|---|---|---|---|
| Int64 | String | Char | |
| 1 | 1 | M | 'a' |
| 2 | 2 | F | 'b' |
| 3 | 4 | M | 'c' |
df[!, :B] = ['α', 'β', 'γ']
df
| A | B | C | |
|---|---|---|---|
| Int64 | Char | Char | |
| 1 | 1 | 'α' | 'a' |
| 2 | 2 | 'β' | 'b' |
| 3 | 4 | 'γ' | 'c' |
df[df[!,:A] .% 2 .== 0, :] # 取符合條件的列,跟所有欄位
| A | B | C | |
|---|---|---|---|
| Int64 | Char | Char | |
| 1 | 2 | 'β' | 'b' |
| 2 | 4 | 'γ' | 'c' |
df[!, :D] = df[!, :B] .* df[!, :C]
df
| A | B | C | D | |
|---|---|---|---|---|
| Int64 | Char | Char | String | |
| 1 | 1 | 'α' | 'a' | αa |
| 2 | 2 | 'β' | 'b' | βb |
| 3 | 4 | 'γ' | 'c' | γc |
mean(df[!,1]) # 對第1欄取平均
2.3333333333333335
median(df[!,:A]) # 對第一個欄位取中位數
2.0
df = DataFrame(A = 1:4, B = randn(4))
mapcols(cumsum, df)
| A | B | |
|---|---|---|
| Int64 | Float64 | |
| 1 | 1 | 0.238524 |
| 2 | 3 | 1.09133 |
| 3 | 6 | 0.384994 |
| 4 | 10 | 0.286134 |
CSV.write("test.csv", df)
"test.csv"
df = CSV.read("test.csv")
| A | B | |
|---|---|---|
| Int64 | Float64 | |
| 1 | 1 | 0.238524 |
| 2 | 2 | 0.852803 |
| 3 | 3 | -0.706332 |
| 4 | 4 | -0.0988603 |
first(RDatasets.datasets(), 6) # 可以選你要的資料集
| Package | Dataset | Title | Rows | Columns | |
|---|---|---|---|---|---|
| String | String | String | Int64 | Int64 | |
| 1 | COUNT | affairs | affairs | 601 | 18 |
| 2 | COUNT | azdrg112 | azdrg112 | 1798 | 4 |
| 3 | COUNT | azpro | azpro | 3589 | 6 |
| 4 | COUNT | badhealth | badhealth | 1127 | 3 |
| 5 | COUNT | fasttrakg | fasttrakg | 15 | 9 |
| 6 | COUNT | lbw | lbw | 189 | 10 |
iris = dataset("datasets", "iris")
first(iris, 6)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
size(iris) # 看一下他有幾列幾行
(150, 5)
size(vcat(iris, iris))
(300, 5)
size(hcat(iris, iris, makeunique=true))
(150, 10)
completecases(iris) # true means no missing
150-element BitArray{1}:
1
1
1
1
1
1
1
1
1
1
1
1
1
⋮
1
1
1
1
1
1
1
1
1
1
1
1
first(iris[completecases(iris), :], 10) # equivalent to complete_cases!(iris)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 8 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 10 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
first(unique(iris), 10)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 8 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 10 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
iris[!,:PetalArea] = iris[!,:PetalLength] .* iris[!,:PetalWidth]
first(iris)
DataFrameRow (6 columns)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | |
|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0.28 |
first(sort!(iris, :SepalLength), 10)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | |
|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | |
| 1 | 4.3 | 3.0 | 1.1 | 0.1 | setosa | 0.11 |
| 2 | 4.4 | 2.9 | 1.4 | 0.2 | setosa | 0.28 |
| 3 | 4.4 | 3.0 | 1.3 | 0.2 | setosa | 0.26 |
| 4 | 4.4 | 3.2 | 1.3 | 0.2 | setosa | 0.26 |
| 5 | 4.5 | 2.3 | 1.3 | 0.3 | setosa | 0.39 |
| 6 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0.3 |
| 7 | 4.6 | 3.4 | 1.4 | 0.3 | setosa | 0.42 |
| 8 | 4.6 | 3.6 | 1.0 | 0.2 | setosa | 0.2 |
| 9 | 4.6 | 3.2 | 1.4 | 0.2 | setosa | 0.28 |
| 10 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0.26 |
first(sort!(iris, [:Species, :SepalLength, :SepalWidth], rev=[true, false, false]), 10)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | |
|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | |
| 1 | 4.9 | 2.5 | 4.5 | 1.7 | virginica | 7.65 |
| 2 | 5.6 | 2.8 | 4.9 | 2.0 | virginica | 9.8 |
| 3 | 5.7 | 2.5 | 5.0 | 2.0 | virginica | 10.0 |
| 4 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 |
| 5 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 |
| 6 | 5.8 | 2.8 | 5.1 | 2.4 | virginica | 12.24 |
| 7 | 5.9 | 3.0 | 5.1 | 1.8 | virginica | 9.18 |
| 8 | 6.0 | 2.2 | 5.0 | 1.5 | virginica | 7.5 |
| 9 | 6.0 | 3.0 | 4.8 | 1.8 | virginica | 8.64 |
| 10 | 6.1 | 2.6 | 5.6 | 1.4 | virginica | 7.84 |
employees = DataFrame(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Andy Doe"])
| ID | Name | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | John Doe |
| 2 | 2 | Jane Doe |
| 3 | 3 | Andy Doe |
jobs = DataFrame(ID = [1, 2, 4], Job = ["Lawyer", "Doctor", "Chief"])
| ID | Job | |
|---|---|---|
| Int64 | String | |
| 1 | 1 | Lawyer |
| 2 | 2 | Doctor |
| 3 | 4 | Chief |
full = innerjoin(employees, jobs, on=:ID)
| ID | Name | Job | |
|---|---|---|---|
| Int64 | String | String | |
| 1 | 1 | John Doe | Lawyer |
| 2 | 2 | Jane Doe | Doctor |
left_join = leftjoin(employees, jobs, on=:ID)
| ID | Name | Job | |
|---|---|---|---|
| Int64 | String | String? | |
| 1 | 1 | John Doe | Lawyer |
| 2 | 2 | Jane Doe | Doctor |
| 3 | 3 | Andy Doe | missing |
right_join = rightjoin(employees, jobs, on=:ID)
| ID | Name | Job | |
|---|---|---|---|
| Int64 | String? | String | |
| 1 | 1 | John Doe | Lawyer |
| 2 | 2 | Jane Doe | Doctor |
| 3 | 4 | missing | Chief |
outer_join = outerjoin(employees, jobs, on=:ID)
| ID | Name | Job | |
|---|---|---|---|
| Int64 | String? | String? | |
| 1 | 1 | John Doe | Lawyer |
| 2 | 2 | Jane Doe | Doctor |
| 3 | 3 | Andy Doe | missing |
| 4 | 4 | missing | Chief |
cross_join = crossjoin(employees, jobs, makeunique=true) # 不需要key
| ID | Name | ID_1 | Job | |
|---|---|---|---|---|
| Int64 | String | Int64 | String | |
| 1 | 1 | John Doe | 1 | Lawyer |
| 2 | 1 | John Doe | 2 | Doctor |
| 3 | 1 | John Doe | 4 | Chief |
| 4 | 2 | Jane Doe | 1 | Lawyer |
| 5 | 2 | Jane Doe | 2 | Doctor |
| 6 | 2 | Jane Doe | 4 | Chief |
| 7 | 3 | Andy Doe | 1 | Lawyer |
| 8 | 3 | Andy Doe | 2 | Doctor |
| 9 | 3 | Andy Doe | 4 | Chief |
by(iris, :Species, size)
┌ Warning: `by(d::AbstractDataFrame, cols::Any, f::Base.Callable; sort::Bool = false, skipmissing::Bool = false)` is deprecated, use `combine(f, groupby(d, cols, sort = sort, skipmissing = skipmissing))` instead. │ caller = top-level scope at In[54]:1 └ @ Core In[54]:1
| Species | x1 | |
|---|---|---|
| Cat… | Tuple… | |
| 1 | setosa | (50, 6) |
| 2 | versicolor | (50, 6) |
| 3 | virginica | (50, 6) |
by(iris, :Species) do df
DataFrame(μ = mean(df[!,:PetalLength]), σ = var(df[!,:PetalLength]))
end
┌ Warning: `by(f::Base.Callable, d::AbstractDataFrame, cols::Any; sort::Bool = false, skipmissing::Bool = false)` is deprecated, use `combine(f, groupby(d, cols, sort = sort, skipmissing = skipmissing))` instead. │ caller = top-level scope at In[55]:1 └ @ Core In[55]:1
| Species | μ | σ | |
|---|---|---|---|
| Cat… | Float64 | Float64 | |
| 1 | setosa | 1.462 | 0.0301592 |
| 2 | versicolor | 4.26 | 0.220816 |
| 3 | virginica | 5.552 | 0.304588 |
aggregate(iris, :Species, [sum, mean])
┌ Warning: `aggregate(d, cols, fs, sort=false, skipmissing=false)` is deprecated. Instead use combine(groupby(d, cols, sort=false, skipmissing=false), [names(d, Not(cols)) .=> f for f in fs]...)` if functions in `fs` have unique names. │ caller = top-level scope at In[56]:1 └ @ Core In[56]:1
| Species | SepalLength_sum | SepalWidth_sum | PetalLength_sum | PetalWidth_sum | PetalArea_sum | |
|---|---|---|---|---|---|---|
| Cat… | Float64 | Float64 | Float64 | Float64 | Float64 | |
| 1 | setosa | 250.3 | 171.4 | 73.1 | 12.3 | 18.28 |
| 2 | versicolor | 296.8 | 138.5 | 213.0 | 66.3 | 286.02 |
| 3 | virginica | 329.4 | 148.7 | 277.6 | 101.3 | 564.81 |
for subdf in groupby(iris, :Species)
println(size(subdf, 1))
end
50 50 50
gdf = groupby(iris, :Species)
GroupedDataFrame with 3 groups based on key: Species
First Group (50 rows): Species = "setosa"
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | |
|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | |
| 1 | 4.3 | 3.0 | 1.1 | 0.1 | setosa | 0.11 |
| 2 | 4.4 | 2.9 | 1.4 | 0.2 | setosa | 0.28 |
| 3 | 4.4 | 3.0 | 1.3 | 0.2 | setosa | 0.26 |
| 4 | 4.4 | 3.2 | 1.3 | 0.2 | setosa | 0.26 |
| 5 | 4.5 | 2.3 | 1.3 | 0.3 | setosa | 0.39 |
| 6 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0.3 |
| 7 | 4.6 | 3.2 | 1.4 | 0.2 | setosa | 0.28 |
| 8 | 4.6 | 3.4 | 1.4 | 0.3 | setosa | 0.42 |
| 9 | 4.6 | 3.6 | 1.0 | 0.2 | setosa | 0.2 |
| 10 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0.26 |
| 11 | 4.7 | 3.2 | 1.6 | 0.2 | setosa | 0.32 |
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa | 0.14 |
| 13 | 4.8 | 3.0 | 1.4 | 0.3 | setosa | 0.42 |
| 14 | 4.8 | 3.1 | 1.6 | 0.2 | setosa | 0.32 |
| 15 | 4.8 | 3.4 | 1.6 | 0.2 | setosa | 0.32 |
| 16 | 4.8 | 3.4 | 1.9 | 0.2 | setosa | 0.38 |
| 17 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0.28 |
| 18 | 4.9 | 3.1 | 1.5 | 0.1 | setosa | 0.15 |
| 19 | 4.9 | 3.1 | 1.5 | 0.2 | setosa | 0.3 |
| 20 | 4.9 | 3.6 | 1.4 | 0.1 | setosa | 0.14 |
| 21 | 5.0 | 3.0 | 1.6 | 0.2 | setosa | 0.32 |
| 22 | 5.0 | 3.2 | 1.2 | 0.2 | setosa | 0.24 |
| 23 | 5.0 | 3.3 | 1.4 | 0.2 | setosa | 0.28 |
| 24 | 5.0 | 3.4 | 1.5 | 0.2 | setosa | 0.3 |
| 25 | 5.0 | 3.4 | 1.6 | 0.4 | setosa | 0.64 |
| 26 | 5.0 | 3.5 | 1.3 | 0.3 | setosa | 0.39 |
| 27 | 5.0 | 3.5 | 1.6 | 0.6 | setosa | 0.96 |
| 28 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0.28 |
| 29 | 5.1 | 3.3 | 1.7 | 0.5 | setosa | 0.85 |
| 30 | 5.1 | 3.4 | 1.5 | 0.2 | setosa | 0.3 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
⋮
Last Group (50 rows): Species = "virginica"
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | |
|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | |
| 1 | 4.9 | 2.5 | 4.5 | 1.7 | virginica | 7.65 |
| 2 | 5.6 | 2.8 | 4.9 | 2.0 | virginica | 9.8 |
| 3 | 5.7 | 2.5 | 5.0 | 2.0 | virginica | 10.0 |
| 4 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 |
| 5 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 |
| 6 | 5.8 | 2.8 | 5.1 | 2.4 | virginica | 12.24 |
| 7 | 5.9 | 3.0 | 5.1 | 1.8 | virginica | 9.18 |
| 8 | 6.0 | 2.2 | 5.0 | 1.5 | virginica | 7.5 |
| 9 | 6.0 | 3.0 | 4.8 | 1.8 | virginica | 8.64 |
| 10 | 6.1 | 2.6 | 5.6 | 1.4 | virginica | 7.84 |
| 11 | 6.1 | 3.0 | 4.9 | 1.8 | virginica | 8.82 |
| 12 | 6.2 | 2.8 | 4.8 | 1.8 | virginica | 8.64 |
| 13 | 6.2 | 3.4 | 5.4 | 2.3 | virginica | 12.42 |
| 14 | 6.3 | 2.5 | 5.0 | 1.9 | virginica | 9.5 |
| 15 | 6.3 | 2.7 | 4.9 | 1.8 | virginica | 8.82 |
| 16 | 6.3 | 2.8 | 5.1 | 1.5 | virginica | 7.65 |
| 17 | 6.3 | 2.9 | 5.6 | 1.8 | virginica | 10.08 |
| 18 | 6.3 | 3.3 | 6.0 | 2.5 | virginica | 15.0 |
| 19 | 6.3 | 3.4 | 5.6 | 2.4 | virginica | 13.44 |
| 20 | 6.4 | 2.7 | 5.3 | 1.9 | virginica | 10.07 |
| 21 | 6.4 | 2.8 | 5.6 | 2.1 | virginica | 11.76 |
| 22 | 6.4 | 2.8 | 5.6 | 2.2 | virginica | 12.32 |
| 23 | 6.4 | 3.1 | 5.5 | 1.8 | virginica | 9.9 |
| 24 | 6.4 | 3.2 | 5.3 | 2.3 | virginica | 12.19 |
| 25 | 6.5 | 3.0 | 5.8 | 2.2 | virginica | 12.76 |
| 26 | 6.5 | 3.0 | 5.5 | 1.8 | virginica | 9.9 |
| 27 | 6.5 | 3.0 | 5.2 | 2.0 | virginica | 10.4 |
| 28 | 6.5 | 3.2 | 5.1 | 2.0 | virginica | 10.2 |
| 29 | 6.7 | 2.5 | 5.8 | 1.8 | virginica | 10.44 |
| 30 | 6.7 | 3.0 | 5.2 | 2.3 | virginica | 11.96 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
combine(gdf, :PetalLength => mean)
| Species | PetalLength_mean | |
|---|---|---|
| Cat… | Float64 | |
| 1 | setosa | 1.462 |
| 2 | versicolor | 4.26 |
| 3 | virginica | 5.552 |
iris[!,:id] = 1:size(iris, 1)
first(iris, 10)
| SepalLength | SepalWidth | PetalLength | PetalWidth | Species | PetalArea | id | |
|---|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Cat… | Float64 | Int64 | |
| 1 | 4.9 | 2.5 | 4.5 | 1.7 | virginica | 7.65 | 1 |
| 2 | 5.6 | 2.8 | 4.9 | 2.0 | virginica | 9.8 | 2 |
| 3 | 5.7 | 2.5 | 5.0 | 2.0 | virginica | 10.0 | 3 |
| 4 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 | 4 |
| 5 | 5.8 | 2.7 | 5.1 | 1.9 | virginica | 9.69 | 5 |
| 6 | 5.8 | 2.8 | 5.1 | 2.4 | virginica | 12.24 | 6 |
| 7 | 5.9 | 3.0 | 5.1 | 1.8 | virginica | 9.18 | 7 |
| 8 | 6.0 | 2.2 | 5.0 | 1.5 | virginica | 7.5 | 8 |
| 9 | 6.0 | 3.0 | 4.8 | 1.8 | virginica | 8.64 | 9 |
| 10 | 6.1 | 2.6 | 5.6 | 1.4 | virginica | 7.84 | 10 |
d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
first(d, 10)
| Species | PetalArea | id | variable | value | |
|---|---|---|---|---|---|
| Cat… | Float64 | Int64 | Cat… | Float64 | |
| 1 | virginica | 7.65 | 1 | SepalLength | 4.9 |
| 2 | virginica | 9.8 | 2 | SepalLength | 5.6 |
| 3 | virginica | 10.0 | 3 | SepalLength | 5.7 |
| 4 | virginica | 9.69 | 4 | SepalLength | 5.8 |
| 5 | virginica | 9.69 | 5 | SepalLength | 5.8 |
| 6 | virginica | 12.24 | 6 | SepalLength | 5.8 |
| 7 | virginica | 9.18 | 7 | SepalLength | 5.9 |
| 8 | virginica | 7.5 | 8 | SepalLength | 6.0 |
| 9 | virginica | 8.64 | 9 | SepalLength | 6.0 |
| 10 | virginica | 7.84 | 10 | SepalLength | 6.1 |
d = stack(iris, [:SepalLength, :SepalWidth], :Species)
first(d, 10)
| Species | variable | value | |
|---|---|---|---|
| Cat… | Cat… | Float64 | |
| 1 | virginica | SepalLength | 4.9 |
| 2 | virginica | SepalLength | 5.6 |
| 3 | virginica | SepalLength | 5.7 |
| 4 | virginica | SepalLength | 5.8 |
| 5 | virginica | SepalLength | 5.8 |
| 6 | virginica | SepalLength | 5.8 |
| 7 | virginica | SepalLength | 5.9 |
| 8 | virginica | SepalLength | 6.0 |
| 9 | virginica | SepalLength | 6.0 |
| 10 | virginica | SepalLength | 6.1 |
d = stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
first(unstack(d, :id, :variable, :value), 10)
| id | SepalLength | SepalWidth | PetalLength | PetalWidth | |
|---|---|---|---|---|---|
| Int64 | Float64? | Float64? | Float64? | Float64? | |
| 1 | 1 | 4.9 | 2.5 | 4.5 | 1.7 |
| 2 | 2 | 5.6 | 2.8 | 4.9 | 2.0 |
| 3 | 3 | 5.7 | 2.5 | 5.0 | 2.0 |
| 4 | 4 | 5.8 | 2.7 | 5.1 | 1.9 |
| 5 | 5 | 5.8 | 2.7 | 5.1 | 1.9 |
| 6 | 6 | 5.8 | 2.8 | 5.1 | 2.4 |
| 7 | 7 | 5.9 | 3.0 | 5.1 | 1.8 |
| 8 | 8 | 6.0 | 2.2 | 5.0 | 1.5 |
| 9 | 9 | 6.0 | 3.0 | 4.8 | 1.8 |
| 10 | 10 | 6.1 | 2.6 | 5.6 | 1.4 |
first(unstack(d, :variable, :value), 10)
| Species | PetalArea | id | SepalLength | SepalWidth | PetalLength | PetalWidth | |
|---|---|---|---|---|---|---|---|
| Cat… | Float64 | Int64 | Float64? | Float64? | Float64? | Float64? | |
| 1 | setosa | 0.11 | 101 | 4.3 | 3.0 | 1.1 | 0.1 |
| 2 | setosa | 0.14 | 112 | 4.8 | 3.0 | 1.4 | 0.1 |
| 3 | setosa | 0.14 | 120 | 4.9 | 3.6 | 1.4 | 0.1 |
| 4 | setosa | 0.15 | 118 | 4.9 | 3.1 | 1.5 | 0.1 |
| 5 | setosa | 0.15 | 139 | 5.2 | 4.1 | 1.5 | 0.1 |
| 6 | setosa | 0.2 | 109 | 4.6 | 3.6 | 1.0 | 0.2 |
| 7 | setosa | 0.24 | 122 | 5.0 | 3.2 | 1.2 | 0.2 |
| 8 | setosa | 0.24 | 150 | 5.8 | 4.0 | 1.2 | 0.2 |
| 9 | setosa | 0.26 | 103 | 4.4 | 3.0 | 1.3 | 0.2 |
| 10 | setosa | 0.26 | 104 | 4.4 | 3.2 | 1.3 | 0.2 |
Calculate the average of every feature grouped by :Species
d = stack(iris)
x = by(d, [:variable, :Species], df -> DataFrame(vsum = mean(df[!,:value])))
unstack(x, :Species, :vsum)
┌ Warning: `by(d::AbstractDataFrame, cols::Any, f::Base.Callable; sort::Bool = false, skipmissing::Bool = false)` is deprecated, use `combine(f, groupby(d, cols, sort = sort, skipmissing = skipmissing))` instead. │ caller = top-level scope at In[65]:2 └ @ Core In[65]:2
| variable | setosa | versicolor | virginica | |
|---|---|---|---|---|
| Cat… | Float64? | Float64? | Float64? | |
| 1 | SepalLength | 5.006 | 5.936 | 6.588 |
| 2 | SepalWidth | 3.428 | 2.77 | 2.974 |
| 3 | PetalLength | 1.462 | 4.26 | 5.552 |
| 4 | PetalWidth | 0.246 | 1.326 | 2.026 |
| 5 | PetalArea | 0.3656 | 5.7204 | 11.2962 |
df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,5,2])
| name | age | children | |
|---|---|---|---|
| String | Float64 | Int64 | |
| 1 | John | 23.0 | 3 |
| 2 | Sally | 42.0 | 5 |
| 3 | Kirk | 59.0 | 2 |
x = @from i in df begin
@where i.age>50
@select {i.name, i.children}
@collect DataFrame
end
| name | children | |
|---|---|---|
| String | Int64 | |
| 1 | Kirk | 2 |
df = DataFrame(a=[2,1,1,2,1,3],b=[2,2,1,1,3,2])
x = @from i in df begin
@orderby descending(i.a), i.b
@select i
@collect DataFrame
end
| a | b | |
|---|---|---|
| Int64 | Int64 | |
| 1 | 3 | 2 |
| 2 | 2 | 1 |
| 3 | 2 | 2 |
| 4 | 1 | 1 |
| 5 | 1 | 2 |
| 6 | 1 | 3 |
df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,5,2])
x = @from i in df begin
@where i.age > 30. && i.children > 2
@select i
@collect DataFrame
end
| name | age | children | |
|---|---|---|---|
| String | Float64 | Int64 | |
| 1 | Sally | 42.0 | 5 |
data = [1,2,3]
x = @from i in data begin
@select i^2
@collect
end
3-element Array{Int64,1}:
1
4
9
source = Dict(:a=>[1,2,3], :b=>[4,5])
q = @from i in source begin
@from j in i.second
@select {Key=i.first,Value=j}
@collect DataFrame
end
| Key | Value | |
|---|---|---|
| Symbol | Int64 | |
| 1 | a | 1 |
| 2 | a | 2 |
| 3 | a | 3 |
| 4 | b | 4 |
| 5 | b | 5 |
df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,2,2])
x = @from i in df begin
@group i.name by i.children
@collect
end
2-element Array{Grouping{Int64,String},1}:
["John"]
["Sally", "Kirk"]