Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds method for splitting data into periods #528

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ jobs:
fail-fast: false
matrix:
version:
- "1.6.7" # LTS
- "1.6"
- "1" # Latest Release
- "1.10.7" # LTS
- "1" # Latest Release
os:
- ubuntu-latest
arch:
Expand Down Expand Up @@ -56,7 +55,7 @@ jobs:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: '1'
version: "1"
- run: |
git config --global user.name name
git config --global user.email email
Expand Down
11 changes: 11 additions & 0 deletions docs/src/split.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,14 @@ using MarketData
tail(cl)
tail(cl, 3)
```

## Splitting by period

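A `TimeArray` can be split into consecutive periods with a period function such as `Dates.day`: a new chunk starts whenever the value returned by the function changes from one timestamp to the next.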

```@repl
using TimeSeries
using MarketData

split(cl, Dates.day)
```
12 changes: 6 additions & 6 deletions src/TimeSeries.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ using Tables
using PrettyTables: pretty_table

export TimeArray, AbstractTimeSeries,
when, from, to, findwhen, timestamp, values, colnames, meta, head, tail,
lag, lead, diff, percentchange, moving, upto,
uniformspaced, uniformspace, dropnan,
basecall,
merge, collapse,
readtimearray, writetimearray
when, from, to, findwhen, timestamp, values, colnames, meta, head, tail,
lag, lead, diff, percentchange, moving, upto,
uniformspaced, uniformspace, dropnan,
basecall,
merge, collapse,
readtimearray, writetimearray

# modify.jl
export rename, rename!
Expand Down
50 changes: 41 additions & 9 deletions src/split.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ when(ta::TimeArray, period::Function, t::String) =

# from, to ######################

from(ta::TimeArray{T, N, D}, d::D) where {T, N, D} =
from(ta::TimeArray{T,N,D}, d::D) where {T,N,D} =
length(ta) == 0 ? ta :
d < timestamp(ta)[1] ? ta :
d > timestamp(ta)[end] ? ta[1:0] :
ta[searchsortedfirst(timestamp(ta), d):end]
d < timestamp(ta)[1] ? ta :
d > timestamp(ta)[end] ? ta[1:0] :
ta[searchsortedfirst(timestamp(ta), d):end]

to(ta::TimeArray{T, N, D}, d::D) where {T, N, D} =
to(ta::TimeArray{T,N,D}, d::D) where {T,N,D} =
length(ta) == 0 ? ta :
d < timestamp(ta)[1] ? ta[1:0] :
d > timestamp(ta)[end] ? ta :
ta[1:searchsortedlast(timestamp(ta), d)]
d < timestamp(ta)[1] ? ta[1:0] :
d > timestamp(ta)[end] ? ta :
ta[1:searchsortedlast(timestamp(ta), d)]

###### findall ##################

Expand All @@ -43,7 +43,7 @@ findwhen(ta::TimeArray{Bool,1}) = timestamp(ta)[findall(values(ta))]
end
end

@generated function tail(ta::TimeArray{T,N}, n::Int=6) where {T,N}
@generated function tail(ta::TimeArray{T,N}, n::Int=6) where {T,N}
new_values = (N == 1) ? :(values(ta)[start:end]) : :(values(ta)[start:end, :])

quote
Expand All @@ -58,3 +58,35 @@ end
Base.first(ta::TimeArray) = head(ta, 1)

Base.last(ta::TimeArray) = tail(ta, 1)


"""
split(data::TimeSeries.TimeArray, period::Function)

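Split `data` into consecutive chunks, starting a new chunk whenever the value of `period` changes between neighbouring timestamps. Returns a lazy iterator of `TimeSeries.TimeArray`s; use `collect` to obtain a vector.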

## Arguments

- `data::TimeSeries.TimeArray`: Data to split
- `period::Function`: Function, e.g. `Dates.day` that is used to split the `data`.
"""
Base.split(data::TimeSeries.TimeArray, period::Function) =
    # Slice rows by index range; each chunk is only built when the result is iterated.
    Iterators.map(r -> data[r], _split(TimeSeries.timestamp(data), period))

"""
    _split(ts::AbstractVector{<:TimeType}, period::Function)

Return a `Vector{UnitRange{Int}}` of index ranges into `ts`, one per run of
neighbouring timestamps for which `period` yields equal values. The ranges are
contiguous, in order, and together cover `1:length(ts)`; an empty `ts` gives an
empty vector.

Only neighbouring timestamps are compared, so two separate runs that happen to
share the same `period` value are reported as separate ranges.
"""
function _split(ts::AbstractVector{D}, period::Function) where {D<:TimeType}
    ranges = UnitRange{Int}[]
    isempty(ts) && return ranges

    n = length(ts)
    sizehint!(ranges, n)  # upper bound: every timestamp forms its own chunk

    start = 1
    current = period(ts[1])
    for i in 2:n
        next = period(ts[i])
        next == current && continue
        # period value changed: close the run that ended at the previous index
        push!(ranges, start:(i - 1))
        start = i
        current = next
    end
    push!(ranges, start:n)  # close the final run

    return ranges
end
ValentinKaisermayer marked this conversation as resolved.
Show resolved Hide resolved
218 changes: 113 additions & 105 deletions test/split.jl
Original file line number Diff line number Diff line change
@@ -1,121 +1,129 @@
using Dates
using Test

using MarketData

using TimeSeries


@testset "split" begin


@testset "find methods" begin
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well, could you send the formatting patch as another PR?
(and maybe introduce some formatter in the standalone PR, e.g. https://github.com/julia-actions/julia-format)

This makes the patch hard for a human to read.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@testset "find returns correct row numbers array" begin
@test timestamp(cl[findall(cl .> op)])[1] == Date(2000, 1, 3)
@test length(findall(cl .> op)) == 244
@testset "find methods" begin
@testset "find returns correct row numbers array" begin
@test timestamp(cl[findall(cl .> op)])[1] == Date(2000, 1, 3)
@test length(findall(cl .> op)) == 244
end

@testset "findwhen returns correct Dates array" begin
@test findwhen(cl .> op)[2] == Date(2000, 1, 5)
@test length(findwhen(cl .> op)) == 244
end

@testset "findall(f::Function, ta)" begin
@test findall(cl .> 100) == findall(x -> x > 100, cl)
@test findall(cl .> 100) == findall(x -> x[4] > 100, ohlc)
end
end

@testset "findwhen returns correct Dates array" begin
@test findwhen(cl .> op)[2] == Date(2000, 1, 5)
@test length(findwhen(cl .> op)) == 244
@testset "split date operations" begin
@testset "from and to correctly subset non-zero and zero-length time arrays" begin
@test length(from(cl, Date(2001, 12, 28))) == 2
@test length(from(cl, Date(2002, 1, 1))) == 0
@test length(from(from(cl, Date(2002, 1, 1)), Date(2012, 1, 1))) == 0

@test length(to(cl, Date(2000, 1, 4))) == 2
@test length(to(cl, Date(1999, 1, 4))) == 0
@test length(to(to(cl, Date(1999, 1, 4)), Date(1912, 1, 1))) == 0
end

@testset "when method correctly subset" begin
@test timestamp(when(cl, day, 4))[1] == Date(2000, 1, 4)
@test timestamp(when(cl, dayname, "Friday"))[1] == Date(2000, 1, 7)
@test timestamp(when(cl, week, 5))[1] == Date(2000, 1, 31)
@test timestamp(when(cl, month, 5))[1] == Date(2000, 5, 1)
@test timestamp(when(cl, monthname, "June"))[1] == Date(2000, 6, 1)
@test timestamp(when(cl, year, 2001))[1] == Date(2001, 1, 2)
@test timestamp(when(cl, dayofweek, 1))[1] == Date(2000, 1, 3)
# all the days in the nth week of each month
@test timestamp(when(cl, dayofweekofmonth, 5))[1] == Date(2000, 1, 31)
@test timestamp(when(cl, dayofyear, 365))[1] == Date(2001, 12, 31)
@test timestamp(when(cl, quarterofyear, 4))[1] == Date(2000, 10, 2)
@test timestamp(when(cl, dayofquarter, 1))[1] == Date(2001, 10, 1)
end
end

@testset "findall(f::Function, ta)" begin
@test findall(cl .> 100) == findall(x -> x > 100, cl)
@test findall(cl .> 100) == findall(x -> x[4] > 100, ohlc)
@testset "head, tail, first and last methods" begin
@testset "head, tail, first and last methods work with default n value on single column TimeArray" begin
@test length(head(cl, 6)) == 6
@test timestamp(head(cl)) == [Date(2000, 1, 3), Date(2000, 1, 4), Date(2000, 1, 5),
Date(2000, 1, 6), Date(2000, 1, 7), Date(2000, 1, 10)]
@test values(head(cl)) == [111.94, 102.5, 104.0, 95.0, 99.5, 97.75]

@test length(tail(cl, 6)) == 6
@test timestamp(tail(cl)) == [Date(2001, 12, 21), Date(2001, 12, 24), Date(2001, 12, 26),
Date(2001, 12, 27), Date(2001, 12, 28), Date(2001, 12, 31)]
@test values(tail(cl)) == [21.0, 21.36, 21.49, 22.07, 22.43, 21.9]

@test length(first(cl)) == 1
@test timestamp(first(cl))[1] == Date(2000, 1, 3)
@test values(first(cl))[1] == 111.94
@test meta(first(cl)) == "AAPL"

@test length(last(cl)) == 1
@test timestamp(last(cl))[1] == Date(2001, 12, 31)
@test values(last(cl))[1] == 21.9
@test meta(last(cl)) == "AAPL"
end

@testset "head, tail, first and last methods work with default n value on multi column TimeArray" begin
@test length(head(ohlc)) == 6
@test values(head(ohlc, 1)) == [104.88 112.5 101.69 111.94]

@test length(tail(ohlc)) == 6
@test values(tail(ohlc, 1)) == [22.51 22.66 21.83 21.9]

@test length(first(ohlc)) == 1
@test timestamp(first(ohlc))[1] == Date(2000, 1, 3)
@test values(first(ohlc)) == [104.88 112.5 101.69 111.94]
@test meta(first(ohlc)) == "AAPL"

@test length(last(ohlc)) == 1
@test timestamp(last(ohlc))[1] == Date(2001, 12, 31)
@test values(last(ohlc)) == [22.51 22.66 21.83 21.9]
@test meta(last(ohlc)) == "AAPL"
end

@testset "head, tail, first and last methods work with custom periods on single column TimeArray" begin
@test length(head(cl, 2)) == 2
@test length(head(cl, 500)) == length(cl)
@test length(tail(cl, 2)) == 2
@test length(tail(cl, 500)) == length(cl)

@test length(first(cl)) == 1
@test length(last(cl)) == 1
end

@testset "head, tail, first and last methods work with custom periods on multi column TimeArray" begin
@test length(head(ohlc, 2)) == 2
@test length(head(ohlc, 500)) == length(ohlc)
@test length(tail(ohlc, 2)) == 2
@test length(tail(ohlc, 500)) == length(ohlc)

@test length(first(ohlc)) == 1
@test length(last(ohlc)) == 1
end
end
end


@testset "split date operations" begin
@testset "from and to correctly subset non-zero and zero-length time arrays" begin
@test length(from(cl, Date(2001,12,28))) == 2
@test length(from(cl, Date(2002,1,1))) == 0
@test length(from(from(cl, Date(2002,1,1)), Date(2012,1,1))) == 0

@test length(to(cl, Date(2000,1,4))) == 2
@test length(to(cl, Date(1999,1,4))) == 0
@test length(to(to(cl, Date(1999,1,4)), Date(1912,1,1))) == 0
@testset "split period" begin
    @testset "chunks of $period" for period in [day, week, month, year]
        chunks = collect(split(cl, period))

        # every chunk holds a single period value
        for chunk in chunks
            @test allequal(period.(timestamp(chunk)))
        end

        # the chunks must cover every row exactly once, in the original order
        @test mapreduce(timestamp, vcat, chunks) == timestamp(cl)
    end

    # `length` is checked on the lazy result itself, without collecting it
    @test length(split(cl, day)) == 500
    @test length(split(cl, week)) == 105
    @test length(split(cl, month)) == 24
    @test length(split(cl, year)) == 2

    # test empty timearray
    @test length(split(to(cl, Date(2000)), week)) == 0
end

@testset "when method correctly subset" begin
@test timestamp(when(cl, day, 4))[1] == Date(2000,1,4)
@test timestamp(when(cl, dayname, "Friday"))[1] == Date(2000,1,7)
@test timestamp(when(cl, week, 5))[1] == Date(2000,1,31)
@test timestamp(when(cl, month, 5))[1] == Date(2000,5,1)
@test timestamp(when(cl, monthname, "June"))[1] == Date(2000,6,1)
@test timestamp(when(cl, year, 2001))[1] == Date(2001,1,2)
@test timestamp(when(cl, dayofweek, 1))[1] == Date(2000,1,3)
# all the days in the nth week of each month
@test timestamp(when(cl, dayofweekofmonth, 5))[1] == Date(2000,1,31)
@test timestamp(when(cl, dayofyear, 365))[1] == Date(2001,12,31)
@test timestamp(when(cl, quarterofyear, 4))[1] == Date(2000,10,2)
@test timestamp(when(cl, dayofquarter, 1))[1] == Date(2001,10,1)
end
end


@testset "head, tail, first and last methods" begin
@testset "head, tail, first and last methods work with default n value on single column TimeArray" begin
@test length(head(cl,6)) == 6
@test timestamp(head(cl)) == [Date(2000,1,3), Date(2000,1,4), Date(2000,1,5),
Date(2000,1,6), Date(2000,1,7), Date(2000,1,10)]
@test values(head(cl)) == [111.94, 102.5, 104.0, 95.0, 99.5, 97.75]

@test length(tail(cl,6)) == 6
@test timestamp(tail(cl)) == [Date(2001,12,21), Date(2001,12,24), Date(2001,12,26),
Date(2001,12,27), Date(2001,12,28), Date(2001,12,31)]
@test values(tail(cl)) == [21.0, 21.36, 21.49, 22.07, 22.43, 21.9]

@test length(first(cl)) == 1
@test timestamp(first(cl))[1] == Date(2000,1,3)
@test values(first(cl))[1] == 111.94
@test meta(first(cl)) == "AAPL"

@test length(last(cl)) == 1
@test timestamp(last(cl))[1] == Date(2001,12,31)
@test values(last(cl))[1] == 21.9
@test meta(last(cl)) == "AAPL"
end

@testset "head, tail, first and last methods work with default n value on multi column TimeArray" begin
@test length(head(ohlc)) == 6
@test values(head(ohlc, 1)) == [104.88 112.5 101.69 111.94]

@test length(tail(ohlc)) == 6
@test values(tail(ohlc, 1)) == [22.51 22.66 21.83 21.9]

@test length(first(ohlc)) == 1
@test timestamp(first(ohlc))[1] == Date(2000,1,3)
@test values(first(ohlc)) == [104.88 112.5 101.69 111.94]
@test meta(first(ohlc)) == "AAPL"

@test length(last(ohlc)) == 1
@test timestamp(last(ohlc))[1] == Date(2001,12,31)
@test values(last(ohlc)) == [22.51 22.66 21.83 21.9]
@test meta(last(ohlc)) == "AAPL"
end

@testset "head, tail, first and last methods work with custom periods on single column TimeArray" begin
@test length(head(cl, 2)) == 2
@test length(head(cl, 500)) == length(cl)
@test length(tail(cl, 2)) == 2
@test length(tail(cl, 500)) == length(cl)

@test length(first(cl)) == 1
@test length(last(cl)) == 1
end

@testset "head, tail, first and last methods work with custom periods on multi column TimeArray" begin
@test length(head(ohlc, 2)) == 2
@test length(head(ohlc, 500)) == length(ohlc)
@test length(tail(ohlc, 2)) == 2
@test length(tail(ohlc, 500)) == length(ohlc)

@test length(first(ohlc)) == 1
@test length(last(ohlc)) == 1
end
end


end # @testset "split"
Loading