Remove time unit and column from python and make time the first column #18

Open
wants to merge 2 commits into base: master
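This PR drops the time_column and unit keyword arguments from the Python TimeSeriesDataFrame and makes time the first column of results. As a rough usage sketch of the behaviour the new tests exercise (flintContext and test_utils.make_pdf are names taken from the test code below; this is an illustration, not part of the PR):

# Based on test_leftJoinTimeNotFirst: 'time' is the second column of the
# inputs, but the joined result is expected to have 'time' first.
price = flintContext.read.pandas(test_utils.make_pdf([
    [7, 1000, 0.5],
    [3, 1000, 1.0],
], ["id", "time", "price"]))
vol = flintContext.read.pandas(test_utils.make_pdf([
    [7, 1000, 100],
    [3, 1000, 200],
], ["id", "time", "volume"]))

joined = price.leftJoin(vol, key="id").toPandas()
# Expected column order: ["time", "id", "price", "volume"]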
113 changes: 113 additions & 0 deletions python/tests/ts/test_dataframe.py
@@ -120,6 +120,46 @@ def fn_2(rows):
"with key 7"
)

def test_addColumnsForCycleTimeNotFirst(self):
import pyspark.sql.types as pyspark_types
price = self.flintContext.read.pandas(test_utils.make_pdf([
[7, 1000, 0.5],
[3, 1000, 1.0],
[3, 1050, 1.5],
[7, 1050, 2.0],
[3, 1100, 2.5],
[7, 1100, 3.0],
[3, 1150, 3.5],
[7, 1150, 4.0],
[3, 1200, 4.5],
[7, 1200, 5.0],
[3, 1250, 5.5],
[7, 1250, 6.0],
], ["id", "time", "price"]))
expected_pdf = test_utils.make_pdf([
[1000, 7, 0.5, 1.0],
[1000, 3, 1.0, 2.0],
[1050, 3, 1.5, 3.0],
[1050, 7, 2.0, 4.0],
[1100, 3, 2.5, 5.0],
[1100, 7, 3.0, 6.0],
[1150, 3, 3.5, 7.0],
[1150, 7, 4.0, 8.0],
[1200, 3, 4.5, 9.0],
[1200, 7, 5.0, 10.0],
[1250, 3, 5.5, 11.0],
[1250, 7, 6.0, 12.0],
], ["time", "id", "price", "adjustedPrice"])

def fn_1(rows):
size = len(rows)
return {row: row.price*size for row in rows}

new_pdf = price.addColumnsForCycle(
{"adjustedPrice": (pyspark_types.DoubleType(), fn_1)}
).toPandas()
test_utils.assert_same(new_pdf, expected_pdf)

def test_merge(self):
price = self.price()
price1 = price.filter(price.time > 1100)
@@ -171,6 +211,79 @@ def test_leftJoin(self):
).toPandas()
test_utils.assert_same(new_pdf, expected_pdf)

def test_leftJoinTimeNotFirst(self):
# Note that in price we have time as the second column
price = self.flintContext.read.pandas(test_utils.make_pdf([
[7, 1000, 0.5],
[3, 1000, 1.0],
[3, 1050, 1.5],
[7, 1050, 2.0],
[3, 1100, 2.5],
[7, 1100, 3.0],
[3, 1150, 3.5],
[7, 1150, 4.0],
[3, 1200, 4.5],
[7, 1200, 5.0],
[3, 1250, 5.5],
[7, 1250, 6.0],
], ["id", "time", "price"]))
# Time is also the second column of vol
vol = self.flintContext.read.pandas(test_utils.make_pdf([
[7, 1000, 100],
[3, 1000, 200],
[3, 1050, 300],
[7, 1050, 400],
[3, 1100, 500],
[7, 1100, 600],
[3, 1150, 700],
[7, 1150, 800],
[3, 1200, 900],
[7, 1200, 1000],
[3, 1250, 1100],
[7, 1250, 1200],
], ["id", "time", "volume"]))
# We expect to get the result with time as the first column
expected_pdf = test_utils.make_pdf([
(1000, 7, 0.5, 100,),
(1000, 3, 1.0, 200,),
(1050, 3, 1.5, 300,),
(1050, 7, 2.0, 400,),
(1100, 3, 2.5, 500,),
(1100, 7, 3.0, 600,),
(1150, 3, 3.5, 700,),
(1150, 7, 4.0, 800,),
(1200, 3, 4.5, 900,),
(1200, 7, 5.0, 1000,),
(1250, 3, 5.5, 1100,),
(1250, 7, 6.0, 1200,)
], ["time", "id", "price", "volume"])

new_pdf = price.leftJoin(vol, key=["id"]).toPandas()
test_utils.assert_same(new_pdf, expected_pdf)
test_utils.assert_same(
new_pdf, price.leftJoin(vol, key="id").toPandas()
)

expected_pdf = test_utils.make_pdf([
(1000, 7, 0.5, 100),
(1000, 3, 1.0, 200),
(1050, 3, 1.5, None),
(1050, 7, 2.0, None),
(1100, 3, 2.5, 500),
(1100, 7, 3.0, 600),
(1150, 3, 3.5, 700),
(1150, 7, 4.0, 800),
(1200, 3, 4.5, 900),
(1200, 7, 5.0, 1000),
(1250, 3, 5.5, 1100),
(1250, 7, 6.0, 1200),
], ["time", "id", "price", "volume"])

new_pdf = price.leftJoin(
vol.filter(vol.time != 1050), key="id"
).toPandas()
test_utils.assert_same(new_pdf, expected_pdf)

def test_futureLeftJoin(self):
import pyspark.sql.types as pyspark_types
price = self.price()
29 changes: 10 additions & 19 deletions python/ts/flint/dataframe.py
@@ -85,11 +85,9 @@ class TimeSeriesDataFrame(pyspark.sql.DataFrame):
DEFAULT_UNIT = "ns"
'''The units of the timestamps present in :attr:`DEFAULT_TIME_COLUMN`.

Acceptable values are: ``'s'``, ``'ms'``, ``'us'``, ``'ns'``.

'''

def __init__(self, df, sql_ctx, *, time_column=DEFAULT_TIME_COLUMN, is_sorted=True, unit=DEFAULT_UNIT, tsrdd_part_info=None):
def __init__(self, df, sql_ctx, *, is_sorted=True, tsrdd_part_info=None):
'''
:type df: pyspark.sql.DataFrame
:type sql_ctx: pyspark.sql.SqlContext
@@ -102,7 +100,7 @@ def __init__(self, df, sql_ctx, *, time_column=DEFAULT_TIME_COLUMN, is_sorted=Tr
:param tsrdd_part_info: Partition info
:type tsrdd_part_info: Option[com.twosigma.flint.timeseries.PartitionInfo]
'''
self._time_column = time_column
self._time_column = self.DEFAULT_TIME_COLUMN
self._is_sorted = is_sorted
self._tsrdd_part_info = tsrdd_part_info

@@ -112,7 +110,7 @@ def __init__(self, df, sql_ctx, *, time_column=DEFAULT_TIME_COLUMN, is_sorted=Tr
super().__init__(self._jdf, sql_ctx)

self._jpkg = java.Packages(self._sc)
self._junit = utils.junit(self._sc, unit) if isinstance(unit,str) else unit
self._junit = utils.junit(self._sc, self.DEFAULT_UNIT)

if tsrdd_part_info:
if not is_sorted:
@@ -169,9 +167,7 @@ def _new_method(self, *args, **kwargs):
if self._jpkg.OrderPreservingOperation.isDerivedFrom(self._jdf, df._jdf):
tsdf_args = {
"df": df,
"sql_ctx": df.sql_ctx,
"time_column": self._time_column,
"unit": self._junit
"sql_ctx": df.sql_ctx
}

tsdf_args['is_sorted'] = self._is_sorted and self._jpkg.OrderPreservingOperation.isOrderPreserving(self._jdf, df._jdf)
Expand Down Expand Up @@ -226,17 +222,14 @@ def _from_df(df, *, time_column, is_sorted, unit):
return TimeSeriesDataFrame(df,
df.sql_ctx,
time_column=time_column,
is_sorted=is_sorted,
unit=unit)
is_sorted=is_sorted)

@staticmethod
def _from_pandas(df, schema, sql_ctx, *, time_column, is_sorted, unit):
def _from_pandas(df, schema, sql_ctx, *, is_sorted):
df = sql_ctx.createDataFrame(df, schema)
return TimeSeriesDataFrame(df,
sql_ctx,
time_column=time_column,
is_sorted=is_sorted,
unit=unit)
is_sorted=is_sorted)

def _timedelta_ns(self, varname, timedelta, *, default=None):
"""Transforms pandas.Timedelta to a ns string with appropriate checks
@@ -308,10 +301,10 @@ def addColumnsForCycle(self, columns, *, key=None):
:returns: a new dataframe with the columns added
:rtype: :class:`TimeSeriesDataFrame`
"""
# Need to make a new StructType to prevent from modifying the original schema object
schema = pyspark_types.StructType.fromJson(self.schema.jsonValue())
tsdf = self.groupByCycle(key)
# Don't pickle the whole schema, just the names for the lambda
# The last element of tsdf.schema describes the 'rows' returned,
# which differs from self.schema when the first column is not 'time'
schema = tsdf.schema[len(tsdf.schema)-1].dataType.elementType
schema_names = list(schema.names)

def flatmap_fn():
@@ -335,8 +328,6 @@ def _(orig_row):

return TimeSeriesDataFrame(df,
df.sql_ctx,
time_column=self._time_column,
unit=self._junit,
tsrdd_part_info=tsdf._tsrdd_part_info)

def merge(self, other):
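A note on the addColumnsForCycle change above (illustration, not part of the diff): groupByCycle returns a frame whose last column is an array of the original rows, so the per-row schema is read from that column's element type rather than from self.schema. A minimal sketch with a hypothetical grouped schema, using only pyspark.sql.types:

import pyspark.sql.types as T

# Hypothetical schema of a groupByCycle result: a 'time' field plus a
# 'rows' array column whose element type is the original row schema.
grouped = T.StructType([
    T.StructField("time", T.LongType()),
    T.StructField("rows", T.ArrayType(T.StructType([
        T.StructField("id", T.IntegerType()),
        T.StructField("time", T.LongType()),
        T.StructField("price", T.DoubleType()),
    ]))),
])

# Same access pattern as the diff: last field -> ArrayType -> elementType
row_schema = grouped[len(grouped) - 1].dataType.elementType
print(list(row_schema.names))  # ['id', 'time', 'price']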
5 changes: 1 addition & 4 deletions python/ts/flint/readwriter.py
@@ -15,7 +15,6 @@
#

from pyspark.sql import DataFrame
from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter

from . import java
from . import utils
@@ -57,9 +56,7 @@ def pandas(self, df, schema=None, *,

return TimeSeriesDataFrame._from_pandas(
df, schema, self._flintContext._sqlContext,
time_column=time_column,
is_sorted=is_sorted,
unit=unit)
is_sorted=is_sorted)

def _df_between(self, df, begin, end, time_column, unit):
"""Filter a Python dataframe to contain data between begin (inclusive) and end (exclusive)
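For reference, the begin-inclusive / end-exclusive filtering described in the _df_between docstring can be sketched in plain pandas as follows (an illustration under the assumption that the timestamps are already in comparable units; the real method also handles the unit argument):

import pandas as pd

def df_between(pdf, begin, end, time_column="time"):
    # Keep rows with begin <= time < end (begin inclusive, end exclusive),
    # mirroring the contract described in the _df_between docstring.
    return pdf[(pdf[time_column] >= begin) & (pdf[time_column] < end)]

pdf = pd.DataFrame({"time": [1000, 1050, 1100], "price": [0.5, 1.5, 2.5]})
df_between(pdf, 1000, 1100)  # keeps the rows at time 1000 and 1050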