🐛 Export dates correctly for datawrapper

wdr-data · Nov 8, 2023 · cf6a688 · cf6a688
1 parent fc929eb
commit cf6a688
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 6 deletions.
diff --git a/ddj_cloud/scrapers/talsperren/talsperren.py b/ddj_cloud/scrapers/talsperren/talsperren.py
@@ -112,11 +112,11 @@ def run():
     df_base = _get_base_dataset()
 
     ## For testing
-    #
-    # bio = to_parquet_bio(df_base, compression="gzip", index=False)
-    # bio.seek(0)
-    # upload_file(bio.read(), "talsperren/base.parquet.gzip")
-    #
+
+    bio = to_parquet_bio(df_base, compression="gzip", index=False)
+    bio.seek(0)
+    upload_file(bio.read(), "talsperren/base.parquet.gzip")
+
     # df_base = pd.read_parquet("local_storage/talsperren/base.parquet.gzip", engine="fastparquet")
 
     # Filter out reservoirs in ignore list
@@ -129,7 +129,11 @@ def run():
     for exporter in exporters:
         try:
             df_export = exporter.run(df_base.copy())
-            upload_dataframe(df_export, f"talsperren/{exporter.filename}.csv")
+            upload_dataframe(
+                df_export,
+                f"talsperren/{exporter.filename}.csv",
+                datawrapper_datetimes=True,
+            )
         except Exception as e:
             print("Skipping exporter due to error:")
             print(e)

diff --git a/ddj_cloud/utils/storage.py b/ddj_cloud/utils/storage.py
@@ -299,6 +299,7 @@ def upload_dataframe(
     compare_fn: Callable[[bytes, bytes], bool] = simple_compare,
     acl: Optional[str] = "public-read",
     create_cloudfront_invalidation: bool = False,
+    datawrapper_datetimes: bool = False,
 ):
     """Upload a dataframe to storage.
 
@@ -313,6 +314,21 @@ def upload_dataframe(
         acl (str, optional): ACL to use when uploading. Defaults to ``"public-read"``.
         create_cloudfront_invalidation (bool, optional): Whether to create a CloudFront invalidation. Defaults to False.
     """
+
+    # Convert datetimes for Datawrapper (which has no timezone support)
+    if datawrapper_datetimes:
+        for col in df.columns:
+            # There are several different types of datetime columns,
+            # like datetime64[ns, Europe/Berlin] and datetime64[ns, UTC]
+            if not str(df[col].dtype).startswith("datetime64"):
+                continue
+
+            # Convert to Berlin timezone
+            df[col] = df[col].dt.tz_convert("Europe/Berlin")
+
+            # Convert to string
+            df[col] = df[col].dt.strftime("%Y-%m-%d %H:%M:%S")
+
     # Convert to csv and encode to get bytes
     write = df.to_csv(index=False).encode("utf-8")