Skip to content

Commit

Permalink
Merge pull request #38 from uche-madu/dev
Browse files — browse the repository at this point in the history
fix pyspark logic
  • Loading branch information
uche-madu authored Oct 18, 2023
2 parents 771dbea + 734a3cc commit ab25886
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 4 deletions.
5 changes: 2 additions & 3 deletions dags/user_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,15 +239,14 @@ def dataproc_tasks() -> None:

CLUSTER_GENERATOR_CONFIG = ClusterGenerator(
project_id=PROJECT_ID,
zone=ZONE,
-master_machine_type="n2-standard-2",
+master_machine_type="n2-standard-4",
master_disk_size=32,
worker_machine_type="n2-standard-2",
worker_disk_size=32,
num_workers=2,
storage_bucket=BUCKET_NAME,
init_actions_uris=[PIP_INIT_FILE],
-metadata={"PIP_PACKAGES": "spark-nlp==5.1.2 google-cloud-storage==2.12.0 transformers==4.25.1 tensorflow==2.11.0"},
+metadata={"PIP_PACKAGES": "spark-nlp==5.1.2 google-cloud-storage==2.12.0 scipy==1.11.3 transformers==4.25.1 tensorflow==2.11.0"},
properties={
'spark:spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
'spark:spark.driver.maxResultSize': '0',
Expand Down
2 changes: 1 addition & 1 deletion pyspark-scripts/gcs_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def load_files_from_gcs(
# Load data from GCS
df = (spark.read.format(file_format)
.options(**(read_options or {}))
-.load(f"gs://{directory_path}/{file}"))
+.load(f"gs://{file}"))

dataframes.append(df)

Expand Down

0 comments on commit ab25886

Please sign in to comment.