-
Notifications
You must be signed in to change notification settings - Fork 8
/
pivot.py
29 lines (21 loc) · 965 Bytes
/
pivot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#Problem: Given a dataset of sales records with monthly sales per product, reshape the data to have one row per product-month combination.
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
# Initialize Spark Session.
# getOrCreate() hands back the session that is already running (pyspark shell,
# notebook) and only builds a new one when none exists, so `spark` is always
# bound and the script no longer fails with NameError when run standalone.
spark = SparkSession.builder.appName("DataReshaping").getOrCreate()

# Sample data in wide format: one row per product, one sales column per month.
data = [
    ("Product1", 100, 150, 200),
    ("Product2", 200, 250, 300),
    ("Product3", 300, 350, 400),
]
# Columns: Product, Sales_Jan, Sales_Feb, Sales_Mar
columns = ["Product", "Sales_Jan", "Sales_Feb", "Sales_Mar"]

# Creating DataFrame
df = spark.createDataFrame(data, columns)

# Unpivoting the DataFrame (wide -> long): Product, Month, Sales.
# stack(3, ...) takes three (label, value) pairs and emits one output row per
# pair, so each product row becomes three rows, one per month.
# NOTE: the variable keeps its original name `pivoted_df` for compatibility,
# even though the operation is an unpivot rather than a pivot.
pivoted_df = df.selectExpr(
    "Product",
    "stack(3, 'Jan', Sales_Jan, 'Feb', Sales_Feb, 'Mar', Sales_Mar) as (Month, Sales)",
)

# Show the result
pivoted_df.show()

# The session is deliberately not stopped here: getOrCreate() may have returned
# a session shared with a notebook or shell, and stopping it would break that
# environment. Call spark.stop() explicitly if this script owns the session.