diff --git a/koseungbin2024_spark_study_final_project.ipynb b/koseungbin2024_spark_study_final_project.ipynb
new file mode 100644
index 0000000..cd92cd3
--- /dev/null
+++ b/koseungbin2024_spark_study_final_project.ipynb
@@ -0,0 +1,660 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "mount_file_id": "1KgTVPmqkbxAz-6fOwB7PoxT_gy90j_1O",
+ "authorship_tag": "ABX9TyN2ezpxYneJMdQcTYSjghsD",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# [데이터셋]\n",
+ "- https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma\n"
+ ],
+ "metadata": {
+ "id": "XnDy_QHUdXU_"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "uzPTXxUcByYJ"
+ },
+ "outputs": [],
+ "source": [
+ "# Install the Java 8 runtime; Spark runs on the JVM.\n",
+ "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
+ "\n",
+ "# Copy the Spark 3.5.1 tarball from Google Drive (expects Drive mounted at /content/drive).\n",
+ "!cp /content/drive/MyDrive/colab_notebooks/spark/spark-3.5.1-bin-hadoop3.tgz /content/\n",
+ "# Extract with an explicit target so the result always matches SPARK_HOME below,\n",
+ "# whatever the current working directory is.\n",
+ "!tar xf /content/spark-3.5.1-bin-hadoop3.tgz -C /content\n",
+ "\n",
+ "# %pip installs into the kernel's own environment; the pin keeps re-runs reproducible.\n",
+ "%pip install -q findspark==2.0.1\n",
+ "\n",
+ "# Copy the dataset folder next to the notebook's working directory.\n",
+ "!cp -r /content/drive/MyDrive/colab_notebooks/sample_data /content/\n",
+ "\n",
+ "import os\n",
+ "\n",
+ "# Environment variables read by findspark / PySpark in the next cell.\n",
+ "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
+ "os.environ[\"SPARK_HOME\"] = \"/content/spark-3.5.1-bin-hadoop3\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import findspark\n",
+ "\n",
+ "# Must run before the pyspark import so the Spark install under SPARK_HOME is found.\n",
+ "findspark.init()\n",
+ "\n",
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "# Local-mode session; local[*] uses as many worker threads as there are cores.\n",
+ "spark = (\n",
+ "    SparkSession.builder\n",
+ "    .master(\"local[*]\")\n",
+ "    .getOrCreate()\n",
+ ")\n",
+ "\n",
+ "# Render a DataFrame as a table when it is the last expression of a cell.\n",
+ "spark.conf.set(\"spark.sql.repl.eagerEval.enabled\", True)\n",
+ "\n",
+ "# Display the session summary.\n",
+ "spark"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 219
+ },
+ "id": "N8x70X_7B7vi",
+ "outputId": "0ef0e064-3996-4c51-e411-dc3a1ec547c4"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
SparkSession - in-memory
\n",
+ " \n",
+ "
\n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " - Version
\n",
+ " v3.5.1
\n",
+ " - Master
\n",
+ " local[*]
\n",
+ " - AppName
\n",
+ " pyspark-shell
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 2
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, BooleanType\n",
+ "\n",
+ "\n",
+ "# The file encodes missing values as the literal string 'NA' (e.g. in `price`).\n",
+ "# Declaring it as nullValue lets inferSchema type such columns as numbers\n",
+ "# instead of falling back to string for the whole column.\n",
+ "raw_df = spark.read.csv(\n",
+ "    './sample_data/yellow_taxi/rideshare_kaggle.csv',\n",
+ "    header=True,\n",
+ "    sep=',',\n",
+ "    inferSchema=True,\n",
+ "    nullValue='NA',\n",
+ ")\n",
+ "\n",
+ "# Preview of the raw data (show() prints the first 20 rows).\n",
+ "print(\"[raw data]\")\n",
+ "raw_df.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "N_FYosoLCz1U",
+ "outputId": "ce1dabc9-720d-4e3a-c58a-b178f91e17d6"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[raw data]\n",
n",
+ "| id| timestamp|hour|day|month| datetime| timezone| source| destination|cab_type| product_id| name|price|distance|surge_multiplier|latitude|longitude|temperature|apparentTemperature| short_summary| long_summary|precipIntensity|precipProbability|humidity|windSpeed|windGust|windGustTime|visibility|temperatureHigh|temperatureHighTime|temperatureLow|temperatureLowTime|apparentTemperatureHigh|apparentTemperatureHighTime|apparentTemperatureLow|apparentTemperatureLowTime| icon|dewPoint|pressure|windBearing|cloudCover|uvIndex|visibility.1|ozone|sunriseTime|sunsetTime|moonPhase|precipIntensityMax|uvIndexTime|temperatureMin|temperatureMinTime|temperatureMax|temperatureMaxTime|apparentTemperatureMin|apparentTemperatureMinTime|apparentTemperatureMax|apparentTemperatureMaxTime|\n",
n",
+ "|424553bb-7174-41e...| 1.54495260789E9| 9| 16| 12|2018-12-16 09:30:07|America/New_York|Haymarket Square| North Station| Lyft| lyft_line| Shared| 5| 0.44| 1.0| 42.2148| -71.033| 42.34| 37.12| Mostly Cloudy | Rain throughout ...| 0.0| 0.0| 0.68| 8.66| 9.17| 1545015600| 10.0| 43.68| 1544968800| 34.19| 1545048000| 37.95| 1544968800| 27.39| 1545044400| partly-cloudy-ni...| 32.7| 1021.98| 57| 0.72| 0| 10.0|303.8| 1544962084|1544994864| 0.3| 0.1276| 1544979600| 39.89| 1545012000| 43.68| 1544968800| 33.73| 1545012000| 38.07| 1544958000|\n",
+ "|4bd23055-6827-41c...|1.543284023677E9| 2| 27| 11|2018-11-27 02:00:23|America/New_York|Haymarket Square| North Station| Lyft| lyft_premier| Lux| 11| 0.44| 1.0| 42.2148| -71.033| 43.58| 37.35| Rain | Rain until morni...| 0.1299| 1.0| 0.94| 11.98| 11.98| 1543291200| 4.786| 47.3| 1543251600| 42.1| 1543298400| 43.92| 1543251600| 36.2| 1543291200| rain | 41.83| 1003.97| 90| 1.0| 0| 4.786|291.1| 1543232969|1543266992| 0.64| 0.13| 1543251600| 40.49| 1543233600| 47.3| 1543251600| 36.2| 1543291200| 43.92| 1543251600|\n",
+ "|981a3613-77af-462...|1.543366822198E9| 1| 28| 11|2018-11-28 01:00:22|America/New_York|Haymarket Square| North Station| Lyft| lyft| Lyft| 7| 0.44| 1.0| 42.2148| -71.033| 38.33| 32.93| Clear | Light rain in th...| 0.0| 0.0| 0.75| 7.33| 7.33| 1543334400| 10.0| 47.55| 1543320000| 33.1| 1543402800| 44.12| 1543320000| 29.11| 1543392000| clear-night | 31.1| 992.28| 240| 0.03| 0| 10.0|315.7| 1543319437|1543353364| 0.68| 0.1064| 1543338000| 35.36| 1543377600| 47.55| 1543320000| 31.04| 1543377600| 44.12| 1543320000|\n",
+ "|c2d88af2-d278-4bf...|1.543553582749E9| 4| 30| 11|2018-11-30 04:53:02|America/New_York|Haymarket Square| North Station| Lyft| lyft_luxsuv|Lux Black XL| 26| 0.44| 1.0| 42.2148| -71.033| 34.38| 29.63| Clear | Partly cloudy th...| 0.0| 0.0| 0.73| 5.28| 5.28| 1543514400| 10.0| 45.03| 1543510800| 28.9| 1543579200| 38.53| 1543510800| 26.2| 1543575600| clear-night | 26.64| 1013.73| 310| 0.0| 0| 10.0|291.1| 1543492370|1543526114| 0.75| 0.0| 1543507200| 34.67| 1543550400| 45.03| 1543510800| 30.3| 1543550400| 38.53| 1543510800|\n",
+ "|e0126e1f-8ca9-4f2...|1.543463360223E9| 3| 29| 11|2018-11-29 03:49:20|America/New_York|Haymarket Square| North Station| Lyft| lyft_plus| Lyft XL| 9| 0.44| 1.0| 42.2148| -71.033| 37.44| 30.88| Partly Cloudy | Mostly cloudy th...| 0.0| 0.0| 0.7| 9.14| 9.14| 1543446000| 10.0| 42.18| 1543420800| 36.71| 1543478400| 35.75| 1543420800| 30.29| 1543460400| partly-cloudy-ni...| 28.61| 998.36| 303| 0.44| 0| 10.0|347.7| 1543405904|1543439738| 0.72| 1.0E-4| 1543420800| 33.1| 1543402800| 42.18| 1543420800| 29.11| 1543392000| 35.75| 1543420800|\n",
+ "|f6f6d7e4-3e18-492...|1.545071112138E9| 18| 17| 12|2018-12-17 18:25:12|America/New_York|Haymarket Square| North Station| Lyft| lyft_lux| Lux Black| 16.5| 0.44| 1.0| 42.2148| -71.033| 38.75| 33.51| Overcast | Light rain in th...| 0.0| 0.0| 0.84| 7.19| 8.88| 1545022800| 8.325| 40.61| 1545076800| 24.07| 1545130800| 34.97| 1545080400| 12.04| 1545134400| cloudy | 34.41| 1000.46| 294| 1.0| 1| 8.325|335.8| 1545048523|1545081282| 0.33| 0.0221| 1545066000| 34.19| 1545048000| 40.66| 1545022800| 27.39| 1545044400| 34.97| 1545080400|\n",
+ "|462816a3-820d-408...| 1.5432085802E9| 5| 26| 11|2018-11-26 05:03:00|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_plus| Lyft XL| 10.5| 1.08| 1.0| 42.3503| -71.081| 41.99| 41.99| Overcast | Rain until morni...| 0.0| 0.0| 0.91| 0.53| 0.88| 1543287600| 4.675| 46.46| 1543255200| 42.17| 1543298400| 43.81| 1543251600| 37.08| 1543298400| cloudy | 39.54| 1014.11| 91| 1.0| 0| 4.675|312.3| 1543233004|1543266980| 0.64| 0.1245| 1543251600| 40.67| 1543233600| 46.46| 1543255200| 37.45| 1543291200| 43.81| 1543251600|\n",
+ "|474d6376-bc59-4ec...|1.543780384677E9| 19| 2| 12|2018-12-02 19:53:04|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_lux| Lux Black| 16.5| 1.08| 1.0| 42.3503| -71.081| 49.88| 49.22| Light Rain | Light rain until...| 0.0246| 1.0| 0.93| 3.38| 3.38| 1543755600| 3.052| 50.8| 1543788000| 44.97| 1543816800| 50.13| 1543788000| 45.62| 1543816800| rain | 48.02| 1004.33| 159| 1.0| 0| 3.052|282.5| 1543751798|1543785242| 0.86| 0.0916| 1543770000| 36.32| 1543726800| 50.8| 1543788000| 35.84| 1543748400| 50.13| 1543788000|\n",
+ "|4f9fee41-fde3-476...|1.543818482645E9| 6| 3| 12|2018-12-03 06:28:02|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_line| Shared| 3| 1.08| 1.0| 42.3503| -71.081| 45.58| 45.58| Foggy | Foggy in the mor...| 0.0| 0.0| 0.96| 1.25| 2.09| 1543856400| 1.413| 57.02| 1543852800| 33.74| 1543921200| 56.35| 1543852800| 28.53| 1543914000| fog | 44.5| 1001.06| 307| 1.0| 0| 1.413|290.9| 1543838259|1543871628| 0.89| 4.0E-4| 1543852800| 43.09| 1543896000| 57.02| 1543852800| 39.9| 1543896000| 56.35| 1543852800|\n",
+ "|8612d909-98b8-445...|1.543315522249E9| 10| 27| 11|2018-11-27 10:45:22|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_luxsuv|Lux Black XL| 27.5| 1.08| 1.0| 42.3503| -71.081| 45.45| 41.77| Light Rain | Light rain in th...| 0.0624| 1.0| 0.93| 6.87| 7.42| 1543338000| 2.686| 46.91| 1543320000| 33.82| 1543399200| 44.01| 1543320000| 30.19| 1543399200| rain | 43.52| 989.98| 79| 1.0| 0| 2.686|296.2| 1543319472|1543353352| 0.68| 0.1425| 1543338000| 36.34| 1543377600| 46.91| 1543320000| 32.43| 1543377600| 44.01| 1543320000|\n",
+ "|9043bf77-1d45-4a9...|1.543594383882E9| 16| 30| 11|2018-11-30 16:13:03|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_premier| Lux| 13.5| 1.08| 1.0| 42.3503| -71.081| 40.13| 38.0| Clear | Mostly cloudy th...| 0.0| 0.0| 0.62| 3.46| 4.47| 1543554000| 9.92| 42.32| 1543600800| 31.57| 1543665600| 40.48| 1543611600| 28.1| 1543658400| clear-day | 27.99| 1016.84| 291| 0.12| 2| 9.92|269.9| 1543578871|1543612479| 0.79| 4.0E-4| 1543593600| 28.64| 1543579200| 42.32| 1543600800| 29.29| 1543579200| 40.48| 1543611600|\n",
+ "|d859ec69-b3ff-4af...|1.543432987778E9| 19| 28| 11|2018-11-28 19:23:07|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft| Lyft| 7| 1.08| 1.0| 42.3503| -71.081| 41.47| 35.66| Overcast | Mostly cloudy th...| 0.0| 0.0| 0.63| 9.54| 14.86| 1543431600| 10.0| 42.72| 1543438800| 37.59| 1543485600| 36.75| 1543438800| 32.25| 1543478400| cloudy | 29.72| 991.85| 295| 1.0| 0| 10.0|354.2| 1543405940|1543439725| 0.72| 0.0| 1543420800| 33.82| 1543399200| 42.72| 1543438800| 30.19| 1543399200| 36.75| 1543438800|\n",
+ "|009e9c53-074d-43c...|1.543615981179E9| 22| 30| 11|2018-11-30 22:13:01|America/New_York| North End| West End| Uber|6f72dfc5-27f1-42e...| UberXL| 12| 1.11| 1.0| 42.3647| -71.0542| 40.13| 38.08| Overcast | Mostly cloudy th...| 0.0| 0.0| 0.6| 3.38| 3.99| 1543554000| 9.833| 42.52| 1543600800| 31.71| 1543658400| 40.53| 1543611600| 28.06| 1543658400| cloudy | 27.31| 1017.16| 281| 1.0| 0| 9.833|281.8| 1543578867|1543612470| 0.79| 3.0E-4| 1543593600| 28.79| 1543579200| 42.52| 1543600800| 26.41| 1543575600| 40.53| 1543611600|\n",
+ "|23f145da-f0c1-4d1...|1.544698211014E9| 10| 13| 12|2018-12-13 10:50:11|America/New_York| North End| West End| Uber|6c84fd89-3f11-478...| Black| 16| 1.11| 1.0| 42.3647| -71.0542| 20.38| 20.38| Clear | Partly cloudy th...| 0.0| 0.0| 0.66| 2.94| 3.22| 1544738400| 9.831| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| clear-night | 10.87| 1031.51| 2| 0.03| 0| 9.831|327.3| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n",
+ "|357559cb-8c58-427...|1.544728503935E9| 19| 13| 12|2018-12-13 19:15:03|America/New_York| North End| West End| Uber|55c66225-fbe7-4fd...| UberX| 7.5| 1.11| 1.0| 42.3647| -71.0542| 32.85| 32.85| Mostly Cloudy | Partly cloudy th...| 0.0| 0.0| 0.56| 2.65| 3.83| 1544738400| 9.959| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| partly-cloudy-day | 18.66| 1033.65| 76| 0.64| 0| 9.959|330.8| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n",
+ "|50ef1165-9d23-416...| 1.54500451143E9| 23| 16| 12|2018-12-16 23:55:11|America/New_York| North End| West End| Uber|9a0e7b09-b92b-4c4...| WAV| 7.5| 1.11| 1.0| 42.3647| -71.0542| 41.29| 36.01| Light Rain | Rain throughout ...| 0.0567| 0.94| 0.86| 8.3| 8.3| 1545015600| 4.054| 43.83| 1544990400| 34.25| 1545044400| 38.38| 1544986800| 28.3| 1545044400| rain | 37.56| 1012.72| 57| 1.0| 0| 4.054|325.3| 1544962119|1544994839| 0.3| 0.1252| 1544979600| 39.22| 1544954400| 43.83| 1544990400| 33.98| 1545019200| 38.38| 1544986800|\n",
+ "|91c4861c-1780-42b...|1.544748007961E9| 0| 14| 12|2018-12-14 00:40:07|America/New_York| North End| West End| Uber|6d318bcc-22a3-4af...| Black SUV| 26| 1.11| 1.0| 42.3647| -71.0542| 31.25| 31.25| Overcast | Partly cloudy th...| 0.0| 0.0| 0.64| 2.62| 3.54| 1544738400| 10.0| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| cloudy | 20.53| 1035.06| 173| 0.91| 0| 10.0|326.7| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n",
+ "|e219e545-a006-493...|1.543519080802E9| 19| 29| 11|2018-11-29 19:18:00|America/New_York| North End| West End| Uber|997acbb5-e102-41e...| UberPool| 5.5| 1.11| 1.0| 42.3647| -71.0542| 43.49| 37.19| Mostly Cloudy | Partly cloudy th...| 0.0| 0.0| 0.52| 12.13| 19.97| 1543514400| 9.796| 44.61| 1543510800| 28.79| 1543579200| 38.21| 1543510800| 26.41| 1543575600| partly-cloudy-day | 26.83| 1007.12| 313| 0.53| 0| 9.796|309.7| 1543492402|1543526092| 0.75| 0.0| 1543510800| 35.35| 1543550400| 44.61| 1543510800| 31.14| 1543550400| 38.21| 1543510800|\n",
+ "|fa5fb705-03a0-4eb...|1.543673584211E9| 14| 1| 12|2018-12-01 14:13:04|America/New_York| North End| West End| Uber|8cf7e821-f0d3-49c...| Taxi| NA| 1.11| 1.0| 42.3647| -71.0542| 36.99| 32.27| Partly Cloudy | Light rain in th...| 0.0| 0.0| 0.68| 5.87| 6.26| 1543672800| 9.91| 44.66| 1543690800| 35.04| 1543712400| 43.99| 1543690800| 35.69| 1543712400| partly-cloudy-day | 27.53| 1022.32| 344| 0.44| 1| 9.91|280.1| 1543665331|1543698851| 0.82| 0.0| 1543683600| 31.71| 1543658400| 44.66| 1543690800| 28.06| 1543658400| 43.99| 1543690800|\n",
+ "|18d580ac-c91a-4b6...|1.544940911553E9| 6| 16| 12|2018-12-16 06:15:11|America/New_York| North Station| Haymarket Square| Lyft| lyft_plus| Lyft XL| 11| 0.72| 1.0| 42.3661| -71.0631| 40.36| 35.52| Clear | Rain throughout ...| 0.0| 0.0| 0.69| 7.08| 8.47| 1545015600| 10.0| 43.78| 1544990400| 34.12| 1545044400| 38.39| 1544986800| 28.21| 1545044400| clear-night | 30.94| 1022.94| 52| 0.06| 0| 10.0|298.7| 1544962122|1544994841| 0.3| 0.1246| 1544979600| 38.94| 1544954400| 43.78| 1544990400| 33.76| 1545019200| 38.39| 1544986800|\n",
+ "+--------------------+----------------+----+---+-----+-------------------+----------------+----------------+--------------------+--------+--------------------+------------+-----+--------+----------------+--------+---------+-----------+-------------------+---------------+--------------------+---------------+-----------------+--------+---------+--------+------------+----------+---------------+-------------------+--------------+------------------+-----------------------+---------------------------+----------------------+--------------------------+--------------------+--------+--------+-----------+----------+-------+------------+-----+-----------+----------+---------+------------------+-----------+--------------+------------------+--------------+------------------+----------------------+--------------------------+----------------------+--------------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank, concat, lit\n",
+ "from pyspark.sql.window import Window\n",
+ "import pyspark.pandas as ps\n",
+ "\n",
+ "\n",
+ "# Supply-side analysis\n",
+ "print(\"[공급자 측면 조사]\")\n",
+ "\n",
+ "## Routes (source -> destination) ranked by average price, highest first\n",
+ "print(\"[평균 가격이 많았던 출발지와 도착지]\")\n",
+ "# Route label shaped like '<source>-><destination>'.\n",
+ "route_label = concat(col('source'), lit('->'), col('destination'))\n",
+ "result_df = (\n",
+ "    raw_df\n",
+ "    .withColumn('src_to_dest', route_label)\n",
+ "    .groupBy('src_to_dest')\n",
+ "    .agg(avg('price').alias('avg_price'))\n",
+ "    .orderBy('avg_price', ascending=False)\n",
+ ")\n",
+ "result_df.show()\n",
+ "\n",
+ "### Bar chart of the result\n",
+ "route_prices = ps.DataFrame(result_df).set_index('src_to_dest')\n",
+ "route_prices.avg_price.plot.bar().show()\n",
+ "\n",
+ "\n",
+ "## Monthly revenue comparison between the ride-hailing companies\n",
+ "print(\"[월(month) 별 택시 회사 매출 비교]\")\n",
+ "ride_ts = col('datetime')\n",
+ "# Grouping key shaped like '<year>-<month>/<cab_type>', e.g. '2018-11/Lyft'.\n",
+ "month_and_company = concat(year(ride_ts), lit('-'), month(ride_ts), lit('/'), col('cab_type'))\n",
+ "result_df = (\n",
+ "    raw_df\n",
+ "    .withColumn('year_month_by_cab_type', month_and_company)\n",
+ "    .groupBy('year_month_by_cab_type')\n",
+ "    .agg(sum('price').alias('total_price'))\n",
+ "    .orderBy('year_month_by_cab_type')\n",
+ ")\n",
+ "result_df.show()\n",
+ "\n",
+ "### Bar chart of the result\n",
+ "monthly_revenue = ps.DataFrame(result_df).set_index('year_month_by_cab_type')\n",
+ "monthly_revenue.plot.bar(y='total_price').show()\n",
+ "\n",
+ "## Busiest pickup location for every hour\n",
+ "print(\"[매 시간(hour) 별 가장 콜이 많았던 장소]\")\n",
+ "from pyspark.sql.functions import date_format\n",
+ "\n",
+ "# Rank the sources inside each hour bucket by ride count, busiest first.\n",
+ "windowSpec = (\n",
+ "    Window\n",
+ "    .partitionBy('date_hour')\n",
+ "    .orderBy(col('count').desc())\n",
+ "    .rowsBetween(Window.unboundedPreceding, Window.currentRow)\n",
+ ")\n",
+ "\n",
+ "result_df = (\n",
+ "    raw_df\n",
+ "    # Zero-padded hour (e.g. '2018-11-26/03') so the string key sorts chronologically;\n",
+ "    # with an unpadded hour '2018-11-26/3' sorted after '2018-11-26/23'.\n",
+ "    .withColumn('date_hour', date_format(col('datetime'), 'yyyy-MM-dd/HH'))\n",
+ "    .groupBy('date_hour', 'source')\n",
+ "    .agg(count('source').alias('count'))\n",
+ "    .withColumn('rank', rank().over(windowSpec))\n",
+ "    # Filter while 'rank' is still a column, then project.\n",
+ "    # rank() keeps every source tied for first place within an hour.\n",
+ "    .filter(col('rank') == 1)\n",
+ "    # 'count' is kept because the chart below sizes its markers with it.\n",
+ "    .select('date_hour', 'source', 'count')\n",
+ "    .orderBy('date_hour', 'source')\n",
+ ")\n",
+ "\n",
+ "result_df.show()\n",
+ "\n",
+ "### Scatter chart of the result\n",
+ "# plotly express takes the marker-size column through `size`.\n",
+ "(\n",
+ "    ps.DataFrame(result_df)\n",
+ "    .plot\n",
+ "    .scatter(x='date_hour', y='source', size='count')\n",
+ "    .show()\n",
+ ")\n",
+ "\n",
+ "\n",
+ "### 검증용\n",
+ "# raw_df.withColumn('date', to_date(col('datetime'))) \\\n",
+ "# .withColumn('hour', hour(col('datetime'))) \\\n",
+ "# .groupBy('date', 'hour', 'source') \\\n",
+ "# .agg(count('source').alias('count')) \\\n",
+ "# .orderBy('date', 'hour', col('count').desc()) \\\n",
+ "# .show(40)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "collapsed": true,
+ "id": "eR6y6Rqfc_-5",
+ "outputId": "9d91be6d-09a5-4065-85c3-2a6614ec3c2a"
+ },
+ "execution_count": 28,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[공급자 측면 조사]\n",
+ "[평균 가격이 많았던 출발지와 도착지]\n",
+ "+--------------------+------------------+\n",
+ "| src_to_dest| avg_price|\n",
+ "+--------------------+------------------+\n",
+ "|Financial Distric...|25.029096477794795|\n",
+ "|Boston University...|24.039182282793867|\n",
+ "|Financial Distric...|23.626237623762375|\n",
+ "|Fenway->Financial...|23.088291746641076|\n",
+ "|Northeastern Univ...|22.499134948096884|\n",
+ "|Financial Distric...|21.520358306188925|\n",
+ "|Theatre District-...| 20.76215277777778|\n",
+ "|Boston University...|20.310986964618248|\n",
+ "| North End->Back Bay|19.762027491408936|\n",
+ "| Back Bay->North End| 19.73857404021938|\n",
+ "|South Station->Ba...|19.439338235294116|\n",
+ "|Fenway->North Sta...|19.414495114006513|\n",
+ "|Northeastern Univ...|19.221014492753625|\n",
+ "|North Station->Bo...|19.073701842546065|\n",
+ "|Theatre District-...|19.066096423017107|\n",
+ "|North Station->No...| 19.06513409961686|\n",
+ "|West End->Northea...|18.954682779456192|\n",
+ "|Boston University...| 18.91390728476821|\n",
+ "|Boston University...| 18.82960413080895|\n",
+ "|North Station->Fe...|18.549924357034797|\n",
+ "+--------------------+------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[월(month) 별 택시 회사 매출 비교]\n",
+ "+----------------------+-----------+\n",
+ "|year_month_by_cab_type|total_price|\n",
+ "+----------------------+-----------+\n",
+ "| 2018-11/Lyft| 154411.5|\n",
+ "| 2018-11/Uber| 141712.5|\n",
+ "| 2018-12/Lyft| 204014.0|\n",
+ "| 2018-12/Uber| 193503.0|\n",
+ "+----------------------+-----------+\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[매 시간(hour) 별 가장 콜이 많았던 장소]\n",
+ "+-------------+--------------------+\n",
+ "| date_hour| source|\n",
+ "+-------------+--------------------+\n",
+ "|2018-11-26/10| Back Bay|\n",
+ "|2018-11-26/10| Boston University|\n",
+ "|2018-11-26/11|Northeastern Univ...|\n",
+ "|2018-11-26/12| Back Bay|\n",
+ "|2018-11-26/13| Theatre District|\n",
+ "|2018-11-26/14| North Station|\n",
+ "|2018-11-26/15| Haymarket Square|\n",
+ "|2018-11-26/16| North Station|\n",
+ "|2018-11-26/17| South Station|\n",
+ "|2018-11-26/17| Fenway|\n",
+ "|2018-11-26/18| Theatre District|\n",
+ "|2018-11-26/19| Financial District|\n",
+ "|2018-11-26/20|Northeastern Univ...|\n",
+ "|2018-11-26/21| Beacon Hill|\n",
+ "|2018-11-26/22|Northeastern Univ...|\n",
+ "|2018-11-26/23| North Station|\n",
+ "| 2018-11-26/3| Theatre District|\n",
+ "| 2018-11-26/3| Haymarket Square|\n",
+ "| 2018-11-26/3| North Station|\n",
+ "| 2018-11-26/3| South Station|\n",
+ "+-------------+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank\n",
+ "from pyspark.sql.window import Window\n",
+ "\n",
+ "# Demand-side analysis\n",
+ "print(\"[수요자 측면 조사]\")\n",
+ "\n",
+ "## Average price per company for every source/destination pair\n",
+ "print(\"[각 회사별 출발지, 목적지 평균 가격]\")\n",
+ "# Chart label shaped like '<cab_type>(<source>-><destination>)'.\n",
+ "company_route_label = concat(\n",
+ "    col('cab_type'), lit('('), col('source'), lit('->'), col('destination'), lit(')')\n",
+ ")\n",
+ "result_df = (\n",
+ "    raw_df\n",
+ "    .withColumn('src_to_dest_by_cab_type', company_route_label)\n",
+ "    .groupBy('cab_type', 'source', 'destination', 'src_to_dest_by_cab_type')\n",
+ "    .agg(avg('price').alias('avg_price'))\n",
+ "    .orderBy('source', 'destination', 'cab_type')\n",
+ ")\n",
+ "result_df.show(20)\n",
+ "\n",
+ "### Bar chart of the result\n",
+ "company_route_prices = ps.DataFrame(result_df).set_index('src_to_dest_by_cab_type')\n",
+ "company_route_prices.plot.bar(y='avg_price').show()\n",
+ "\n",
+ "\n",
+ "## Company with the most rides from each pickup location\n",
+ "print(\"[각 출발지 별로 콜이 잘 잡히는 회사]\")\n",
+ "\n",
+ "# Rank the companies inside each source by ride count, busiest first.\n",
+ "windowSpec = (\n",
+ "    Window\n",
+ "    .partitionBy('source')\n",
+ "    .orderBy(col('count').desc())\n",
+ "    .rowsBetween(Window.unboundedPreceding, Window.currentRow)\n",
+ ")\n",
+ "\n",
+ "(\n",
+ "    raw_df\n",
+ "    .groupBy('source', 'cab_type')\n",
+ "    # Name the aggregate column explicitly: DataFrame.alias() labels the whole\n",
+ "    # frame, not a column, so it cannot be used to name the count.\n",
+ "    .agg(count('*').alias('count'))\n",
+ "    .withColumn('rank', rank().over(windowSpec))\n",
+ "    # rank() keeps every company tied for first place within a source.\n",
+ "    .filter(col('rank') == 1)\n",
+ "    .select('source', 'cab_type', 'count')\n",
+ "    .orderBy('source')\n",
+ "    .show()\n",
+ ")\n",
+ "\n",
+ "### 검증용\n",
+ "# raw_df \\\n",
+ "# .groupBy('source', 'cab_type') \\\n",
+ "# .count().alias('count') \\\n",
+ "# .orderBy('source', 'cab_type', 'count') \\\n",
+ "# .show(50)\n",
+ "\n",
+ "## Most cost-effective company\n",
+ "print(\"[가성비가 좋은 택시 회사]\")\n",
+ "# Per-ride price divided by distance, averaged per company; lowest average first.\n",
+ "price_per_distance = col('price') / col('distance')\n",
+ "value_df = (\n",
+ "    raw_df\n",
+ "    .groupBy('cab_type')\n",
+ "    .agg(avg(price_per_distance).alias('avg_price_per_distance'))\n",
+ ")\n",
+ "value_df.orderBy('avg_price_per_distance').show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "RN-UVTb6dKRi",
+ "outputId": "9fbf1762-7e97-4958-f262-7d0bf2128a11"
+ },
+ "execution_count": 34,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[수요자 측면 조사]\n",
+ "[각 회사별 출발지, 목적지 평균 가격]\n",
+ "+--------+-----------+--------------------+-----------------------+------------------+\n",
+ "|cab_type| source| destination|src_to_dest_by_cab_type| avg_price|\n",
+ "+--------+-----------+--------------------+-----------------------+------------------+\n",
+ "| Lyft| Back Bay| Boston University| Lyft(Back Bay->Bo...|14.235887096774194|\n",
+ "| Uber| Back Bay| Boston University| Uber(Back Bay->Bo...| 13.0688202247191|\n",
+ "| Lyft| Back Bay| Fenway| Lyft(Back Bay->Fe...| 14.97003745318352|\n",
+ "| Uber| Back Bay| Fenway| Uber(Back Bay->Fe...|13.309602649006623|\n",
+ "| Lyft| Back Bay| Haymarket Square| Lyft(Back Bay->Ha...|18.686974789915965|\n",
+ "| Uber| Back Bay| Haymarket Square| Uber(Back Bay->Ha...|16.791208791208792|\n",
+ "| Lyft| Back Bay| North End| Lyft(Back Bay->No...|21.535714285714285|\n",
+ "| Uber| Back Bay| North End| Uber(Back Bay->No...|18.203389830508474|\n",
+ "| Lyft| Back Bay|Northeastern Univ...| Lyft(Back Bay->No...|13.797297297297296|\n",
+ "| Uber| Back Bay|Northeastern Univ...| Uber(Back Bay->No...|12.613505747126437|\n",
+ "| Lyft| Back Bay| South Station| Lyft(Back Bay->So...|16.437037037037037|\n",
+ "| Uber| Back Bay| South Station| Uber(Back Bay->So...| 18.806640625|\n",
+ "| Lyft|Beacon Hill| Boston University| Lyft(Beacon Hill-...| 17.11919504643963|\n",
+ "| Uber|Beacon Hill| Boston University| Uber(Beacon Hill-...|15.539094650205762|\n",
+ "| Lyft|Beacon Hill| Fenway| Lyft(Beacon Hill-...|16.912225705329153|\n",
+ "| Uber|Beacon Hill| Fenway| Uber(Beacon Hill-...| 15.5893536121673|\n",
+ "| Lyft|Beacon Hill| Haymarket Square| Lyft(Beacon Hill-...|14.200607902735563|\n",
+ "| Uber|Beacon Hill| Haymarket Square| Uber(Beacon Hill-...| 13.57312925170068|\n",
+ "| Lyft|Beacon Hill| North End| Lyft(Beacon Hill-...|15.623475609756097|\n",
+ "| Uber|Beacon Hill| North End| Uber(Beacon Hill-...|15.053763440860216|\n",
+ "+--------+-----------+--------------------+-----------------------+------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[각 출발지 별로 콜이 잘 잡히는 회사]\n",
+ "+--------------------+--------+-----+\n",
+ "| source|cab_type|count|\n",
+ "+--------------------+--------+-----+\n",
+ "| Back Bay| Uber| 2135|\n",
+ "| Beacon Hill| Uber| 1905|\n",
+ "| Boston University| Uber| 2056|\n",
+ "| Fenway| Uber| 2112|\n",
+ "| Financial District| Uber| 2058|\n",
+ "| Haymarket Square| Uber| 2171|\n",
+ "| North End| Uber| 2061|\n",
+ "| North Station| Uber| 1990|\n",
+ "|Northeastern Univ...| Uber| 2139|\n",
+ "| South Station| Uber| 1871|\n",
+ "| Theatre District| Uber| 2252|\n",
+ "| West End| Uber| 2076|\n",
+ "+--------------------+--------+-----+\n",
+ "\n",
+ "[가성비가 좋은 택시 회사]\n",
+ "+--------+----------------------+\n",
+ "|cab_type|avg_price_per_distance|\n",
+ "+--------+----------------------+\n",
+ "| Uber| 9.682674403228246|\n",
+ "| Lyft| 9.71325494447119|\n",
+ "+--------+----------------------+\n",
+ "\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file