Merge pull request #2952 from cal-itp/ah_update_docs
Update docs to reference calitp_data_analysis package
amandaha8 authored Sep 29, 2023
2 parents 8516a1a + 5bbddc0 commit bfbc6a8
Showing 13 changed files with 111 additions and 65 deletions.
52 changes: 36 additions & 16 deletions docs/analytics_new_analysts/01-data-analysis-intro.md
@@ -82,8 +82,13 @@ Dataframe #3: `council_boundaries` (geospatial)
First, merge `paunch_locations` with `council_population` using the `CD` column, which they have in common.

```
merge1 = pd.merge(paunch_locations, council_population, on = 'CD',
    how = 'inner', validate = 'm:1')
merge1 = pd.merge(
    paunch_locations,
    council_population,
    on = 'CD',
    how = 'inner',
    validate = 'm:1'
)
# m:1 many-to-1 merge means that CD appears multiple times in
# paunch_locations, but only once in council_population.
@@ -92,8 +97,14 @@ merge1 = pd.merge(paunch_locations, council_population, on = 'CD',
Next, merge `merge1` and `council_boundaries`. Columns don't have to have the same names to be matched on, as long as they hold the same values.

```
merge2 = pd.merge(merge1, council_boundaries, left_on = 'CD',
    right_on = 'District', how = 'left', validate = 'm:1')
merge2 = pd.merge(
    merge1,
    council_boundaries,
    left_on = 'CD',
    right_on = 'District',
    how = 'left',
    validate = 'm:1'
)
```
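
To see what `validate = 'm:1'` protects against, here is a minimal, self-contained sketch (the miniature data is hypothetical, not from the tutorial):

```
import pandas as pd

# Tiny stand-ins for the tutorial's dataframes
locations = pd.DataFrame({'Store': [1, 2, 3], 'CD': [1, 1, 2]})
population = pd.DataFrame({'CD': [1, 2], 'Population': [1500, 2000]})

# Passes: CD repeats on the left but is unique on the right
ok = pd.merge(locations, population, on = 'CD', how = 'inner', validate = 'm:1')

# Raises pandas.errors.MergeError, because CD is no longer unique on the right
dupes = pd.concat([population, population])
try:
    pd.merge(locations, dupes, on = 'CD', how = 'inner', validate = 'm:1')
except pd.errors.MergeError as e:
    print(e)
```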

Here are some things to know about `merge2`:
@@ -247,7 +258,7 @@ sales_group = []
for row in paunch_locations['Sales_millions']:
    # If sales are more than $3M, but less than $5M, tag as moderate.
    if (row >= 3) & (row <= 5) :
    if (row >= 3) and (row <= 5):
        sales_group.append('moderate')
    # If sales are more than $5M, tag as high.
    elif row >=5:
@@ -256,6 +267,7 @@ for row in paunch_locations['Sales_millions']:
    else:
        sales_group.append('low')
paunch_locations['sales_group'] = sales_group
paunch_locations
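
As an aside, the same tags can be assigned without an explicit loop. A sketch using `np.select`, assuming the `paunch_locations` dataframe from the tutorial:

```
import numpy as np

s = paunch_locations['Sales_millions']
conditions = [(s >= 3) & (s <= 5), s > 5]

# The first matching condition wins, mirroring the if/elif order above
paunch_locations['sales_group'] = np.select(conditions, ['moderate', 'high'], default = 'low')
```
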
@@ -279,14 +291,22 @@ To answer the question of how many Paunch Burger locations there are per Council

```
# Method #1: groupby and agg
pivot = merge2.groupby(['CD', 'Geometry_y']).agg({'Sales_millions': 'sum',
    'Store': 'count', 'Population': 'mean'}).reset_index()
pivot = (merge2.groupby(['CD'])
         .agg({'Sales_millions': 'sum',
               'Store': 'count',
               'Population': 'mean'}
         ).reset_index()
)
# Method #2: pivot table
pivot = merge2.pivot_table(index= ['CD', 'Geometry_y'],
    values = ['Sales_millions', 'Store', 'Population'],
    aggfunc= {'Sales_millions': 'sum', 'Store': 'count',
              'Population': 'mean'}).reset_index()
pivot = merge2.pivot_table(
    index= ['CD'],
    values = ['Sales_millions', 'Store', 'Population'],
    aggfunc= {
        'Sales_millions': 'sum',
        'Store': 'count',
        'Population': 'mean'}
).reset_index()
# to only find one type of summary statistic, use aggfunc = 'sum'
@@ -296,11 +316,11 @@ pivot = merge2.pivot_table(index= ['CD', 'Geometry_y'],

`pivot` looks like this:

| CD | Geometry_y | Sales_millions | Store | Council_Member | Population |
| --- | ---------- | -------------- | ----- | --------------- | ---------- |
| 1 | polygon | $9 | 2 | Leslie Knope | 1,500 |
| 2 | polygon | $8.5 | 2 | Jeremy Jamm | 2,000 |
| 3 | polygon | $2.5 | 1 | Douglass Howser | 2,250 |
| CD | Sales_millions | Store | Council_Member | Population |
| --- | -------------- | ----- | --------------- | ---------- |
| 1 | $9 | 2 | Leslie Knope | 1,500 |
| 2 | $8.5 | 2 | Jeremy Jamm | 2,000 |
| 3 | $2.5 | 1 | Douglass Howser | 2,250 |
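
As an alternative to the two methods above, pandas' named aggregation produces the same summary and renames the outputs in one step; a sketch assuming the same `merge2` dataframe:

```
pivot = (merge2.groupby('CD')
         .agg(total_sales = ('Sales_millions', 'sum'),
              store_count = ('Store', 'count'),
              avg_population = ('Population', 'mean'))
         .reset_index())
```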

## Export Aggregated Output

2 changes: 2 additions & 0 deletions docs/analytics_new_analysts/02-data-analysis-intermediate.md
@@ -154,8 +154,10 @@ for key, value in dfs.items():
    # Use f string to define a variable join_df (result of our spatial join)
    ## join_{key} would be join_pawnee or join_tom in the loop
    join_df = f"join_{key}"
    # Spatial join
    join_df = gpd.sjoin(value, council_district, how = 'inner', op = 'intersects')
    # Calculate summary stats with groupby, agg, then save it into summary_dfs,
    # naming it 'pawnee' or 'tom'.
    summary_dfs[key] = join_df.groupby('ID').agg(
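
Since the hunk above is truncated, here is a self-contained sketch of the full loop pattern it illustrates; `dfs`, `council_district`, and the aggregated column names are assumptions drawn from the surrounding tutorials, and `predicate =` is the current geopandas keyword (older releases used `op =`):

```
import geopandas as gpd

summary_dfs = {}
for key, value in dfs.items():
    # Spatial join each gdf in the dictionary to the district boundaries
    join_df = gpd.sjoin(value, council_district, how = 'inner', predicate = 'intersects')
    # Aggregate and store the result under 'pawnee' or 'tom'
    summary_dfs[key] = join_df.groupby('ID').agg(
        {'Business': 'count', 'Sales_millions': 'sum'})
```
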
16 changes: 9 additions & 7 deletions docs/analytics_new_analysts/05-spatial-analysis-basics.md
@@ -42,15 +42,17 @@ gdf.to_file(driver = 'ESRI Shapefile', filename = '../folder/my_shapefile.shp' )
To read in our dataframe (df) and geodataframe (gdf) from GCS:

```
df = pd.read_csv('gs://calitp-analytics-data/data-analyses/bucket-name/my-csv.csv')
gdf = gpd.read_file('gs://calitp-analytics-data/data-analyses/bucket-name/my-geojson.geojson')
gdf = gpd.read_parquet('gs://calitp-analytics-data/data-analyses/bucket-name/my-geoparquet.parquet', engine= 'auto')
gdf = gpd.read_file('gs://calitp-analytics-data/data-analyses/bucket-name/my-shapefile.zip')
GCS_BUCKET = 'gs://calitp-analytics-data/data-analyses/bucket_name/'
df = pd.read_csv(f'{GCS_BUCKET}my-csv.csv')
gdf = gpd.read_file(f'{GCS_BUCKET}my-geojson.geojson')
gdf = gpd.read_parquet(f'{GCS_BUCKET}my-geoparquet.parquet')
gdf = gpd.read_file(f'{GCS_BUCKET}my-shapefile.zip')
# Write a file to GCS
gdf.to_file('gs://calitp-analytics-data/data-analyses/bucket-name/my-geojson.geojson', driver='GeoJSON')
gdf.to_file(f'{GCS_BUCKET}my-geojson.geojson', driver='GeoJSON')
#Using shared utils
# Using calitp_data_analysis
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
FILE_NAME = "test_geoparquet"
utils.geoparquet_gcs_export(gdf, GCS_FILE_PATH, FILE_NAME)
@@ -93,7 +95,7 @@ gdf.crs
gdf = gdf.to_crs('EPSG:2229')
```

Sometimes, the gdf does not have a CRS set and you will need to set it manually. This might occur if you create the `geometry` column from latitude and longitude points. More on this in the [intermediate tutorial](geo-intermediate):
Sometimes, the gdf does not have a CRS set and you will need to set it manually. This might occur if you create the `geometry` column from latitude and longitude points. More on this in the [intermediate tutorial](geo-intermediate).
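
A minimal sketch of setting a missing CRS by hand, using the EPSG codes this page already references:

```
# Declare what the coordinates already are (no reprojection happens here)
gdf = gdf.set_crs('EPSG:4326')

# Then reproject if a different CRS is needed
gdf = gdf.to_crs('EPSG:2229')
```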

There are [lots of different CRS available](https://epsg.io). The most common ones used for California are:

36 changes: 28 additions & 8 deletions docs/analytics_new_analysts/06-spatial-analysis-intro.md
@@ -115,16 +115,30 @@ The `join` gdf looks like this. We lost Stores 4 (Eagleton) and 7 (Indianapolis)
We want to count the number of Paunch Burger locations and their total sales within each District.

```
summary = join.pivot_table(index = ['District', 'Geometry_y'],
summary = join.pivot_table(
    index = ['District'],
    values = ['Store', 'Sales_millions'],
    aggfunc = {'Store': 'count', 'Sales_millions': 'sum'}).reset_index()
    aggfunc = {'Store': 'count', 'Sales_millions': 'sum'}
).reset_index()
OR
summary = join.groupby(['District', 'Geometry_y']).agg({'Store': 'count',
    'Sales_millions': 'sum'}).reset_index()
summary = (join.groupby(['District'])
           .agg({
               'Store': 'count',
               'Sales_millions': 'sum'}
           ).reset_index()
)
# Make sure to merge in district geometry again
summary = pd.merge(
    gdf,
    summary,
    on = 'District',
    how = 'inner'
)

summary.rename(columns = {'Geometry_y': 'Geometry'}, inplace = True)
summary
```

@@ -146,7 +160,7 @@ summary.to_file(driver = 'ESRI Shapefile',

## Buffers

Buffers are areas of a certain distance around a given point, line, or polygon. Buffers are used to determine <i> proximity </i>. A 5 mile buffer around a point would be a circle of 5 mile radius centered at the point. This [ESRI page](http://desktop.arcgis.com/en/arcmap/10.3/tools/analysis-toolbox/buffer.htm) shows how buffers for points, lines, and polygons look.
Buffers are areas of a certain distance around a given point, line, or polygon. Buffers are used to determine <i> proximity</i>. A 5 mile buffer around a point would be a circle of 5 mile radius centered at the point. This [ESRI page](http://desktop.arcgis.com/en/arcmap/10.3/tools/analysis-toolbox/buffer.htm) shows how buffers for points, lines, and polygons look.
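
As a quick sketch of the idea, assuming a gdf already projected to EPSG:2229 (whose units are US feet):

```
# 5 miles expressed in feet, to match the CRS units
five_miles = 5 * 5280

# Each point's geometry becomes a circle of 5-mile radius centered on it
gdf['geometry'] = gdf.geometry.buffer(five_miles)
```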

Some examples of questions that buffers help answer are:

@@ -202,11 +216,17 @@ homes_buffer['geometry'] = homes.geometry.buffer(two_miles)
Do a spatial join between `locations` and `homes_buffer`. Repeat the process of spatial join and aggregation in Python as illustrated in the previous section (spatial join and dissolve in ArcGIS).

```
sjoin = gpd.sjoin(locations, homes_buffer, how = 'inner', predicate = 'intersects')
sjoin = gpd.sjoin(
    locations,
    homes_buffer,
    how = 'inner',
    predicate = 'intersects'
)
sjoin
```

`sjoin` looks like this.
`sjoin` looks like this (without the `Geometry_y` column). Using `how='left'` or `how='inner'` keeps `Geometry_x` as the resulting geometry column, while `how='right'` keeps `Geometry_y`. Only one geometry column is returned in a spatial join, but you can choose which one by changing `how=`.

- Geometry_x is the point geometry from our left df `locations`.
- Geometry_y is the polygon geometry from our right df `homes_buffer`.
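
A small demonstration of swapping the kept geometry, reusing the names above:

```
# With how = 'inner' (or 'left'), the left frame's point geometry stays active;
# with how = 'right', the buffer polygons from the right frame are kept instead.
inner_join = gpd.sjoin(locations, homes_buffer, how = 'inner', predicate = 'intersects')
right_join = gpd.sjoin(locations, homes_buffer, how = 'right', predicate = 'intersects')
```
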
docs/analytics_new_analysts/07-spatial-analysis-intermediate.md
@@ -52,7 +52,8 @@ Then, create the `geometry` column. We use a lambda function and apply it to al
df.rename(columns = {'X': 'longitude', 'Y':'latitude'}, inplace=True)
# Create geometry column
gdf = gpd.points_from_xy(df.longitude, df.latitude, crs="EPSG:4326")
geom_col = gpd.points_from_xy(df.longitude, df.latitude, crs="EPSG:4326")
gdf = gpd.GeoDataFrame(df, geometry=geom_col, crs = "EPSG:4326")
# Project to different CRS. Pawnee is in Indiana, so we'll use EPSG:2965.
# In Southern California, use EPSG:2229.
@@ -152,8 +153,10 @@ for key, value in boundaries.items():
    # Define new variables using f string
    join_df = f"{key}_join"
    agg_df = f"{key}_summary"
    # Spatial join, but don't save it into the results dictionary
    join_df = gpd.sjoin(df, value, how = 'inner', predicate = 'intersects')
    # Aggregate and save results into results dictionary
    results[agg_df] = join_df.groupby('ID').agg(
        {'Business': 'count', 'Sales_millions': 'sum'})
12 changes: 6 additions & 6 deletions docs/analytics_new_analysts/08-spatial-analysis-advanced.md
@@ -23,12 +23,12 @@ from geoalchemy2 import WKTElement

There are six possible geometric shapes that are represented in geospatial data. [More description here.](http://postgis.net/workshops/postgis-intro/geometries.html#representing-real-world-objects)

- Point
- MultiPoint: collection of points
- LineString
- MultiLineString: collection of linestrings, which are disconnected from each other
- Polygon
- MultiPolygon: collection of polygons, which can be disconnected or overlapping from each other
- `Point`
- `MultiPoint`: collection of points
- `LineString`
- `MultiLineString`: collection of linestrings, which are disconnected from each other
- `Polygon`
- `MultiPolygon`: collection of polygons, which can be disconnected or overlapping from each other

The ArcGIS equivalents of these are just points, lines, and polygons.
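
A sketch of constructing each type with `shapely`, the library geopandas uses for geometries (the coordinates are arbitrary):

```
from shapely.geometry import (
    Point, MultiPoint, LineString, MultiLineString, Polygon, MultiPolygon)

pt = Point(0, 0)
multi_pt = MultiPoint([(0, 0), (2, 2)])
line = LineString([(0, 0), (1, 1)])
multi_line = MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]])
poly = Polygon([(0, 0), (1, 0), (1, 1)])
multi_poly = MultiPolygon([poly, Polygon([(2, 2), (3, 2), (3, 3)])])
```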

7 changes: 4 additions & 3 deletions docs/analytics_new_analysts/overview.md
@@ -4,7 +4,7 @@

This section is geared towards data analysts who are new to Python. The following tutorials highlight the most relevant Python skills used at Cal ITP. Use them to guide you through completing [practice exercises #1-9](https://github.com/cal-itp/data-analyses/tree/main/starter_kit).

## Content:
## Content

- [Data Analysis: Introduction](pandas-intro)
- [Data Analysis: Intermediate](pandas-intermediate)
@@ -15,18 +15,19 @@ This section is geared towards data analysts who are new to Python. The followin
- [Working with Geospatial Data: Intermediate](geo-intermediate)
- [Working with Geospatial Data: Advanced](geo-advanced)

## Additional Resources:
## Additional Resources

- If you are new to Python, take a look at [all the Python tutorials](https://www.linkedin.com/learning/search?keywords=python&u=36029164) available through Caltrans. There are many introductory Python courses, [such as this one](https://www.linkedin.com/learning/python-essential-training-18764650/getting-started-with-python?autoplay=true&u=36029164).
- [Joris van den Bossche's Geopandas Tutorial](https://github.com/jorisvandenbossche/geopandas-tutorial)
- [Practical Python for Data Science by Jill Cates](https://www.practicalpythonfordatascience.com/intro.html)
- [Ben-Gurion University of the Negev - Geometric operations](https://geobgu.xyz/py/geopandas2.html)
- [Geographic Thinking for Data Scientists](https://geographicdata.science/book/notebooks/01_geo_thinking.html)
- [PyGIS Geospatial Tutorials](https://pygis.io/docs/a_intro.html)
- [Python Courses, compiled by our team](https://docs.google.com/spreadsheets/d/1Omow8F0SUiMx1jyG7GpbwnnJ5yWqlLeMH7SMtKxwG80/edit?usp=sharing)
- [Why Dask?](https://docs.dask.org/en/stable/why.html)
- [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html)

### Books:
### Books

- [The Performance Stat Potential](https://www.brookings.edu/book/the-performancestat-potential/)
- [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do)
4 changes: 2 additions & 2 deletions docs/analytics_onboarding/overview.md
@@ -27,14 +27,14 @@
- [ ] **[notebooks.calitp.org](https://notebooks.calitp.org/)** - JupyterHub cloud-based notebooks for querying Python, SQL, R | ([Docs](jupyterhub-intro))
- [ ] **[dashboards.calitp.org](https://dashboards.calitp.org/)** - Metabase business insights & dashboards | ([Docs](metabase))
- [ ] **[dbt-docs.calitp.org](https://dbt-docs.calitp.org/)** - Documentation for the Cal-ITP data warehouse.
- [ ] **[analysis.calitp.org](https://analysis.calitp.org/)** - The Cal-ITP analytics portfolio website. | (Docs WIP)
- [ ] **[analysis.calitp.org](https://analysis.calitp.org/)** - The Cal-ITP analytics portfolio website.
- [ ] [**Google BigQuery**](https://console.cloud.google.com/bigquery) - Viewing the data warehouse and querying SQL

**Python Libraries:**

- [ ] **calitp-data-analysis** - Cal-ITP's internal Python library for analysis | ([Docs](calitp-data-analysis))
- [ ] **siuba** - Recommended data analysis library | ([Docs](siuba))
- [ ] [**shared_utils**](https://github.com/cal-itp/data-analyses/tree/main/_shared_utils) - A shared utilities library for the analytics team | ([Docs](shared-utils))
- [ ] [**shared_utils**](https://github.com/cal-itp/data-analyses/tree/main/_shared_utils) and [**calitp_data_analysis**](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis) - Shared utility libraries for the analytics team | ([Docs](shared-utils))

**Caltrans Employee Resources:**

2 changes: 2 additions & 0 deletions docs/analytics_tools/jupyterhub.md
@@ -60,6 +60,8 @@ gcloud config set project cal-itp-data-infra
gcloud auth application-default login
```

If you are still not able to connect, make sure you have the same set of permissions granted to other analysts.

### Increasing the Query Limit

By default, there is a query limit set within the Jupyter Notebook. Most queries should be within that limit, and running into `DatabaseError: 500 Query exceeded limit for bytes billed` should be a red flag to investigate whether such a large query is needed for the analysis. To increase the query limit, add and execute the following in your notebook:
9 changes: 4 additions & 5 deletions docs/analytics_tools/python_libraries.md
@@ -36,7 +36,7 @@ The following libraries are available and recommended for use by Cal-ITP data an

## shared utils

A set of shared utility functions can also be installed, similarly to any Python library. The [shared_utils](https://github.com/cal-itp/data-analyses/shared_utils) are stored here. Generalized functions for analysis are added as collaborative work evolves so we aren't constantly reinventing the wheel.
A set of shared utility functions can also be installed, similarly to any Python library. The `shared_utils` functions live in two places: [here](https://github.com/cal-itp/data-analyses/shared_utils) in `data-analyses`, which houses functions that are more likely to be updated, and [here](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis) in the `calitp_data_analysis` package in `data-infra`, which houses shared functions that are updated less frequently. Generalized functions for analysis are added as collaborative work evolves so we aren't constantly reinventing the wheel.

### In terminal

@@ -53,13 +53,12 @@ A set of shared utility functions can also be installed, similarly to any Python
### In notebook

```python
import shared_utils
from calitp_data_analysis import geography_utils

#example of using shared_utils
shared_utils.geography_utils.WGS84
geography_utils.WGS84
```

See [data-analyses/example_reports](https://github.com/cal-itp/data-analyses/tree/main/example_report) for examples on how to use `shared_utils` for general functions, charts, and maps.
See [data-analyses/starter_kit](https://github.com/cal-itp/data-analyses/tree/main/starter_kit) for examples on how to use `shared_utils` for general functions, charts, and maps.

(calitp-data-analysis)=

2 changes: 1 addition & 1 deletion docs/analytics_tools/saving_code.md
@@ -109,7 +109,7 @@ If you discover merge conflicts and they are within a single notebook that only

These are helpful Git commands an analyst might need, listed in no particular order.

- During collaboration, if another analyst already created a remote branch, and you want to work off of the same branch: `git fetch origin`, `git checkout -b our-project-branch origin/our-project-branch`
- During collaboration, if another analyst already created a remote branch, and you want to work off of the same branch: `git fetch origin`, `git switch -c our-project-branch origin/our-project-branch`
- To discard the changes you made to a file, `git checkout my-notebook.ipynb`, and you can revert back to the version that was last committed.
- Temporarily stash changes, move to a different branch, and come back and retain those changes: `git stash`, `git switch some-other-branch`, do stuff on the other branch, `git switch original-branch`, `git stash pop`
- Rename files and retain the version history associated (`mv` is move, and renaming is moving the file path): `git mv old-notebook.ipynb new-notebook.ipynb`
16 changes: 8 additions & 8 deletions docs/analytics_tools/storing_data.md
@@ -63,7 +63,7 @@ import geopandas as gpd
import gcsfs
import pandas as pd

from calitp_data.storage import get_fs
from calitp_data_analysis import get_fs
fs = get_fs()
```
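
`get_fs()` returns a `gcsfs` filesystem object, so the standard fsspec operations are available; a hedged sketch (the paths are placeholders):

```python
# List objects under a folder
fs.ls("gs://calitp-analytics-data/data-analyses/")

# Download a file locally, or upload one back to GCS
fs.get("gs://calitp-analytics-data/data-analyses/my-csv.csv", "./my-csv.csv")
fs.put("./my-csv.csv", "gs://calitp-analytics-data/data-analyses/my-csv.csv")
```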

@@ -112,12 +112,12 @@ gdf.to_parquet("./my-geoparquet.parquet")
fs.put("./my-geoparquet.parquet", f"{GCS_FOLDER}my-geoparquet.parquet")
```

Or, use the `shared_utils` package.
Or, use the `calitp_data_analysis` package that lives in [data-infra](https://github.com/cal-itp/data-infra/tree/main/packages/calitp-data-analysis/calitp_data_analysis).

```python
import shared_utils
from calitp_data_analysis import utils

shared_utils.utils.geoparquet_gcs_export(
utils.geoparquet_gcs_export(
    gdf,
    GCS_FOLDER,
    "my-geoparquet"
@@ -132,21 +132,21 @@ Refer to the [data catalogs doc](catalogue-cloud-storage) to list a zipped shape

Refer to the [data catalogs doc](catalogue-cloud-storage) to list a GeoJSON, and read in the GeoJSON with the `intake` method. GeoJSONs saved in GCS cannot be read in directly using `geopandas`.

Use the `shared_utils` package to read in or export geojsons.
Use the `calitp_data_analysis` package to read in or export geojsons.

```python
import shared_utils
from calitp_data_analysis import utils

GCS_FOLDER = "gs://calitp-analytics-data/data-analyses/task-subfolder/"

gdf = shared_utils.utils.read_geojson(
gdf = utils.read_geojson(
    GCS_FOLDER,
    "my-geojson.geojson",
    geojson_type = "geojson",
    save_locally = True
)

shared_utils.utils.geojson_gcs_export(
utils.geojson_gcs_export(
    gdf,
    GCS_FOLDER,
    "my-geojson.geojson",