pre-existing metadata dataframe scripts
drelliche committed Aug 23, 2023
1 parent 59d84cc commit a1ac6d4
Showing 2 changed files with 171 additions and 0 deletions.
123 changes: 123 additions & 0 deletions .github/scripts/process_metadata.sh
@@ -0,0 +1,123 @@
### This script creates the file assets/media/module_data.py containing all of the data needed for visualizing connections between modules.
### Run this script from the main education_modules directory.

metadata_df="assets/media/module_data.py"

### Set up the basics of creating a pandas dataframe
echo "import pandas as pd
df=pd.DataFrame()" > $metadata_df
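
### For illustration only (the module name and values below are hypothetical), the generated
### file ends up holding one assignment per metadata field for each module, e.g.:
###   df.loc["example_module", "title"] = "Example Module"
###   df.loc["example_module", "author"] = "A. Author"
###   df.loc["example_module", "estimated_time_in_minutes"] = "20"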


### Make every module a graph node

echo >> $metadata_df

for FOLDER in *
do
if [[ -s $FOLDER/$FOLDER.md && "$FOLDER" != "a_sample_module_template" ]] ## Only do this for folders that have a course .md file inside an identically named folder in education_modules
then
### pull the one-line macros
for CATEGORY in "title" "author" "estimated_time_in_minutes"
do
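### For example (hypothetical metadata line): "title: Intro to Example Topic" yields
### category_metadata="Intro to Example Topic" -- the first sed drops the leading "title:" key,
### the second trims any remaining leading spaces, and tr strips non-printing characters.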
category_metadata="`grep -m 1 "$CATEGORY": $FOLDER/$FOLDER.md | sed "s/^[^ ]* //" | sed "s/^[ ]* //" | tr -dc '[:print:]'`"
echo "df.loc[\"$FOLDER\", \"$CATEGORY\"] = \"$category_metadata\"" >> $metadata_df
done

### good_first_module is not yet everywhere, but will be a required field
if grep "good_first_module" -q $FOLDER/$FOLDER.md
then
good_first_module="`grep -m 1 good_first_module: $FOLDER/$FOLDER.md | sed "s/^[^ ]* //" | sed "s/^[ ]* //" | tr -dc '[:print:]'`"
echo "df.loc[\"$FOLDER\", \"good_first_module\"] = \"$good_first_module\" " >> $metadata_df
fi

### Coding metadata and sequence metadata will always be in some modules but not others
for CATEGORY in "coding_required" "coding_language" "coding_level" "sequence_name" "next_sequential_module" "data_task" "data_domain"
do
if grep "$CATEGORY" -q $FOLDER/$FOLDER.md
then
category_metadata="`grep -m 1 "$CATEGORY": $FOLDER/$FOLDER.md | sed "s/^[^ ]* //" | sed "s/^[ ]* //" | tr -dc '[:print:]'`"
echo "df.loc[\"$FOLDER\", \"$CATEGORY\"] = \"$category_metadata\"" >> $metadata_df
fi
done

#### TODO: Some comments and long descriptions contain double quotes, which would break the generated Python string literals. For the moment they are replaced with the character +.
comment="`grep -m 1 comment: $FOLDER/$FOLDER.md | sed "s/^[^ ]* //" | sed "s/^[ ]* //" | tr -dc '[:print:]' | tr '"' '+'`"
echo "df.loc[\"$FOLDER\", \"comment\"] = \"$comment\" " >> $metadata_df
long_description="`grep -m 1 long_description: $FOLDER/$FOLDER.md | sed "s/^[^ ]* //" | sed "s/^[ ]* //" | tr -dc '[:print:]' | tr '"' '+'`"
echo "df.loc[\"$FOLDER\", \"long_description\"] = \"$long_description\" " >> $metadata_df

### Start pulling the data from block macros. @learning_objectives is the only block macro expected in every module; the others are handled conditionally below. First find the line number where "@learning_objectives" first appears.
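### The block macro format being parsed is assumed to look roughly like this (hypothetical example):
###   @learning_objectives
###   - Explain what metadata the module collects
###   - Run the processing script end to end
###   @end
### Everything between the opening line and @end is captured below; newlines become '&' and double quotes become '+'.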
start=$(( $(grep -n -m 1 "@learning_objectives" $FOLDER/$FOLDER.md | cut -f1 -d:) +1 ))

end=$(( $(tail -n +$start $FOLDER/$FOLDER.md | grep -n -m 1 "@end" | cut -f1 -d:) - 1 ))
#### TODO figure out line breaks!
learning_objectives=$(tail -n +$start $FOLDER/$FOLDER.md | head -n $end | tr '\n' '&' | tr '"' '+')
echo "df.loc[\"$FOLDER\", \"learning_objectives\"] = \"$learning_objectives\" " >> $metadata_df

#### pre_reqs (The "IF" is because not every module has the prereqs in this format yet...)
if grep "@pre_reqs" -q $FOLDER/$FOLDER.md
then
start=$(( $(grep -n -m 1 "@pre_reqs" $FOLDER/$FOLDER.md | cut -f1 -d:) +1 ))

end=$(( $(tail -n +$start $FOLDER/$FOLDER.md | grep -n -m 1 "@end" | cut -f1 -d:) - 1 ))
#### TODO figure out line breaks!
pre_reqs=$(tail -n +$start $FOLDER/$FOLDER.md | head -n $end | tr '\n' '&' | tr '"' '+')
echo "df.loc[\"$FOLDER\", \"Prerequisties\"] = \"$pre_reqs\" " >> $metadata_df
fi

#### sets_you_up_for (The "IF" is because not every module has this yet...)
if grep "@sets_you_up_for" -q $FOLDER/$FOLDER.md
then
start=$(( $(grep -n -m 1 "@sets_you_up_for" $FOLDER/$FOLDER.md | cut -f1 -d:) +1 ))

end=$(( $(tail -n +$start $FOLDER/$FOLDER.md | grep -n -m 1 "@end" | cut -f1 -d:) - 1 ))

sets_you_up_for=$(tail -n +$start $FOLDER/$FOLDER.md | head -n $end | tr '\n' ' ' | tr '-' ' ')
echo "df.loc[\"$FOLDER\", \"Sets You Up For\"] = \"$sets_you_up_for\" " >> $metadata_df
fi

#### depends_on_knowledge_available_in (The "IF" is because not every module has this yet...)
if grep "@depends_on_knowledge_available_in" -q $FOLDER/$FOLDER.md
then
start=$(( $(grep -n -m 1 "@depends_on_knowledge_available_in" $FOLDER/$FOLDER.md | cut -f1 -d:) +1 ))

end=$(( $(tail -n +$start $FOLDER/$FOLDER.md | grep -n -m 1 "@end" | cut -f1 -d:) - 1 ))

depends_on_knowledge_available_in=$(tail -n +$start $FOLDER/$FOLDER.md | head -n $end | tr '\n' ' ' | tr '-' ' ')
echo "df.loc[\"$FOLDER\", \"Depends On Knowledge In\"] = \"$depends_on_knowledge_available_in\" " >> $metadata_df
fi
fi
done

### Find all links to other modules:

echo "df[\"Linked Courses\"] = [list() for x in range(len(df.index))]" >> $metadata_df

for FOLDER in *
do
if [[ -s $FOLDER/$FOLDER.md && "$FOLDER" != "a_sample_module_template" ]]
then
echo "a = df.loc[\"$FOLDER\", \"Linked Courses\"]" >> $metadata_df
for LINKED_COURSE in *
do
if [[ -s $LINKED_COURSE/$LINKED_COURSE.md && "$LINKED_COURSE" != "a_sample_module_template" && "$LINKED_COURSE" != "$FOLDER" ]]
then
# echo $FOLDER, $LINKED_COURSE
if [ "$(grep -c $LINKED_COURSE $FOLDER/$FOLDER.md)" -ge 1 ]
then
echo "a.append(\"$LINKED_COURSE\")" >> $metadata_df

fi

fi
done
echo "df.at[\"$FOLDER\", \"Linked Courses\"] = list(a)" >> $metadata_df
fi
done

### Debugging code, modify as needed:

#echo "print(df.loc[:,[\"coding_required\", \"coding_language\", \"coding_level\", \"sequence_name\", \"next_sequential_module\"]])">>$metadata_df
#
#
#python assets/module_discovery_app/module_data.py
48 changes: 48 additions & 0 deletions .github/workflows/pull_metadata.yml
@@ -0,0 +1,48 @@
# This is a workflow to pull all of the metadata from all of the modules into a single file.
# Each time something is merged to main, this workflow will run and rebuild the metadata file.

name: pull_metadata

# Controls when the workflow will run
on:
  # Triggers the workflow on push events (i.e. whenever something is merged) but only for the "main" branch
  push:
    branches: [ "main" ]
    paths: ['Prompts/**', 'Weekly_Emails/Email_Text/**', 'scripts/**']

  #pull_request:
  #  branches: [ "main" ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "build"
  build:
    # The type of runner that the job will run on
    runs-on: ubuntu-latest

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v3

      # Runs a set of commands using the runner's shell
      - name: Run all scripts
        run: |
          bash .github/scripts/process_metadata.sh
      - name: Commit newly updated files
        run: |
          git config --local user.name actions-user
          git config --local user.email "[email protected]"
          git fetch
          git add *
          git commit -am "update metadata records"
          git push -f origin main
