Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move coursebook scraper methods #30

Merged
merged 3 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
<<<<<<< HEAD
=======
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
Expand Down Expand Up @@ -43,6 +41,7 @@ deploy_log.sh
.idea/
.vscode/
.firebase/
/api-tools

# output data and logs
data/
Expand Down
7 changes: 0 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,8 @@ github.com/bytedance/sonic v1.11.5/go.mod h1:X2PC2giUdj/Cv2lliWFLk6c/DUQok5rViJS
github.com/bytedance/sonic/loader v0.1.0/go.mod h1:UmRT+IRTGKz/DAkzcEGzyVqQFJ7H9BqwBO3pm9H/+HY=
github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d h1:x9d0XwRV3aWw1gAZtv0LrI39U+Efjp0mtyXRyikGb9Y=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg=
github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y=
github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
Expand Down Expand Up @@ -45,7 +40,6 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
Expand Down Expand Up @@ -147,7 +141,6 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
Expand Down
89 changes: 3 additions & 86 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,100 +6,17 @@ package scrapers

import (
"bytes"
"context"
"errors"
"fmt"
"log"
"net/http"
"os"
"strconv"
"strings"
"time"

"github.com/UTDNebula/api-tools/utils"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/joho/godotenv"
)

// initChromeDp creates the chromedp context shared by the scrapers and returns
// it together with a function that releases everything it allocated.
//
// When the HEADLESS_MODE environment variable parses as true, the context is
// built directly on context.Background(), which makes chromedp use its default
// allocator. Otherwise an exec allocator with no options is created explicitly.
// NOTE(review): per the chromedp docs the default allocator options include
// headless mode while an option-less exec allocator does not — confirm against
// the chromedp version pinned in go.mod.
func initChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
	log.Printf("Initializing chromedp...")

	// A missing or unparsable HEADLESS_MODE makes ParseBool return false, so
	// both cases fall through to the explicit exec allocator below.
	doHeadless, _ := strconv.ParseBool(os.Getenv("HEADLESS_MODE"))

	if doHeadless {
		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
	} else {
		allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background())
		browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
		chromedpCtx = browserCtx
		// The allocator's cancel function used to be discarded, which left the
		// allocator context alive forever. Chain it so a single call by the
		// caller tears down the browser context first and the allocator second.
		cancelFnc = func() {
			cancelBrowser()
			cancelAlloc()
		}
	}

	log.Printf("Initialized chromedp!")
	return chromedpCtx, cancelFnc
}

// refreshToken generates a fresh auth token and returns the request headers
// that carry it.
//
// It clears the browser's cookies, submits the login form at
// https://wat.utdallas.edu/login using the LOGIN_NETID and LOGIN_PASSWORD
// environment variables, then loads https://coursebook.utdallas.edu/ and
// collects every cookie the browser reports as "name=value" strings.
//
// The returned map contains fixed Host/User-Agent/Accept/Accept-Language/
// Content-Type/Connection values plus one "Cookie" entry per collected cookie.
//
// It panics if either environment variable is unset, if any browser step
// fails, or if no PTGSESSID cookie is present after loading Coursebook.
func refreshToken(chromedpCtx context.Context) map[string][]string {
	netID, present := os.LookupEnv("LOGIN_NETID")
	if !present {
		log.Panic("LOGIN_NETID is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_PASSWORD")
	if !present {
		log.Panic("LOGIN_PASSWORD is missing from .env!")
	}

	utils.VPrintf("Getting new token...")
	_, err := chromedp.RunResponse(chromedpCtx,
		// Drop any existing cookies before logging in again.
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://wat.utdallas.edu/login`),
		chromedp.WaitVisible(`form#login-form`),
		chromedp.SendKeys(`input#netid`, netID),
		chromedp.SendKeys(`input#password`, password),
		chromedp.WaitVisible(`input#login-button`),
		chromedp.Click(`input#login-button`),
		//chromedp.WaitVisible(`body`),
	)
	if err != nil {
		panic(err)
	}

	var cookieStrs []string
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.Navigate(`https://coursebook.utdallas.edu/`),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			if err != nil {
				// Report the real failure instead of letting it be replaced by
				// the generic "failed to get a new token" below.
				return fmt.Errorf("reading browser cookies: %w", err)
			}

			cookieStrs = make([]string, len(cookies))
			gotToken := false
			for i, cookie := range cookies {
				cookieStrs[i] = fmt.Sprintf("%s=%s", cookie.Name, cookie.Value)
				if cookie.Name == "PTGSESSID" {
					// NOTE(review): this writes the session token to the
					// verbose log — confirm that is acceptable.
					utils.VPrintf("Got new token: PTGSESSID = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return nil
		}),
	)
	if err != nil {
		panic(err)
	}

	return map[string][]string{
		"Host":            {"coursebook.utdallas.edu"},
		"User-Agent":      {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":          {"text/html"},
		"Accept-Language": {"en-US"},
		"Content-Type":    {"application/x-www-form-urlencoded"},
		"Cookie":          cookieStrs,
		"Connection":      {"keep-alive"},
	}
}

func ScrapeCoursebook(term string, startPrefix string, outDir string) {

// Load env vars
Expand All @@ -108,7 +25,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
}

// Start chromedp
chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

// Find index of starting prefix, if one has been given
Expand Down Expand Up @@ -156,7 +73,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
panic(err)
}
// Get a fresh token at the start of each new prefix because we can lol
coursebookHeaders := refreshToken(chromedpCtx)
coursebookHeaders := utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
// String builder to store accumulated course HTML data for both class levels
Expand Down Expand Up @@ -226,7 +143,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
utils.VPrintf("Got section: %s", id)
if sectionIndex%30 == 0 && sectionIndex != 0 {
// Ratelimit? What ratelimit?
coursebookHeaders = refreshToken(chromedpCtx)
coursebookHeaders = utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
}
Expand Down
2 changes: 1 addition & 1 deletion scrapers/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ var trailingSpaceRegex *regexp.Regexp = regexp.MustCompile(`(\s{2,}?\s{2,})|(\n)

func ScrapeEvents(outDir string) {

chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

err := os.MkdirAll(outDir, 0777)
Expand Down
2 changes: 1 addition & 1 deletion scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func scrapeProfessorLinks(chromedpCtx context.Context) []string {

func ScrapeProfiles(outDir string) {

chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

err := os.MkdirAll(outDir, 0777)
Expand Down
86 changes: 86 additions & 0 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,101 @@
package utils

import (
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
)

// InitChromeDp creates the chromedp (Chrome DevTools Protocol) context shared
// by the scrapers and returns it together with a function that releases
// everything it allocated. Callers should defer the returned cancel function.
//
// When the HEADLESS_MODE environment variable parses as true, the context is
// built directly on context.Background(), which makes chromedp use its default
// allocator. Otherwise an exec allocator with no options is created explicitly.
// NOTE(review): per the chromedp docs the default allocator options include
// headless mode while an option-less exec allocator does not — confirm against
// the chromedp version pinned in go.mod.
func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
	log.Printf("Initializing chromedp...")

	// A missing or unparsable HEADLESS_MODE makes ParseBool return false, so
	// both cases fall through to the explicit exec allocator below.
	doHeadless, _ := strconv.ParseBool(os.Getenv("HEADLESS_MODE"))

	if doHeadless {
		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
	} else {
		allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background())
		browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
		chromedpCtx = browserCtx
		// The allocator's cancel function used to be discarded, which left the
		// allocator context alive forever. Chain it so a single call by the
		// caller tears down the browser context first and the allocator second.
		cancelFnc = func() {
			cancelBrowser()
			cancelAlloc()
		}
	}

	log.Printf("Initialized chromedp!")
	return chromedpCtx, cancelFnc
}

// RefreshToken generates a fresh auth token and returns the request headers
// that carry it.
//
// It clears the browser's cookies, submits the login form at
// https://wat.utdallas.edu/login using the LOGIN_NETID and LOGIN_PASSWORD
// environment variables, then loads https://coursebook.utdallas.edu/ and
// collects every cookie the browser reports as "name=value" strings.
//
// The returned map contains fixed Host/User-Agent/Accept/Accept-Language/
// Content-Type/Connection values plus one "Cookie" entry per collected cookie.
//
// It panics if either environment variable is unset, if any browser step
// fails, or if no PTGSESSID cookie is present after loading Coursebook.
func RefreshToken(chromedpCtx context.Context) map[string][]string {
	netID, present := os.LookupEnv("LOGIN_NETID")
	if !present {
		log.Panic("LOGIN_NETID is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_PASSWORD")
	if !present {
		log.Panic("LOGIN_PASSWORD is missing from .env!")
	}

	VPrintf("Getting new token...")
	_, err := chromedp.RunResponse(chromedpCtx,
		// Drop any existing cookies before logging in again.
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://wat.utdallas.edu/login`),
		chromedp.WaitVisible(`form#login-form`),
		chromedp.SendKeys(`input#netid`, netID),
		chromedp.SendKeys(`input#password`, password),
		chromedp.WaitVisible(`input#login-button`),
		chromedp.Click(`input#login-button`),
		//chromedp.WaitVisible(`body`),
	)
	if err != nil {
		panic(err)
	}

	var cookieStrs []string
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.Navigate(`https://coursebook.utdallas.edu/`),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			if err != nil {
				// Report the real failure instead of letting it be replaced by
				// the generic "failed to get a new token" below.
				return fmt.Errorf("reading browser cookies: %w", err)
			}

			cookieStrs = make([]string, len(cookies))
			gotToken := false
			for i, cookie := range cookies {
				cookieStrs[i] = fmt.Sprintf("%s=%s", cookie.Name, cookie.Value)
				if cookie.Name == "PTGSESSID" {
					// NOTE(review): this writes the session token to the
					// verbose log — confirm that is acceptable.
					VPrintf("Got new token: PTGSESSID = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return nil
		}),
	)
	if err != nil {
		panic(err)
	}

	return map[string][]string{
		"Host":            {"coursebook.utdallas.edu"},
		"User-Agent":      {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":          {"text/html"},
		"Accept-Language": {"en-US"},
		"Content-Type":    {"application/x-www-form-urlencoded"},
		"Cookie":          cookieStrs,
		"Connection":      {"keep-alive"},
	}
}

// Encodes and writes the given data as tab-indented JSON to the given filepath.
func WriteJSON(filepath string, data interface{}) error {
fptr, err := os.Create(filepath)
Expand Down
Loading