Skip to content

Commit

Permalink
Fix profile scraper, add HEADLESS_MODE env var
Browse files (browse the repository at this point in the history)
  • Loading branch information
jpahm committed Aug 22, 2024
1 parent 3347140 commit fc53624
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 19 deletions.
1 change: 1 addition & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#Scrapers
LOGIN_NETID=
LOGIN_PASSWORD=
HEADLESS_MODE=false

#Uploader
MONGODB_URI=
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Part of [Project Nebula](https://about.utdnebula.com).

### Prerequisites

- Golang 1.19 (or higher)
- Golang 1.23 (or higher)

### Development

Expand Down
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
module github.com/UTDNebula/api-tools

go 1.19
go 1.23

require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/UTDNebula/nebula-api/api v0.0.0-20240423212728-2ef02f280c6c
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d
github.com/chromedp/chromedp v0.9.5
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
github.com/chromedp/chromedp v0.10.0
github.com/joho/godotenv v1.5.1
go.mongodb.org/mongo-driver v1.15.0
)
Expand All @@ -26,7 +26,7 @@ require (
github.com/go-playground/validator/v10 v10.19.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.3.2 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gorilla/schema v1.3.0 // indirect
Expand All @@ -51,7 +51,7 @@ require (
golang.org/x/crypto v0.22.0 // indirect
golang.org/x/net v0.24.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.19.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
14 changes: 11 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,12 @@ github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d h1:x9d0XwRV3aWw1gAZtv0LrI39U+Efjp0mtyXRyikGb9Y=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg=
github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y=
github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/cloudwego/base64x v0.1.3 h1:b5J/l8xolB7dyDTTmhJP2oTs5LdrjyrUFuNxdfq5hAg=
Expand All @@ -30,6 +34,7 @@ github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm
github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
Expand All @@ -40,13 +45,15 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q=
github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/gorilla/schema v1.3.0 h1:rbciOzXAx3IB8stEFnfTwO3sYa6EWlQk79XdyustPDA=
github.com/gorilla/schema v1.3.0/go.mod h1:Dg5SSm5PV60mhF2NFaTV1xuYYj8tV8NOPRo4FggUMnM=
Expand Down Expand Up @@ -141,8 +148,8 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
Expand All @@ -159,6 +166,7 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
Expand Down
16 changes: 12 additions & 4 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"log"
"net/http"
"os"
"strconv"
"strings"
"time"

Expand All @@ -23,9 +24,15 @@ import (

// initChromeDp creates the chromedp browser context used by the scrapers and
// returns it together with a cancel function that releases everything this
// function allocated.
//
// The HEADLESS_MODE environment variable selects how the browser is started:
//   - If it is set and parses as true (see strconv.ParseBool), the context is
//     created directly from context.Background(), which leaves allocator
//     selection to chromedp. NOTE(review): per the chromedp docs this falls
//     back to the default exec allocator options, which include headless mode
//     — confirm against the pinned chromedp version.
//   - Otherwise (unset, false, or unparsable) an exec allocator with no
//     options is created explicitly and the browser context is derived from it.
//
// An unparsable, non-empty HEADLESS_MODE value is logged and treated as false
// rather than being silently ignored.
func initChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
	log.Printf("Initializing chromedp...")

	doHeadless := false
	if headlessEnv, present := os.LookupEnv("HEADLESS_MODE"); present && headlessEnv != "" {
		parsed, err := strconv.ParseBool(headlessEnv)
		if err != nil {
			log.Printf("Ignoring invalid HEADLESS_MODE value %q: %v", headlessEnv, err)
		}
		// ParseBool returns false alongside a non-nil error, so an invalid
		// value keeps the non-headless default.
		doHeadless = parsed
	}

	if doHeadless {
		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
	} else {
		allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background())
		browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
		chromedpCtx = browserCtx
		// Hand the caller a single cancel function that tears down the browser
		// context first and then the allocator, so the allocator's resources
		// are not leaked when the caller cancels.
		cancelFnc = func() {
			cancelBrowser()
			cancelAlloc()
		}
	}

	log.Printf("Initialized chromedp!")
	return chromedpCtx, cancelFnc
}

Expand All @@ -50,8 +57,9 @@ func refreshToken(chromedpCtx context.Context) map[string][]string {
chromedp.WaitVisible(`form#login-form`),
chromedp.SendKeys(`input#netid`, netID),
chromedp.SendKeys(`input#password`, password),
chromedp.WaitVisible(`input#login-button`),
chromedp.Click(`input#login-button`),
chromedp.WaitVisible(`body`),
//chromedp.WaitVisible(`body`),
)
if err != nil {
panic(err)
Expand Down
12 changes: 6 additions & 6 deletions scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func parseList(list []string) (string, schema.Location) {
var office schema.Location

for _, element := range list {
element = strings.Trim(element, " ")
element = strings.TrimSpace(element)
utils.VPrintf("Element is: %s", element)
if strings.Contains(element, "-") {
phoneNumber = element
Expand Down Expand Up @@ -168,7 +168,7 @@ func ScrapeProfiles(outDir string) {
chromedp.Navigate(link),
chromedp.ActionFunc(func(ctx context.Context) error {
var text string
err := chromedp.Text("//h2", &text).Do(ctx)
err := chromedp.Text("div.contact_info>h1", &text).Do(ctx)
firstName, lastName = parseName(text)
return err
}),
Expand Down Expand Up @@ -223,7 +223,7 @@ func ScrapeProfiles(outDir string) {
utils.VPrint("Scraping titles...")

err = chromedp.Run(chromedpCtx,
chromedp.QueryAfter("//h6",
chromedp.QueryAfter("div.profile-title",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
for _, node := range nodes {
tempText := getNodeText(node)
Expand Down Expand Up @@ -257,11 +257,11 @@ func ScrapeProfiles(outDir string) {
utils.VPrint("Scraping list text...")

err = chromedp.Run(chromedpCtx,
chromedp.QueryAfter("div.contact_info > div",
chromedp.QueryAfter("div.contact_info>div ~ div",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
var tempText string
err := chromedp.Text("div.contact_info > div", &tempText).Do(ctx)
texts = strings.Split(tempText, "")
err := chromedp.Text("div.contact_info>div ~ div", &tempText).Do(ctx)
texts = strings.Split(tempText, "\n")
return err
},
),
Expand Down

0 comments on commit fc53624

Please sign in to comment.