Skip to content

Commit

Permalink
Merge pull request #293 from jakopako/jakopako/issue218
Browse files Browse the repository at this point in the history
Add debug flag - wip
  • Loading branch information
jakopako authored Apr 20, 2024
2 parents 581082f + d27f68e commit de29d88
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 54 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@ jobs:
fail-fast: false
matrix:
language: ["go"]
go-version: ["1.22"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://git.io/codeql-language-support

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
Expand All @@ -66,5 +72,5 @@ jobs:
# make bootstrap
# make release

- name: Perform CodeQL Analysis
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
2 changes: 1 addition & 1 deletion .github/workflows/go-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
go-version: ["1.18", "1.19"]
go-version: ["1.22"]

steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
PACKAGE_NAME := github.com/jakopako/goskyr
GOLANG_CROSS_VERSION ?= v1.19
GOLANG_CROSS_VERSION ?= v1.22

.PHONY: release-dry-run
release-dry-run:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ Note that the machine learning feature is rather new and might not always work w

## Manual Configuration & Usage

Despite the option to automatically generate a configuration file for goskyr there are a lot more options that can be configured manually.
Despite the option to automatically generate a configuration file for goskyr there are a lot more options that can be configured manually. Note that while writing and testing a new configuration it might make sense to use the `-debug` flag when running goskyr, to enable more detailed logging and have the scraped HTML written to files.

A very simple configuration would look something like this:

Expand Down
4 changes: 4 additions & 0 deletions fetch/fetcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io"
"log/slog"
"net/http"
"time"

Expand All @@ -28,6 +29,7 @@ type StaticFetcher struct {
}

func (s *StaticFetcher) Fetch(url string, opts FetchOpts) (string, error) {
slog.Debug("fetching page", slog.String("fetcher", "static"), slog.String("url", url), slog.String("user-agent", s.UserAgent))
var resString string
client := &http.Client{}

Expand Down Expand Up @@ -89,6 +91,8 @@ func (d *DynamicFetcher) Cancel() {
}

func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", url))
logger.Debug("fetching page", slog.String("user-agent", d.UserAgent))
// start := time.Now()
ctx, cancel := chromedp.NewContext(d.allocContext)
// ctx, cancel := chromedp.NewContext(d.allocContext,
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/jakopako/goskyr

go 1.19
go 1.22

require (
github.com/PuerkitoBio/goquery v1.9.1
Expand Down
60 changes: 43 additions & 17 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package main
import (
"flag"
"fmt"
"log"
"log/slog"
"math"
"os"
"sync"
Expand All @@ -18,19 +18,21 @@ import (
var version = "dev"

func worker(sc chan scraper.Scraper, ic chan map[string]interface{}, gc *scraper.GlobalConfig, threadNr int) {
workerLogger := slog.With(slog.Int("thread", threadNr))
for s := range sc {
log.Printf("thread %d: scraping %s\n", threadNr, s.Name)
scraperLogger := workerLogger.With(slog.String("name", s.Name))
scraperLogger.Info("starting scraping task")
items, err := s.GetItems(gc, false)
if err != nil {
log.Printf("%s ERROR: %s", s.Name, err)
scraperLogger.Error(fmt.Sprintf("%s: %s", s.Name, err))
continue
}
log.Printf("thread %d: fetched %d %s items\n", threadNr, len(items), s.Name)
scraperLogger.Info(fmt.Sprintf("fetched %d items", len(items)))
for _, item := range items {
ic <- item
}
}
log.Printf("thread %d: done working\n", threadNr)
workerLogger.Info("done working")
}

func main() {
Expand All @@ -41,11 +43,12 @@ func main() {
generateConfig := flag.String("g", "", "Automatically generate a config file for the given url.")
m := flag.Int("m", 20, "The minimum number of items on a page. This is needed to filter out noise. Works in combination with the -g flag.")
f := flag.Bool("f", false, "Only show fields that have varying values across the list of items. Works in combination with the -g flag.")
d := flag.Bool("d", false, "Render JS before generating a configuration file. Works in combination with the -g flag.")
renderJs := flag.Bool("r", false, "Render JS before generating a configuration file. Works in combination with the -g flag.")
extractFeatures := flag.String("e", "", "Extract ML features based on the given configuration file (-c) and write them to the given file in csv format.")
wordsDir := flag.String("w", "word-lists", "The directory that contains a number of files containing words of different languages. This is needed for the ML part (use with -e or -b).")
buildModel := flag.String("t", "", "Train a ML model based on the given csv features file. This will generate 2 files, goskyr.model and goskyr.class")
modelPath := flag.String("model", "", "Use a pre-trained ML model to infer names of extracted fields. Works in combination with the -g flag.")
debug := flag.Bool("debug", false, "Prints debug logs and writes scraped html's to files.")

flag.Parse()

Expand All @@ -54,14 +57,27 @@ func main() {
return
}

var logLevel slog.Level
if *debug {
logLevel = slog.LevelDebug
} else {
logLevel = slog.LevelInfo
}

logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: logLevel}))
slog.SetDefault(logger)

if *generateConfig != "" {
slog.Debug("starting to generate config")
s := &scraper.Scraper{URL: *generateConfig}
if *d {
if *renderJs {
s.RenderJs = true
}
slog.Debug(fmt.Sprintf("analyzing url %s", s.URL))
err := automate.GetDynamicFieldsConfig(s, *m, *f, *modelPath, *wordsDir)
if err != nil {
log.Fatal(err)
slog.Error(fmt.Sprintf("%v", err))
os.Exit(1)
}
c := scraper.Config{
Scrapers: []scraper.Scraper{
Expand All @@ -70,41 +86,47 @@ func main() {
}
yamlData, err := yaml.Marshal(&c)
if err != nil {
log.Fatalf("Error while Marshaling. %v", err)
slog.Error(fmt.Sprintf("error while marshaling. %v", err))
os.Exit(1)
}

if *toStdout {
fmt.Println(string(yamlData))
} else {
f, err := os.Create(*configLoc)
if err != nil {
log.Fatalf("ERROR while trying to open file: %v", err)
slog.Error(fmt.Sprintf("error opening file: %v", err))
os.Exit(1)
}
defer f.Close()
_, err = f.Write(yamlData)
if err != nil {
log.Fatalf("ERROR while trying to write to file: %v", err)
slog.Error(fmt.Sprintf("error writing to file: %v", err))
os.Exit(1)
}
log.Printf("successfully wrote config to file %s", *configLoc)
slog.Info(fmt.Sprintf("successfully wrote config to file %s", *configLoc))
}
return
}

if *buildModel != "" {
if err := ml.TrainModel(*buildModel); err != nil {
log.Fatal(err)
slog.Error(fmt.Sprintf("%v", err))
os.Exit(1)
}
return
}

config, err := scraper.NewConfig(*configLoc)
if err != nil {
log.Fatal(err)
slog.Error(fmt.Sprintf("%v", err))
os.Exit(1)
}

if *extractFeatures != "" {
if err := ml.ExtractFeatures(config, *extractFeatures, *wordsDir); err != nil {
log.Fatal(err)
slog.Error(fmt.Sprintf("%v", err))
os.Exit(1)
}
return
}
Expand All @@ -125,7 +147,8 @@ func main() {
case output.FILE_WRITER_TYPE:
writer = output.NewFileWriter(&config.Writer)
default:
log.Fatalf("writer of type %s not implemented", config.Writer.Type)
slog.Error(fmt.Sprintf("writer of type %s not implemented", config.Writer.Type))
os.Exit(1)
}
}

Expand All @@ -139,6 +162,7 @@ func main() {
go func() {
for _, s := range config.Scrapers {
if *singleScraper == "" || *singleScraper == s.Name {
s.Debug = *debug
sc <- s
}
}
Expand All @@ -150,8 +174,9 @@ func main() {
if *singleScraper == "" {
nrWorkers = int(math.Min(20, float64(len(config.Scrapers))))
}
log.Printf("running with %d threads\n", nrWorkers)
slog.Info(fmt.Sprintf("running with %d threads\n", nrWorkers))
workerWg.Add(nrWorkers)
slog.Debug("starting workers")
for i := 0; i < nrWorkers; i++ {
go func(j int) {
defer workerWg.Done()
Expand All @@ -161,6 +186,7 @@ func main() {

// start writer
writerWg.Add(1)
slog.Debug("starting writer")
go func() {
defer writerWg.Done()
writer.Write(ic)
Expand Down
15 changes: 9 additions & 6 deletions output/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ import (
"encoding/json"
"fmt"
"io"
"log"
"log/slog"
"net/http"
"net/url"
"os"
"time"
)

Expand All @@ -25,6 +26,7 @@ func NewAPIWriter(wc *WriterConfig) *APIWriter {
}

func (f *APIWriter) Write(items chan map[string]interface{}) {
logger := slog.With(slog.String("writer", API_WRITER_TYPE))
client := &http.Client{
Timeout: time.Second * 10,
}
Expand All @@ -45,7 +47,7 @@ func (f *APIWriter) Write(items chan map[string]interface{}) {
// delete all items from the given source
firstDate, ok := item["date"].(time.Time)
if !ok {
log.Printf("error while trying to cast the date field of item %v to time.Time", item)
logger.Error(fmt.Sprintf("error while trying to cast the date field of item %v to time.Time", item))
continue
}
firstDateUTCF := firstDate.UTC().Format("2006-01-02 15:04")
Expand All @@ -54,15 +56,16 @@ func (f *APIWriter) Write(items chan map[string]interface{}) {
req.SetBasicAuth(apiUser, apiPassword)
resp, err := client.Do(req)
if err != nil {
log.Printf("error while deleting items from the api: %v\n", err)
logger.Error(fmt.Sprintf("error while deleting items from the api: %v\n", err))
continue
}
if resp.StatusCode != 200 {
body, err := io.ReadAll(resp.Body)
if err != nil {
log.Fatal(err)
logger.Error(fmt.Sprintf("%v", err))
}
log.Fatalf("error while deleting items. Status Code: %d\nUrl: %s Response: %s\n", resp.StatusCode, deleteURL, body)
logger.Error(fmt.Sprintf("error while deleting items. Status Code: %d\nUrl: %s Response: %s\n", resp.StatusCode, deleteURL, body))
os.Exit(1)
}
resp.Body.Close()
}
Expand All @@ -82,7 +85,7 @@ func (f *APIWriter) Write(items chan map[string]interface{}) {
nrItemsWritten = nrItemsWritten + len(batch)
}

log.Printf("wrote %d items from %d sources to the api", nrItemsWritten, len(deletedSources))
logger.Info(fmt.Sprintf("wrote %d items from %d sources to the api", nrItemsWritten, len(deletedSources)))
}

func postBatch(client *http.Client, batch []map[string]interface{}, apiURL, apiUser, apiPassword string) error {
Expand Down
19 changes: 11 additions & 8 deletions output/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package output
import (
"bytes"
"encoding/json"
"log"
"fmt"
"log/slog"
"os"
)

Expand All @@ -18,9 +19,11 @@ func NewFileWriter(wc *WriterConfig) *FileWriter {
}
}
func (fr *FileWriter) Write(items chan map[string]interface{}) {
logger := slog.With(slog.String("writer", FILE_WRITER_TYPE))
f, err := os.Create(fr.writerConfig.FilePath)
if err != nil {
log.Fatalf("FileWriter ERROR while trying to open file: %v", err)
logger.Error(fmt.Sprintf("error while trying to open file: %v", err))
os.Exit(1)
}
defer f.Close()
allItems := []map[string]interface{}{}
Expand All @@ -41,18 +44,18 @@ func (fr *FileWriter) Write(items chan map[string]interface{}) {
encoder := json.NewEncoder(buffer)
encoder.SetEscapeHTML(false)
if err := encoder.Encode(allItems); err != nil {
log.Printf("FileWriter ERROR while encoding items: %v", err)
logger.Error(fmt.Sprintf("error while encoding items: %v", err))
return
}

var indentBuffer bytes.Buffer
if err := json.Indent(&indentBuffer, buffer.Bytes(), "", " "); err != nil {
log.Printf("FileWriter ERROR while indenting json: %v", err)
logger.Error(fmt.Sprintf("error while indenting json: %v", err))
return
}
_, err = f.Write(indentBuffer.Bytes())
if err != nil {
log.Printf("FileWriter ERROR while writing json to file: %v", err)
if _, err = f.Write(indentBuffer.Bytes()); err != nil {
logger.Error(fmt.Sprintf("error while writing json to file: %v", err))
} else {
logger.Info(fmt.Sprintf("wrote %d items to file %s", len(allItems), fr.writerConfig.FilePath))
}
log.Printf("wrote %d items to file %s", len(allItems), fr.writerConfig.FilePath)
}
7 changes: 4 additions & 3 deletions output/stdout.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ import (
"bytes"
"encoding/json"
"fmt"
"log"
"log/slog"
)

type StdoutWriter struct{}

func (s *StdoutWriter) Write(items chan map[string]interface{}) {
logger := slog.With(slog.String("writer", STDOUT_WRITER_TYPE))
for item := range items {
// We cannot use the following line of code because it automatically replaces certain html characters
// with the corresponding Unicode replacement rune.
Expand All @@ -24,13 +25,13 @@ func (s *StdoutWriter) Write(items chan map[string]interface{}) {
encoder := json.NewEncoder(buffer)
encoder.SetEscapeHTML(false)
if err := encoder.Encode(item); err != nil {
log.Printf("StdoutWriter ERROR while writing item %v: %v", item, err)
logger.Error(fmt.Sprintf("error while writing item %v: %v", item, err))
continue
}

var indentBuffer bytes.Buffer
if err := json.Indent(&indentBuffer, buffer.Bytes(), "", " "); err != nil {
log.Printf("StdoutWriter ERROR while writing item %v: %v", item, err)
logger.Error(fmt.Sprintf("error while writing item %v: %v", item, err))
continue
}
fmt.Print(indentBuffer.String())
Expand Down
Loading

0 comments on commit de29d88

Please sign in to comment.