Skip to content

Commit

Permalink
Added skeleton of the main program
Browse files Browse the repository at this point in the history
  • Loading branch information
MewX committed Nov 1, 2020
1 parent bc285a3 commit 67085eb
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 36 deletions.
3 changes: 1 addition & 2 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ go_library(
importpath = "github.com/its-my-data/doubak",
visibility = ["//visibility:private"],
deps = [
"//collector",
"//proto",
"@com_github_gocolly_colly_v2//:colly",
"//task",
],
)

Expand Down
9 changes: 0 additions & 9 deletions collector/BUILD.bazel

This file was deleted.

13 changes: 0 additions & 13 deletions collector/collector.go

This file was deleted.

88 changes: 76 additions & 12 deletions doubak.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package main

import (
"errors"
"flag"
"fmt"
"github.com/gocolly/colly/v2"
"github.com/its-my-data/doubak/collector"
p "github.com/its-my-data/doubak/proto"
"github.com/its-my-data/doubak/task"
"log"
"math"
"regexp"
"strings"
"time"
)

Expand Down Expand Up @@ -34,17 +36,79 @@ var requestDelay = flag.Duration(p.Flag_req_delay.String(), defaultRequestDelay,
"Min time between any two requests, used to reduce server load. This may "+
"be replaced by a QPS flag when proxy pool and parallel requests are implemented.")

// validateFlags splits the comma-separated task and category flag values into
// lists and checks every entry against the names known to the proto package.
//
// All whitespace is removed from each flag value before splitting, so
// "a, b" and "a,b" are equivalent. The task list keeps its input order.
// The first unknown name stops validation and is reported through err; the
// lists parsed up to that point are still returned alongside the error.
func validateFlags() (tasks []string, categories []string, err error) {
	whitespace := regexp.MustCompile(`\s`)

	// splitList strips every whitespace character and splits on commas.
	splitList := func(raw string) []string {
		return strings.Split(whitespace.ReplaceAllString(raw, ""), ",")
	}

	// Tasks: order is significant, so the slice is returned exactly as typed.
	tasks = splitList(*tasksToRun)
	for i := range tasks {
		if _, known := p.Task_value[tasks[i]]; !known {
			return tasks, nil, errors.New("unknown task name: " + tasks[i])
		}
	}

	// Categories: order is irrelevant, only membership is checked.
	categories = splitList(*targetCategories)
	for i := range categories {
		if _, known := p.Category_value[categories[i]]; !known {
			return tasks, categories, errors.New("unknown category name: " + categories[i])
		}
	}

	return tasks, categories, nil
}

// main parses the command-line flags, validates the task and category lists,
// builds one task implementation per requested task name, runs every task's
// Precheck, and finally executes the tasks in the order they were given.
// Any validation, precheck or execution error terminates the program via
// log.Fatal.
func main() {
flag.Parse()

// NOTE(review): this text appears to come from a rendered diff without +/-
// markers; this call and the inline colly crawl further down may be the
// pre-commit lines shown next to their replacements — confirm against the
// committed file.
collector.Collect()
// Precheck flags that need preprocessing.
log.Print("Validating flags... ")
tasks, categories, parseErr := validateFlags()
if parseErr != nil {
log.Print("FAILED")
log.Fatal(parseErr)
} else {
log.Print("PASS")
}

// Create selected tasks.
// The map is keyed by task name, so a name listed more than once shares a
// single entry.
taskMap := map[string]task.BaseInterface{}
for _, t := range tasks {
var taskImpl task.BaseInterface
switch t {
case p.Task_collect.String():
taskImpl = task.NewCollector(categories)

// TODO: add other tasks.
// case p.Task_parse:
// case p.Task_publish:
}
// NOTE(review): taskImpl stays nil for any task name the switch above does
// not handle; a nil entry would panic when Precheck/Execute is called on it
// below — confirm every name accepted by validateFlags has a case.
taskMap[t] = taskImpl
}

// Run the specific tasks' prechecks first.
// Go does not specify map iteration order, so prechecks are not guaranteed to
// run in input order.
for taskName, t := range taskMap {
log.Printf("Prechecking \"%s\"... ", taskName)
if err := t.Precheck(); err != nil {
log.Print("FAILED")
log.Fatal(err)
} else {
log.Print("PASS")
}
}

// Inline crawl: prints every link found and every request issued while
// visiting the Douban home page. The error returned by Visit is not checked.
c := colly.NewCollector()
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
fmt.Println("Found ULR: ", e.Attr("href"))
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
c.Visit("http://douban.com/")
// Execute the tasks in input order.
for _, taskName := range tasks {
log.Printf("Running task \"%s\"... ", taskName)
if err := taskMap[taskName].Execute(); err != nil {
log.Printf("Task \"%s\" execution failed", taskName)
log.Fatal(err)
} else {
log.Printf("Task \"%s\" passed", taskName)
}
}
}
15 changes: 15 additions & 0 deletions task/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")

# Go library target "task", built from collector.go and task.go.
# It is importable as github.com/its-my-data/doubak/task and visible to every
# package in the workspace.
go_library(
name = "task",
srcs = [
"collector.go",
"task.go",
],
importpath = "github.com/its-my-data/doubak/task",
visibility = ["//visibility:public"],
# Dependencies: the project's proto package and the colly v2 library.
deps = [
"//proto",
"@com_github_gocolly_colly_v2//:colly",
],
)
42 changes: 42 additions & 0 deletions task/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package task

import (
"flag"
"github.com/gocolly/colly/v2"
p "github.com/its-my-data/doubak/proto"
"log"
)

// Collector contains the information used by the collector.
// Its Precheck and Execute methods are defined on *Collector.
type Collector struct {
// user is filled in by NewCollector from a command-line flag value.
user string
// categories is the category list passed to NewCollector.
categories []string
}

// NewCollector creates a collector task for the given categories.
//
// The user field is taken from the current string value of the command-line
// flag registered under p.Flag_categories.String().
// NOTE(review): filling user from the categories flag looks like a copy/paste
// slip — confirm which flag is meant to carry the user name.
//
// It panics if that flag has not been registered, if its value does not
// implement flag.Getter, or if the value it yields is not a string.
func NewCollector(categories []string) *Collector {
	registered := flag.Lookup(p.Flag_categories.String())
	getter := registered.Value.(flag.Getter)

	return &Collector{
		user:       getter.Get().(string),
		categories: categories,
	}
}

// Precheck is the collector's pre-run validation hook.
// It performs no checks yet and always returns nil.
func (t *Collector) Precheck() error {
	// TODO: check user existence, etc.
	return nil
}

// Execute starts the collection.
//
// The current placeholder implementation creates a colly collector that logs
// every link found and every request issued, then visits the Douban home
// page. It returns the error reported by Visit, or nil when the visit
// succeeds.
func (t *Collector) Execute() error {
	// TODO: update the implementation.
	c := colly.NewCollector()
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		log.Println("Found ULR: ", e.Attr("href"))
	})
	c.OnRequest(func(r *colly.Request) {
		log.Println("Visiting", r.URL)
	})

	// Propagate the visit error instead of discarding it, so callers can tell
	// a failed collection from a successful one.
	return c.Visit("http://douban.com/")
}
10 changes: 10 additions & 0 deletions task/task.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package task

// BaseInterface defines the interface of each type of task.
type BaseInterface interface {
// Precheck checks flag combinations, validity, etc. before the task runs.
// A non-nil error reports a failed check.
Precheck() error

// Execute runs the task. A non-nil error reports a failed run.
Execute() error
}

0 comments on commit 67085eb

Please sign in to comment.