Skip to content

Commit

Permalink
Added category enum proto
Browse files Browse the repository at this point in the history
  • Loading branch information
MewX committed Nov 1, 2020
1 parent a8bd0cf commit bc285a3
Show file tree
Hide file tree
Showing 9 changed files with 214 additions and 28 deletions.
1 change: 1 addition & 0 deletions collector/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ go_library(
srcs = ["collector.go"],
importpath = "github.com/its-my-data/doubak/collector",
visibility = ["//visibility:public"],
deps = ["//proto"],
)
4 changes: 3 additions & 1 deletion collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package collector
import (
"flag"
"fmt"
p "github.com/its-my-data/doubak/proto"
)

// Collect starts the major collection process.
//
// At this stage it only echoes the current values of the "tasks" and
// "categories" command-line flags to stdout, one per line.
func Collect() {
	// Flag names come from the Flag proto enum so they cannot drift from the
	// names used where the flags are registered.
	tasks := stringFlag(p.Flag_tasks.String())
	fmt.Println(tasks)

	categories := stringFlag(p.Flag_categories.String())
	fmt.Println(categories)
}

// stringFlag returns the current value of the string flag called name.
//
// It returns "" when no flag with that name is registered on the default
// flag set, or when the flag's value is not a string. flag.Lookup yields nil
// for unknown names, so dereferencing its result unchecked would panic.
func stringFlag(name string) string {
	f := flag.Lookup(name)
	if f == nil {
		return ""
	}
	getter, ok := f.Value.(flag.Getter)
	if !ok {
		return ""
	}
	value, _ := getter.Get().(string) // non-string flags fall back to ""
	return value
}
32 changes: 18 additions & 14 deletions doubak.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,39 @@ import (
"fmt"
"github.com/gocolly/colly/v2"
"github.com/its-my-data/doubak/collector"
"github.com/its-my-data/doubak/proto"
p "github.com/its-my-data/doubak/proto"
"math"
"time"
)

// Defining flags.
var userName = flag.String("user", "", "The Douban user name. e.g. mewcatcher")
var tasksToRun = flag.String("tasks", "collect, parse, publish",
var userName = flag.String(p.Flag_user.String(), "",
"The Douban user name. e.g. mewcatcher")
var tasksToRun = flag.String(p.Flag_tasks.String(),
p.ConcatProtoEnum(p.Task_name, ", "),
"Tasks to run (order doesn't matter). Can be one/more of the following: "+
"collect, parse, publish.")
var targetCategories = flag.String("categories", "",
p.ConcatProtoEnum(p.Task_name, ", ")+".")
var targetCategories = flag.String(p.Flag_categories.String(),
p.ConcatProtoEnum(p.Category_name, ", "),
"A comma separated content types list to crawl. Default is all. "+
"Supported types are: book, movie, music, game, app, review.")
var outputDir = flag.String("output_dir", "./output", "The output path.")
var continueRun = flag.Bool("continue", true,
"Supported types are: "+p.ConcatProtoEnum(p.Category_name, ", ")+".")
var outputDir = flag.String(p.Flag_output_dir.String(), "./output",
"The output path.")
var continueRun = flag.Bool(p.Flag_continue.String(), true,
"Continue or restart with override.")
var proxy = flag.String("proxy", "", "Proxy to use when crawling.")
var numRetry = flag.Uint64("max_retry", math.MaxUint64,
var proxy = flag.String(p.Flag_proxy.String(), "",
"Proxy to use when crawling.")
var numRetry = flag.Uint64(p.Flag_max_retry.String(), math.MaxUint64,
"The number of retries when errors encountered.")
// defaultRequestDelay is the default value of the request-delay flag.
// It is written as a constant expression instead of a time.ParseDuration call
// so that there is no parse error to silently discard.
var defaultRequestDelay = 100 * time.Millisecond
var requestDelay = flag.Duration("req_delay", defaultRequestDelay,
"Delay betwee two requests, used to control QPS. This may be replaced by "+
"a QPS flag when proxy pool and parallel requests are added.")
var requestDelay = flag.Duration(p.Flag_req_delay.String(), defaultRequestDelay,
"Min time between any two requests, used to reduce server load. This may "+
"be replaced by a QPS flag when proxy pool and parallel requests are implemented.")

func main() {
flag.Parse()

collector.Collect()
fmt.Println(proto.Flag_user.String() + proto.ConcatProtoEnum(nil, ""))

c := colly.NewCollector()
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
Expand Down
1 change: 1 addition & 0 deletions proto/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library")
proto_library(
name = "proto_proto",
srcs = [
"category.proto",
"flag.proto",
"task.proto",
],
Expand Down
95 changes: 95 additions & 0 deletions proto/category.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions proto/category.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
syntax = "proto3";

package proto;

option go_package = "github.com/its-my-data/doubak/proto";

// Category defines the supported categories.
// The full list of categories supported by Douban is:
// - 书籍 book
// - 电影 movie
// - 音乐 music
// - 游戏 game
// - 移动应用 app
// - 评论 review
// - 小组 group (not supported)
// - 日记 note (not supported)
// - 图片 album (not supported)
// - 小站 site (not supported)
// - 同城活动 activity (not supported)
// - 舞台剧 drama (not supported)
// - 豆品 thing (not supported)
// Category enumerates the content categories this tool can process.
// The value names are surfaced verbatim (through the generated Category_name
// map) in the default value and help text of the categories command-line
// flag, so renaming a value changes the CLI.
enum Category {
// Books. As the zero value, this is also the proto3 default for an unset field.
book = 0;
// Movies.
movie = 1;
// Music.
music = 2;
// Games.
game = 3;
// Mobile applications.
app = 4;
// Reviews.
review = 5;
}
49 changes: 39 additions & 10 deletions proto/flag.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions proto/flag.proto
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,22 @@ enum Flag {

// Tasks to run.
tasks = 1;

// Categories to run on.
categories = 2;

// Output path/directory.
output_dir = 3;

// Continue running or starting over with overriding existing files.
continue = 4;

// Proxy used to send each request via.
proxy = 5;

// Max number of retries.
max_retry = 6;

// Min time between any two requests.
req_delay = 7;
}
13 changes: 10 additions & 3 deletions proto/util.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
package proto

import (
	"sort"
	"strings"
)

// ConcatProtoEnum concats enum proto values to a string.
func ConcatProtoEnum(p interface{}, separator string) string {
var _ Task
return ""
// ConcatProtoEnum joins the enum value names in nameMap into a single string,
// placing separator between consecutive names.
//
// nameMap maps enum numbers to their names, as in the generated *_name maps.
// Names are emitted in ascending enum-number order so the result is the same
// on every call. A nil or empty map yields "".
func ConcatProtoEnum(nameMap map[int32]string, separator string) string {
	// Go randomizes map iteration order; ranging over nameMap directly would
	// shuffle the output on every run, so collect and sort the numbers first.
	numbers := make([]int32, 0, len(nameMap))
	for number := range nameMap {
		numbers = append(numbers, number)
	}
	sort.Slice(numbers, func(i, j int) bool { return numbers[i] < numbers[j] })

	names := make([]string, 0, len(numbers))
	for _, number := range numbers {
		names = append(names, nameMap[number])
	}
	return strings.Join(names, separator)
}

0 comments on commit bc285a3

Please sign in to comment.