From bc285a3b4790a0cb885d14f8bf3fd7125e802574 Mon Sep 17 00:00:00 2001 From: MewX Date: Sun, 1 Nov 2020 16:40:34 +1100 Subject: [PATCH] Added category enum proto --- collector/BUILD.bazel | 1 + collector/collector.go | 4 +- doubak.go | 32 +++++++------- proto/BUILD.bazel | 1 + proto/category.pb.go | 95 ++++++++++++++++++++++++++++++++++++++++++ proto/category.proto | 29 +++++++++++++ proto/flag.pb.go | 49 +++++++++++++++++----- proto/flag.proto | 18 ++++++++ proto/util.go | 13 ++++-- 9 files changed, 214 insertions(+), 28 deletions(-) create mode 100644 proto/category.pb.go create mode 100644 proto/category.proto diff --git a/collector/BUILD.bazel b/collector/BUILD.bazel index 809f91d..f170573 100644 --- a/collector/BUILD.bazel +++ b/collector/BUILD.bazel @@ -5,4 +5,5 @@ go_library( srcs = ["collector.go"], importpath = "github.com/its-my-data/doubak/collector", visibility = ["//visibility:public"], + deps = ["//proto"], ) diff --git a/collector/collector.go b/collector/collector.go index 414a84d..6eeaa7a 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -3,9 +3,11 @@ package collector import ( "flag" "fmt" + p "github.com/its-my-data/doubak/proto" ) // Collect starts the major collection process. func Collect() { - fmt.Println(flag.Lookup("tasks").Value.(flag.Getter).Get().(string)) + user := flag.Lookup(p.Flag_categories.String()).Value.(flag.Getter).Get().(string) + fmt.Println(user) } diff --git a/doubak.go b/doubak.go index 7132f50..8cf4da1 100644 --- a/doubak.go +++ b/doubak.go @@ -5,35 +5,39 @@ import ( "fmt" "github.com/gocolly/colly/v2" "github.com/its-my-data/doubak/collector" - "github.com/its-my-data/doubak/proto" + p "github.com/its-my-data/doubak/proto" "math" "time" ) // Defining flags. -var userName = flag.String("user", "", "The Douban user name. e.g. mewcatcher") -var tasksToRun = flag.String("tasks", "collect, parse, publish", +var userName = flag.String(p.Flag_user.String(), "", + "The Douban user name. e.g. 
mewcatcher") +var tasksToRun = flag.String(p.Flag_tasks.String(), + p.ConcatProtoEnum(p.Task_name, ", "), "Tasks to run (order doesn't matter). Can be one/more of the following: "+ - "collect, parse, publish.") -var targetCategories = flag.String("categories", "", + p.ConcatProtoEnum(p.Task_name, ", ")+".") +var targetCategories = flag.String(p.Flag_categories.String(), + p.ConcatProtoEnum(p.Category_name, ", "), "A comma separated content types list to crawl. Default is all. "+ - "Supported types are: book, movie, music, game, app, review.") -var outputDir = flag.String("output_dir", "./output", "The output path.") -var continueRun = flag.Bool("continue", true, + "Supported types are: "+p.ConcatProtoEnum(p.Category_name, ", ")+".") +var outputDir = flag.String(p.Flag_output_dir.String(), "./output", + "The output path.") +var continueRun = flag.Bool(p.Flag_continue.String(), true, "Continue or restart with override.") -var proxy = flag.String("proxy", "", "Proxy to use when crawling.") -var numRetry = flag.Uint64("max_retry", math.MaxUint64, +var proxy = flag.String(p.Flag_proxy.String(), "", + "Proxy to use when crawling.") +var numRetry = flag.Uint64(p.Flag_max_retry.String(), math.MaxUint64, "The number of retries when errors encountered.") var defaultRequestDelay, _ = time.ParseDuration("100ms") -var requestDelay = flag.Duration("req_delay", defaultRequestDelay, - "Delay betwee two requests, used to control QPS. This may be replaced by "+ - "a QPS flag when proxy pool and parallel requests are added.") +var requestDelay = flag.Duration(p.Flag_req_delay.String(), defaultRequestDelay, + "Min time between any two requests, used to reduce server load. 
This may "+ + "be replaced by a QPS flag when proxy pool and parallel requests are implemented.") func main() { flag.Parse() collector.Collect() - fmt.Println(proto.Flag_user.String() + proto.ConcatProtoEnum(nil, "")) c := colly.NewCollector() c.OnHTML("a[href]", func(e *colly.HTMLElement) { diff --git a/proto/BUILD.bazel b/proto/BUILD.bazel index 6899de4..ca6df42 100644 --- a/proto/BUILD.bazel +++ b/proto/BUILD.bazel @@ -5,6 +5,7 @@ load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") proto_library( name = "proto_proto", srcs = [ + "category.proto", "flag.proto", "task.proto", ], diff --git a/proto/category.pb.go b/proto/category.pb.go new file mode 100644 index 0000000..1bc916d --- /dev/null +++ b/proto/category.pb.go @@ -0,0 +1,95 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: proto/category.proto + +package proto + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + math "math" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +// Category defines the supported categories. 
+// Full list supported by Douban are: +// - 书籍 book +// - 电影 movie +// - 音乐 music +// - 游戏 game +// - 移动应用 app +// - 评论 review +// - 小组 group (not supported) +// - 日记 note (not supported) +// - 图片 album (not supported) +// - 小站 site (not supported) +// - 同城活动 activity (not supported) +// - 舞台剧 drama (not supported) +// - 豆品 thing (not supported) +type Category int32 + +const ( + Category_book Category = 0 + Category_movie Category = 1 + Category_music Category = 2 + Category_game Category = 3 + Category_app Category = 4 + Category_review Category = 5 +) + +var Category_name = map[int32]string{ + 0: "book", + 1: "movie", + 2: "music", + 3: "game", + 4: "app", + 5: "review", +} + +var Category_value = map[string]int32{ + "book": 0, + "movie": 1, + "music": 2, + "game": 3, + "app": 4, + "review": 5, +} + +func (x Category) String() string { + return proto.EnumName(Category_name, int32(x)) +} + +func (Category) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_1bfb247fa9b1cc73, []int{0} +} + +func init() { + proto.RegisterEnum("proto.Category", Category_name, Category_value) +} + +func init() { + proto.RegisterFile("proto/category.proto", fileDescriptor_1bfb247fa9b1cc73) +} + +var fileDescriptor_1bfb247fa9b1cc73 = []byte{ + // 152 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x29, 0x28, 0xca, 0x2f, + 0xc9, 0xd7, 0x4f, 0x4e, 0x2c, 0x49, 0x4d, 0xcf, 0x2f, 0xaa, 0xd4, 0x03, 0x73, 0x85, 0x58, 0xc1, + 0x94, 0x96, 0x27, 0x17, 0x87, 0x33, 0x54, 0x42, 0x88, 0x83, 0x8b, 0x25, 0x29, 0x3f, 0x3f, 0x5b, + 0x80, 0x41, 0x88, 0x93, 0x8b, 0x35, 0x37, 0xbf, 0x2c, 0x33, 0x55, 0x80, 0x11, 0xcc, 0x2c, 0x2d, + 0xce, 0x4c, 0x16, 0x60, 0x02, 0xc9, 0xa7, 0x27, 0xe6, 0xa6, 0x0a, 0x30, 0x0b, 0xb1, 0x73, 0x31, + 0x27, 0x16, 0x14, 0x08, 0xb0, 0x08, 0x71, 0x71, 0xb1, 0x15, 0xa5, 0x96, 0x65, 0xa6, 0x96, 0x0b, + 0xb0, 0x3a, 0xa9, 0x46, 0x29, 0xa7, 0x67, 0x96, 0x64, 0x94, 0x26, 0xe9, 0x25, 0xe7, 0xe7, 0xea, + 0x67, 0x96, 
0x14, 0xeb, 0xe6, 0x56, 0xea, 0xa6, 0x24, 0x96, 0x24, 0xea, 0xa7, 0xe4, 0x97, 0x26, + 0x25, 0x66, 0xeb, 0x83, 0x6d, 0x4c, 0x62, 0x03, 0x53, 0xc6, 0x80, 0x00, 0x00, 0x00, 0xff, 0xff, + 0x32, 0xb2, 0x10, 0x92, 0x97, 0x00, 0x00, 0x00, +} diff --git a/proto/category.proto b/proto/category.proto new file mode 100644 index 0000000..9a59503 --- /dev/null +++ b/proto/category.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; + +package proto; + +option go_package = "github.com/its-my-data/doubak/proto"; + +// Category defines the supported categories. +// Full list supported by Douban are: +// - 书籍 book +// - 电影 movie +// - 音乐 music +// - 游戏 game +// - 移动应用 app +// - 评论 review +// - 小组 group (not supported) +// - 日记 note (not supported) +// - 图片 album (not supported) +// - 小站 site (not supported) +// - 同城活动 activity (not supported) +// - 舞台剧 drama (not supported) +// - 豆品 thing (not supported) +enum Category { + book = 0; + movie = 1; + music = 2; + game = 3; + app = 4; + review = 5; +} diff --git a/proto/flag.pb.go b/proto/flag.pb.go index 7838c8f..1144372 100644 --- a/proto/flag.pb.go +++ b/proto/flag.pb.go @@ -28,16 +28,40 @@ const ( Flag_user Flag = 0 // Tasks to run. Flag_tasks Flag = 1 + // Categories to run on. + Flag_categories Flag = 2 + // Output path/directory. + Flag_output_dir Flag = 3 + // Continue running or starting over with overriding existing files. + Flag_continue Flag = 4 + // Proxy used to send each request via. + Flag_proxy Flag = 5 + // Max number of retries. + Flag_max_retry Flag = 6 + // Min time between any two requets. 
+ Flag_req_delay Flag = 7 ) var Flag_name = map[int32]string{ 0: "user", 1: "tasks", + 2: "categories", + 3: "output_dir", + 4: "continue", + 5: "proxy", + 6: "max_retry", + 7: "req_delay", } var Flag_value = map[string]int32{ - "user": 0, - "tasks": 1, + "user": 0, + "tasks": 1, + "categories": 2, + "output_dir": 3, + "continue": 4, + "proxy": 5, + "max_retry": 6, + "req_delay": 7, } func (x Flag) String() string { @@ -57,12 +81,17 @@ func init() { } var fileDescriptor_840b304d78fa0728 = []byte{ - // 112 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x28, 0x28, 0xca, 0x2f, - 0xc9, 0xd7, 0x4f, 0xcb, 0x49, 0x4c, 0xd7, 0x03, 0x33, 0x85, 0x58, 0xc1, 0x94, 0x96, 0x34, 0x17, - 0x8b, 0x5b, 0x4e, 0x62, 0xba, 0x10, 0x07, 0x17, 0x4b, 0x69, 0x71, 0x6a, 0x91, 0x00, 0x83, 0x10, - 0x27, 0x17, 0x6b, 0x49, 0x62, 0x71, 0x76, 0xb1, 0x00, 0xa3, 0x93, 0x6a, 0x94, 0x72, 0x7a, 0x66, - 0x49, 0x46, 0x69, 0x92, 0x5e, 0x72, 0x7e, 0xae, 0x7e, 0x66, 0x49, 0xb1, 0x6e, 0x6e, 0xa5, 0x6e, - 0x4a, 0x62, 0x49, 0xa2, 0x7e, 0x4a, 0x7e, 0x69, 0x52, 0x62, 0xb6, 0x3e, 0xd8, 0x8c, 0x24, 0x36, - 0x30, 0x65, 0x0c, 0x08, 0x00, 0x00, 0xff, 0xff, 0xce, 0xf7, 0xa4, 0xed, 0x65, 0x00, 0x00, 0x00, + // 181 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x1c, 0x8e, 0x4b, 0x4e, 0xc4, 0x30, + 0x10, 0x05, 0xf9, 0x24, 0xc3, 0x4c, 0x0b, 0x50, 0xcb, 0x57, 0x40, 0x6c, 0x90, 0x26, 0x5e, 0x70, + 0x03, 0x16, 0x1c, 0x82, 0x4d, 0xd4, 0x89, 0x8d, 0xb1, 0xf2, 0xe9, 0xd0, 0x6e, 0x4b, 0xf1, 0xed, + 0x47, 0xf1, 0xaa, 0x5e, 0x2d, 0x9e, 0x54, 0x80, 0x9b, 0xb0, 0xb2, 0xfd, 0x9d, 0x29, 0x74, 0x75, + 0x9a, 0xb6, 0xe2, 0x43, 0xa0, 0xf9, 0x9e, 0x29, 0x98, 0x33, 0x34, 0x39, 0x79, 0xc1, 0x3b, 0x73, + 0x81, 0x56, 0x29, 0x4d, 0x09, 0xef, 0xcd, 0x2b, 0xc0, 0x48, 0xea, 0x03, 0x4b, 0xf4, 0x09, 0x1f, + 0x0e, 0xe7, 0xac, 0x5b, 0xd6, 0xde, 0x45, 0xc1, 0x47, 0xf3, 0x0c, 0xe7, 0x91, 0x57, 0x8d, 0x6b, + 0xf6, 0xd8, 0x1c, 0xc7, 
0x4d, 0x78, 0x2f, 0xd8, 0x9a, 0x17, 0xb8, 0x2c, 0xb4, 0xf7, 0xe2, 0x55, + 0x0a, 0x9e, 0x0e, 0x15, 0xff, 0xdf, 0x3b, 0x3f, 0x53, 0xc1, 0xa7, 0xaf, 0xf7, 0x9f, 0xb7, 0x10, + 0xf5, 0x2f, 0x0f, 0xdd, 0xc8, 0x8b, 0x8d, 0x9a, 0xae, 0x4b, 0xb9, 0x3a, 0x52, 0xb2, 0x8e, 0xf3, + 0x40, 0x93, 0xad, 0x69, 0xc3, 0xa9, 0xe2, 0xf3, 0x16, 0x00, 0x00, 0xff, 0xff, 0xb7, 0x1a, 0x64, + 0x18, 0xbc, 0x00, 0x00, 0x00, } diff --git a/proto/flag.proto b/proto/flag.proto index 6f13464..6784abf 100644 --- a/proto/flag.proto +++ b/proto/flag.proto @@ -11,4 +11,22 @@ enum Flag { // Tasks to run. tasks = 1; + + // Categories to run on. + categories = 2; + + // Output path/directory. + output_dir = 3; + + // Continue running or starting over with overriding existing files. + continue = 4; + + // Proxy used to send each request via. + proxy = 5; + + // Max number of retries. + max_retry = 6; + + // Min time between any two requests. + req_delay = 7; } diff --git a/proto/util.go b/proto/util.go index 3950b74..899e957 100644 --- a/proto/util.go +++ b/proto/util.go @@ -1,7 +1,14 @@ package proto +import ( + "strings" +) + // ConcatProtoEnum concats enum proto values to a string. -func ConcatProtoEnum(p interface{}, separator string) string { - var _ Task - return "" +func ConcatProtoEnum(nameMap map[int32]string, separator string) string { + list := []string{} + for _, v := range nameMap { + list = append(list, v) + } + return strings.Join(list, separator) }