Skip to content

Commit

Permalink
Merge pull request #241 from jakopako/feature/date-filtering
Browse files Browse the repository at this point in the history
Feature/date filtering
  • Loading branch information
jakopako authored Oct 13, 2023
2 parents 7f333ef + 63b0bca commit acedf0b
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 41 deletions.
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,10 @@ scrapers:
date_location: "Europe/Berlin"
filters:
- field: "title"
regex: "Verschoben.*"
exp: "Verschoben.*"
match: false
- field: "title"
regex: "Abgesagt.*"
exp: "Abgesagt.*"
match: false
```

Expand Down Expand Up @@ -424,19 +424,24 @@ Since version 0.3.0 js rendering is supported. For this to work the `google-chro

### Filters

Filters can be used to define what items should make it into the resulting list of items. A filter configuration looks as follows:
Filters can be used to define what items should make it into the resulting list of items. A filter configuration can look as follows:

```yml
filters:
- field: "status"
regex: "cancelled"
exp: "cancelled"
match: false
- field: "status"
regex: "delayed"
exp: ".*(?i)(delayed).*"
match: false
- field: "date"
exp: "> now" # format: <|> now|YYYY-MM-ddTHH:mm
match: true
```

The `field` key determines to which field the regular expression will be applied. `regex` defines the regular expression and `match` determines whether the item should be included or excluded on match. Note, that as soon as there is one match for a regular expression that has `match` set to **false** the respective item will be exlcuded from the results without looking at the other filters.
The `field` key determines to which field the expression will be applied. `exp` defines the expression and `match` determines whether the item should be included or excluded on match. Note that as soon as there is one match for an expression that has `match` set to **false**, the respective item will be excluded from the results without looking at the other filters.

The expression `exp` is either a regular expression or a date comparison, depending on the type of the respective `field` in the `fields` section of the configuration. If the corresponding field is of type `date`, the expression has to be a date comparison of the form `<operator> <date>`, where the operator is `<` or `>` and the date is either `now` or a timestamp in the format `YYYY-MM-ddTHH:mm` (for example `> 2023-10-20T19:00`). For every other field type it has to be a regular expression.

### Interaction

Expand Down
18 changes: 9 additions & 9 deletions concerts-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ scrapers:
date_language: "it_IT"
filters:
- field: "title"
regex: ".*CANCELED.*"
exp: ".*CANCELED.*"
match: false
- field: "title"
regex: "ANNULLATO!.*"
exp: "ANNULLATO!.*"
match: false
- field: "title"
regex: ".*Postponed.*"
exp: ".*Postponed.*"
match: false
paginator:
location:
Expand Down Expand Up @@ -164,10 +164,10 @@ scrapers:
selector: ".pager__item a"
filters:
- field: "title"
regex: ".*POSTPONED.*"
exp: ".*POSTPONED.*"
match: false
- field: "title"
regex: ".*CANCELLED.*"
exp: ".*CANCELLED.*"
match: false

##########
Expand Down Expand Up @@ -321,16 +321,16 @@ scrapers:
date_location: "Europe/Berlin"
filters:
- field: "location"
regex: "Zenith" # duplicate (also present on Motorworld website)
exp: "Zenith" # duplicate (also present on Motorworld website)
match: false
- field: "location"
regex: "Strom" # duplicate
exp: "Strom" # duplicate
match: false
- field: "location"
regex: "Tonhalle" # duplicate
exp: "Tonhalle" # duplicate
match: false
- field: "location"
regex: "TonHalle" # duplicate
exp: "TonHalle" # duplicate
match: false

#########
Expand Down
110 changes: 94 additions & 16 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,69 @@ func (e *ElementLocations) UnmarshalYAML(value *yaml.Node) error {

// A Filter is used to filter certain items from the result list.
// Field, Expression and Match carry yaml tags ("field", "exp", "match");
// Type, RegexComp, DateComp and DateOp are assigned by Initialize and read
// by FilterMatch.
type Filter struct {
Field string `yaml:"field"`
Regex string `yaml:"regex"`
Match bool `yaml:"match"`
// Field is the name of the item field whose value is checked.
Field string `yaml:"field"`
// Type is "date" or "regex"; Initialize derives it from the field type.
Type string
// Expression is the raw expression: a regular expression, or a date
// comparison for fields of type "date".
Expression string `yaml:"exp"` // changed from 'regex' to 'exp' in version 0.5.7
// RegexComp is the compiled Expression for filters of Type "regex".
RegexComp *regexp.Regexp
// DateComp is the reference date for filters of Type "date".
DateComp time.Time
// DateOp is the comparison operator (">" or "<") for filters of Type "date".
DateOp string
// Match tells filterItem how to count a match: filters with Match true and
// filters with Match false are evaluated in separate branches there.
Match bool `yaml:"match"`
}

// FilterMatch reports whether value satisfies the filter's expression.
//
// The filter is expected to have been prepared with Initialize, which sets
// Type together with RegexComp or DateOp/DateComp.
//
//   - Type "regex": value is formatted with fmt.Sprint and matched against
//     RegexComp.
//   - Type "date": value has to be a time.Time. It matches when it lies
//     strictly after DateComp (DateOp ">") or strictly before it (DateOp "<").
//
// Everything else yields false: an unknown Type, a missing compiled regular
// expression, an unknown operator, or a value that is not a time.Time. The
// last case matters because comparing the zero time would otherwise make
// every non-date value match a "<" filter.
func (f *Filter) FilterMatch(value interface{}) bool {
	switch f.Type {
	case "regex":
		if f.RegexComp == nil {
			// Not initialized (or Initialize failed); avoid a nil dereference.
			return false
		}
		return f.RegexComp.MatchString(fmt.Sprint(value))
	case "date":
		d, ok := value.(time.Time)
		if !ok {
			return false
		}
		switch f.DateOp {
		case ">":
			return d.After(f.DateComp)
		case "<":
			return d.Before(f.DateComp)
		default:
			return false
		}
	default:
		return false
	}
}

// Initialize prepares the filter for use with FilterMatch, based on the type
// of the field it refers to.
//
// For fieldType "date" the filter becomes a date filter: Expression has to
// consist of an operator ("<" or ">") and either "now" or a timestamp in the
// layout 2006-01-02T15:04, separated by whitespace. "now" is resolved once,
// at the time Initialize is called, to the current time in UTC. For every
// other fieldType the filter becomes a regex filter and Expression is
// compiled as a regular expression.
//
// It returns an error if the regular expression does not compile or if the
// date expression does not have the expected format.
func (f *Filter) Initialize(fieldType string) error {
	if fieldType != "date" {
		f.Type = "regex" // default for everything except date fields
		regex, err := regexp.Compile(f.Expression)
		if err != nil {
			return fmt.Errorf("compiling filter expression %q: %w", f.Expression, err)
		}
		f.RegexComp = regex
		return nil
	}

	f.Type = "date"
	initErr := fmt.Errorf("the expression for filtering by date should be of the following format: '<|> now|YYYY-MM-ddTHH:mm'")

	// strings.Fields tolerates repeated and surrounding whitespace, which a
	// split on a single space would reject.
	tokens := strings.Fields(f.Expression)
	if len(tokens) != 2 {
		return initErr
	}
	op, operand := tokens[0], tokens[1]
	if op != ">" && op != "<" {
		return initErr
	}
	f.DateOp = op

	if operand == "now" {
		f.DateComp = time.Now().UTC()
		return nil
	}
	t, err := time.Parse("2006-01-02T15:04", operand)
	if err != nil {
		return initErr
	}
	f.DateComp = t
	return nil
}

// A Paginator is used to paginate through a website
Expand All @@ -145,7 +205,7 @@ type Scraper struct {
Item string `yaml:"item"`
ExcludeWithSelector []string `yaml:"exclude_with_selector,omitempty"`
Fields []Field `yaml:"fields,omitempty"`
Filters []Filter `yaml:"filters,omitempty"`
Filters []*Filter `yaml:"filters,omitempty"`
Paginator Paginator `yaml:"paginator,omitempty"`
RenderJs bool `yaml:"renderJs,omitempty"`
Interaction types.Interaction `yaml:"interaction,omitempty"`
Expand All @@ -161,6 +221,10 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string

var items []map[string]interface{}

if err := c.initializeFilters(); err != nil {
return items, err
}

hasNextPage := true
currentPage := 0
var doc *goquery.Document
Expand Down Expand Up @@ -248,7 +312,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
}

// check if item should be filtered
filter, err := c.filterItem(currentItem)
filter := c.filterItem(currentItem)
if err != nil {
log.Fatalf("%s ERROR: error while applying filter: %v.", c.Name, err)
}
Expand Down Expand Up @@ -297,23 +361,37 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
return items, nil
}

func (c *Scraper) filterItem(item map[string]interface{}) (bool, error) {
// initializeFilters prepares all filters of the scraper by calling
// Initialize on each of them with the type of the field it refers to.
//
// It returns an error if a filter entry is nil, if a filter refers to a
// field name that is not present in c.Fields, or if Initialize fails. In the
// last case the underlying error is wrapped together with the field name.
func (c *Scraper) initializeFilters() error {
	// build temporary map field name -> field type
	fieldTypes := make(map[string]string, len(c.Fields))
	for i := range c.Fields {
		fieldTypes[c.Fields[i].Name] = c.Fields[i].Type
	}
	for i, f := range c.Filters {
		if f == nil {
			// Filters holds pointers; report a nil entry instead of
			// dereferencing it.
			return fmt.Errorf("filter error. The filter at position %d is empty", i)
		}
		fieldType, ok := fieldTypes[f.Field]
		if !ok {
			return fmt.Errorf("filter error. There is no field with the name '%s'", f.Field)
		}
		if err := f.Initialize(fieldType); err != nil {
			return fmt.Errorf("initializing filter for field '%s': %w", f.Field, err)
		}
	}
	return nil
}

func (c *Scraper) filterItem(item map[string]interface{}) bool {
nrMatchTrue := 0
filterMatchTrue := false
filterMatchFalse := true
for _, filter := range c.Filters {
regex, err := regexp.Compile(filter.Regex)
if err != nil {
return false, err
}
if fieldValue, found := item[filter.Field]; found {
if filter.Match {
for _, f := range c.Filters {
if fieldValue, found := item[f.Field]; found {
if f.Match {
nrMatchTrue++
if regex.MatchString(fmt.Sprint(fieldValue)) {
if f.FilterMatch(fieldValue) {
filterMatchTrue = true
}
} else {
if regex.MatchString(fmt.Sprint(fieldValue)) {
if f.FilterMatch(fieldValue) {
filterMatchFalse = false
}
}
Expand All @@ -322,7 +400,7 @@ func (c *Scraper) filterItem(item map[string]interface{}) (bool, error) {
if nrMatchTrue == 0 {
filterMatchTrue = true
}
return filterMatchTrue && filterMatchFalse, nil
return filterMatchTrue && filterMatchFalse
}

func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]interface{} {
Expand Down
116 changes: 106 additions & 10 deletions scraper/scraper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,18 +82,24 @@ const (
func TestFilterItemMatchTrue(t *testing.T) {
item := map[string]interface{}{"title": "Jacob Collier - Concert"}
s := &Scraper{
Filters: []Filter{
Fields: []Field{
{
Name: "title",
},
},
Filters: []*Filter{
{
Field: "title",
Regex: ".*Concert",
Match: true,
Field: "title",
Expression: ".*Concert",
Match: true,
},
},
}
f, err := s.filterItem(item)
err := s.initializeFilters()
if err != nil {
t.Fatalf("got unexpected error: %v", err)
}
f := s.filterItem(item)
if !f {
t.Fatalf("expected 'true' but got 'false'")
}
Expand All @@ -102,18 +108,108 @@ func TestFilterItemMatchTrue(t *testing.T) {
// TestFilterItemMatchFalse checks that filterItem returns false for an item
// whose "title" value matches a filter that has Match set to false.
func TestFilterItemMatchFalse(t *testing.T) {
item := map[string]interface{}{"title": "Jacob Collier - Cancelled"}
s := &Scraper{
Filters: []Filter{
Fields: []Field{
{
Name: "title",
},
},
Filters: []*Filter{
{
Field: "title",
Expression: ".*Cancelled",
Match: false,
},
},
}
// the filters have to be initialized before filterItem is called
err := s.initializeFilters()
if err != nil {
t.Fatalf("got unexpected error: %v", err)
}
f := s.filterItem(item)
if f {
t.Fatalf("expected 'false' but got 'true'")
}
}

// TestFilterItemByDateMatchTrue checks that filterItem returns true for an
// item whose date lies one minute after the comparison date of a "> ..."
// date filter that has Match set to true.
func TestFilterItemByDateMatchTrue(t *testing.T) {
	// time.UTC is used directly, so there is no location lookup whose error
	// would have to be handled.
	item := map[string]interface{}{"date": time.Date(2023, 10, 20, 19, 1, 0, 0, time.UTC)}
	s := &Scraper{
		Fields: []Field{
			{
				Name: "date",
				Type: "date",
			},
		},
		Filters: []*Filter{
			{
				Field:      "date",
				Expression: "> 2023-10-20T19:00",
				Match:      true,
			},
		},
	}
	if err := s.initializeFilters(); err != nil {
		t.Fatalf("got unexpected error: %v", err)
	}
	if !s.filterItem(item) {
		t.Fatalf("expected 'true' but got 'false'")
	}
}

// TestFilterItemByDateMatchTrue2 covers the boundary case of a "> ..." date
// filter with Match set to true: the item's date is equal to the comparison
// date, and filterItem is expected to return false.
func TestFilterItemByDateMatchTrue2(t *testing.T) {
	// time.UTC is used directly, so there is no location lookup whose error
	// would have to be handled.
	item := map[string]interface{}{"date": time.Date(2023, 10, 20, 19, 0, 0, 0, time.UTC)}
	s := &Scraper{
		Fields: []Field{
			{
				Name: "date",
				Type: "date",
			},
		},
		Filters: []*Filter{
			{
				Field:      "date",
				Expression: "> 2023-10-20T19:00",
				Match:      true,
			},
		},
	}
	if err := s.initializeFilters(); err != nil {
		t.Fatalf("got unexpected error: %v", err)
	}
	if s.filterItem(item) {
		t.Fatalf("expected 'false' but got 'true'")
	}
}

func TestFilterItemByDateMatchFalse(t *testing.T) {
loc, _ := time.LoadLocation("UTC")
item := map[string]interface{}{"date": time.Date(2023, 10, 20, 19, 1, 0, 0, loc)}
s := &Scraper{
Fields: []Field{
{
Name: "date",
Type: "date",
},
},
Filters: []*Filter{
{
Field: "title",
Regex: ".*Cancelled",
Match: false,
Field: "date",
Expression: "> 2023-10-20T19:00",
Match: false,
},
},
}
f, err := s.filterItem(item)
err := s.initializeFilters()
if err != nil {
t.Fatalf("got unexpected error: %v", err)
}
f := s.filterItem(item)
if f {
t.Fatalf("expected 'false' but got 'true'")
}
Expand Down

0 comments on commit acedf0b

Please sign in to comment.