From ef22982027871f3b6e5f96c0e0206dd62917ff16 Mon Sep 17 00:00:00 2001 From: jakopako Date: Mon, 17 Jun 2024 13:45:38 +0200 Subject: [PATCH 1/3] improve year guessing --- scraper/scraper.go | 56 ++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/scraper/scraper.go b/scraper/scraper.go index 4afbdec..60e49aa 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -370,6 +370,12 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string } } + c.GuessYear(items) + + return items, nil +} + +func (c *Scraper) GuessYear(items []map[string]interface{}) { // get date field names where we need to adapt the year dateFieldsGuessYear := map[string]bool{} for _, f := range c.Fields { @@ -384,46 +390,42 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string // event websites mostly contain a list of events ordered by date. Sometimes the date does // not contain the year. In that case we could simply set the year to the current year but // it might happen that the list of events spans across more than one year into the next - // year. In that case we still want to set the correct year which would be current year + 1. + // year. In that case we still want to set the correct year which would be current year + n. + // Moreover, the list might not be ordered at all. In that case we also want to try to set + // the correct year. if len(dateFieldsGuessYear) > 0 { for i, item := range items { for name, val := range item { if dateFieldsGuessYear[name] { if t, ok := val.(time.Time); ok { - now := time.Now() - yesterday := now.AddDate(0, 0, -1) - // we compare the date with yesterday, not now, to accomodate for the fact that at the time we scrape - // the event might have already taken place but not yet removed from the website. Let's see if 1 day - // is a reasonable margin. - if t.Before(yesterday) { - newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) - item[name] = newT - continue + + // for the first item we compare this item's date with 'now' and try + // to find the most suitable year, ie the year that brings this item's + // date closest to now. + // for the remaining items we do the same as with the first item except + // that we compare this item's date to the previous item's date instead + // of 'now'. + var prev time.Time + if i == 0 { + prev = time.Now() + } else { + prev, _ = items[i-1][name].(time.Time) } - if i > 0 { - if prevT, ok := items[i-1][name].(time.Time); ok { - // here we do not compare the current date directly to the previous date. There - // are cases where we wouldn't want the year to be increased by one even though - // the previous date is bigger than the current one. Such cases occur when a - // website contains a list of items that are sorted by date but within a day are - // not sorted by time. To prevent the year from being increased wrongly in that - // case we introduce a min delta of 1 day. - tmpT := prevT.AddDate(0, 0, -1) - if t.Before(tmpT) { - // probably there is still a bug here when we have a list that spans two years - // changes.. - newT := time.Date(t.Year()+1, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) - item[name] = newT - } + diff := time.Since(time.Unix(0, 0)) + newDate := t + for y := prev.Year() - 1; y <= prev.Year()+1; y++ { + tmpT := time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) + if newDiff := tmpT.Sub(prev).Abs(); newDiff < diff { + diff = newDiff + newDate = time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) } } + item[name] = newDate } } } } } - - return items, nil } func (c *Scraper) initializeFilters() error { From 6e05b29398151a763bd98dfe360a2783a22dbdc9 Mon Sep 17 00:00:00 2001 From: jakopako Date: Mon, 17 Jun 2024 14:44:30 +0200 Subject: [PATCH 2/3] wip --- scraper/scraper.go | 15 ++++++--------- scraper/scraper_test.go | 13 +++++++++++++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/scraper/scraper.go b/scraper/scraper.go index 60e49aa..cf04683 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -370,12 +370,12 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string } } - c.GuessYear(items) + c.guessYear(items, time.Now()) return items, nil } -func (c *Scraper) GuessYear(items []map[string]interface{}) { +func (c *Scraper) guessYear(items []map[string]interface{}, ref time.Time) { // get date field names where we need to adapt the year dateFieldsGuessYear := map[string]bool{} for _, f := range c.Fields { @@ -405,17 +405,14 @@ func (c *Scraper) GuessYear(items []map[string]interface{}) { // for the remaining items we do the same as with the first item except // that we compare this item's date to the previous item's date instead // of 'now'. - var prev time.Time - if i == 0 { - prev = time.Now() - } else { - prev, _ = items[i-1][name].(time.Time) + if i > 0 { + ref, _ = items[i-1][name].(time.Time) } diff := time.Since(time.Unix(0, 0)) newDate := t - for y := prev.Year() - 1; y <= prev.Year()+1; y++ { + for y := ref.Year() - 1; y <= ref.Year()+1; y++ { tmpT := time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) - if newDiff := tmpT.Sub(prev).Abs(); newDiff < diff { + if newDiff := tmpT.Sub(ref).Abs(); newDiff < diff { diff = newDiff newDate = time.Date(y, t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) } diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go index d9e99ca..cd75adc 100644 --- a/scraper/scraper_test.go +++ b/scraper/scraper_test.go @@ -648,3 +648,16 @@ func TestExtractFieldDate29Feb(t *testing.T) { t.Fatalf("expected '2024' as year of date but got '%d'", dt.Year()) } } + +func TestGuessYearSimple(t *testing.T) { + // events span period around change of year + +} + +func TestGuessYearUnordered(t *testing.T) { + // events are not perfectly ordered +} + +func TestGuessYear2Years(t *testing.T) { + // events span more than 2 years +} From 5db2ccdb5641cecd4c2f8cbe634dba2894515d0b Mon Sep 17 00:00:00 2001 From: jakopako Date: Mon, 17 Jun 2024 18:05:03 +0200 Subject: [PATCH 3/3] added tests --- scraper/scraper_test.go | 188 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 184 insertions(+), 4 deletions(-) diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go index cd75adc..5552332 100644 --- a/scraper/scraper_test.go +++ b/scraper/scraper_test.go @@ -650,14 +650,194 @@ func TestExtractFieldDate29Feb(t *testing.T) { } func TestGuessYearSimple(t *testing.T) { - // events span period around change of year - + // items dates span period around change of year + s := &Scraper{ + Fields: []Field{ + { + Type: "date", + GuessYear: true, + Name: "date", + }, + }, + } + loc, _ := time.LoadLocation("CET") + items := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc), + }, + } + expectedItems := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 1, 2, 20, 0, 0, 0, loc), + }, + } + s.guessYear(items, time.Date(2023, 11, 30, 20, 30, 0, 0, loc)) + for i, d := range items { + if d["date"] != expectedItems[i]["date"] { + t.Fatalf("expected '%v' as year of date but got '%v'", expectedItems[i]["date"], d["date"]) + } + } } func TestGuessYearUnordered(t *testing.T) { - // events are not perfectly ordered + // items dates are not perfectly ordered and span + // period around change of year + s := &Scraper{ + Fields: []Field{ + { + Type: "date", + GuessYear: true, + Name: "date", + }, + }, + } + loc, _ := time.LoadLocation("CET") + items := []map[string]interface{}{ + { + "date": time.Date(2023, 11, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 14, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc), + }, + } + expectedItems := []map[string]interface{}{ + { + "date": time.Date(2023, 11, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 14, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 1, 2, 20, 0, 0, 0, loc), + }, + } + s.guessYear(items, time.Date(2023, 11, 1, 20, 30, 0, 0, loc)) + for i, d := range items { + if d["date"] != expectedItems[i]["date"] { + t.Fatalf("expected '%v' as year of date but got '%v'", expectedItems[i]["date"], d["date"]) + } + } } func TestGuessYear2Years(t *testing.T) { - // events span more than 2 years + // items dates span more than 2 years + s := &Scraper{ + Fields: []Field{ + { + Type: "date", + GuessYear: true, + Name: "date", + }, + }, + } + loc, _ := time.LoadLocation("CET") + items := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 1, 14, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 5, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 9, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 2, 2, 20, 0, 0, 0, loc), + }, + } + expectedItems := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 1, 14, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 5, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 9, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2025, 2, 2, 20, 0, 0, 0, loc), + }, + } + s.guessYear(items, time.Date(2023, 11, 1, 20, 30, 0, 0, loc)) + for i, d := range items { + if d["date"] != expectedItems[i]["date"] { + t.Fatalf("expected '%v' as year of date but got '%v'", expectedItems[i]["date"], d["date"]) + } + } +} + +func TestGuessYearStartBeforeReference(t *testing.T) { + // items date start before given reference + s := &Scraper{ + Fields: []Field{ + { + Type: "date", + GuessYear: true, + Name: "date", + }, + }, + } + loc, _ := time.LoadLocation("CET") + items := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 1, 2, 20, 0, 0, 0, loc), + }, + } + expectedItems := []map[string]interface{}{ + { + "date": time.Date(2023, 12, 2, 20, 30, 0, 0, loc), + }, + { + "date": time.Date(2023, 12, 24, 21, 30, 0, 0, loc), + }, + { + "date": time.Date(2024, 1, 2, 20, 0, 0, 0, loc), + }, + } + s.guessYear(items, time.Date(2024, 1, 30, 20, 30, 0, 0, loc)) + for i, d := range items { + if d["date"] != expectedItems[i]["date"] { + t.Fatalf("expected '%v' as year of date but got '%v'", expectedItems[i]["date"], d["date"]) + } + } }