From 58bf0ec2840e58ac8c4f0ffabe685eb69ba28298 Mon Sep 17 00:00:00 2001 From: ziv Date: Mon, 9 Sep 2024 21:38:09 +0300 Subject: [PATCH 1/6] Adding pagination fo news-flash api --- anyway/views/news_flash/api.py | 64 ++++++++++++++++++++++++++++------ tests/test_news_flash_api.py | 6 ++-- 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/anyway/views/news_flash/api.py b/anyway/views/news_flash/api.py index 77a37264..7903cc6e 100644 --- a/anyway/views/news_flash/api.py +++ b/anyway/views/news_flash/api.py @@ -7,7 +7,7 @@ import pandas as pd import os -from typing import List, Optional +from typing import List, Optional, Tuple, Any from http import HTTPStatus from collections import OrderedDict @@ -37,12 +37,20 @@ DEFAULT_OFFSET_REQ_PARAMETER = 0 DEFAULT_LIMIT_REQ_PARAMETER = 100 DEFAULT_NUMBER_OF_YEARS_AGO = 5 +PAGE_NUMBER = "pageNumber" +PAGE_SIZE = "pageSize" +NEWS_FALSH_ID = "newsFlash_Id" +ID = "id" +LIMIT = "limit" +OFFSET = "offset" class NewsFlashQuery(BaseModel): id: Optional[int] road_number: Optional[int] offset: Optional[int] = DEFAULT_OFFSET_REQ_PARAMETER + pageNumber: Optional[int] + pageSize: Optional[int] limit: Optional[int] = DEFAULT_LIMIT_REQ_PARAMETER resolution: Optional[List[str]] source: Optional[List[BE_CONST.Source]] @@ -75,19 +83,54 @@ def news_flash(): except ValidationError as e: return make_response(jsonify(e.errors()[0]["msg"]), 404) - if "id" in validated_query_params: - return get_news_flash_by_id(validated_query_params["id"]) + pagination, validated_query_params = set_pagination_params(validated_query_params) - query = gen_news_flash_query(db.session, validated_query_params) + if ID in validated_query_params: + return get_news_flash_by_id(validated_query_params[ID]) + + total, query = gen_news_flash_query(db.session, validated_query_params) news_flashes = query.all() - news_flashes_jsons = [n.serialize() for n in news_flashes] - for news_flash in news_flashes_jsons: + news_flashes_dicts = [n.serialize() for n in news_flashes] + for news_flash in news_flashes_dicts: set_display_source(news_flash) - return Response(json.dumps(news_flashes_jsons, default=str), mimetype="application/json") - -def gen_news_flash_query(session, valid_params: dict): + if pagination: + res = add_pagination_to_result(validated_query_params, news_flashes_dicts, total) + else: + res = news_flashes_dicts + return Response(json.dumps(res, default=str), mimetype="application/json") + + +def set_pagination_params(validated_params: dict) -> Tuple[bool, dict]: + pagination = False + if NEWS_FALSH_ID in validated_params: + validated_params[ID] = validated_params.pop(NEWS_FALSH_ID) + if PAGE_NUMBER in validated_params: + pagination = True + page_number = validated_params[PAGE_NUMBER] + page_size = validated_params.get(PAGE_SIZE, DEFAULT_LIMIT_REQ_PARAMETER) + validated_params[OFFSET] = (page_number - 1) * page_size + validated_params[LIMIT] = page_size + return pagination, validated_params + + +def add_pagination_to_result(validated_params: dict, news_flashes: list, num_nf: int) -> dict: + page_size = validated_params[PAGE_SIZE] + page_num = validated_params[PAGE_NUMBER] + total_pages = num_nf // page_size + (1 if num_nf % page_size else 0) + return { + "data": news_flashes, + "pagination": { + "pageNumber": page_num, + "pageSize": page_size, + "totalPages": total_pages, + "totalRecords": num_nf + } + } + + +def gen_news_flash_query(session, valid_params: dict) -> Tuple[int, Any]: query = session.query(NewsFlash) supported_resolutions = set([x.value for x in BE_CONST.SUPPORTED_RESOLUTIONS]) query = query.filter(NewsFlash.resolution.in_(supported_resolutions)) @@ -113,11 +156,12 @@ def gen_news_flash_query(session, valid_params: dict): not_(and_(NewsFlash.lat == None, NewsFlash.lon == None)), ) ).order_by(NewsFlash.date.desc()) + total = query.count() if "offset" in valid_params: query = query.offset(valid_params["offset"]) if "limit" in valid_params: query = query.limit(valid_params["limit"]) - return query + return total, query def set_display_source(news_flash): diff --git a/tests/test_news_flash_api.py b/tests/test_news_flash_api.py index 06deabd5..1bcfc103 100644 --- a/tests/test_news_flash_api.py +++ b/tests/test_news_flash_api.py @@ -202,7 +202,7 @@ def _test_update_news_flash_qualifying_not_manual_exists_location_db(self): def test_gen_news_flash_query(self): orig_supported_resolutions = BE_CONST.SUPPORTED_RESOLUTIONS BE_CONST.SUPPORTED_RESOLUTIONS = [BE_CONST.ResolutionCategories.DISTRICT] - actual = gen_news_flash_query(self.session, {"road_number": 12345678}) + _, actual = gen_news_flash_query(self.session, {"road_number": 12345678}) news_flashes = actual.all() self.assertEqual(len(news_flashes), 1, "single news flash") self.assertEqual( @@ -210,13 +210,13 @@ def test_gen_news_flash_query(self): ) BE_CONST.SUPPORTED_RESOLUTIONS = [BE_CONST.ResolutionCategories.REGION] - actual = gen_news_flash_query(self.session, {"road_number": 12345678}) + _, actual = gen_news_flash_query(self.session, {"road_number": 12345678}) news_flashes = actual.all() self.assertEqual(len(news_flashes), 1, "single news flash") self.assertEqual(news_flashes[0].description, self.region_description, "region description") BE_CONST.SUPPORTED_RESOLUTIONS = [BE_CONST.ResolutionCategories.CITY] - actual = gen_news_flash_query(self.session, {"road_number": 12345678}) + _, actual = gen_news_flash_query(self.session, {"road_number": 12345678}) news_flashes = actual.all() self.assertEqual(len(news_flashes), 0, "zero news flash") From 681722163f59c233b8c7871ecc7d7def87076201 Mon Sep 17 00:00:00 2001 From: tkalir Date: Sun, 15 Sep 2024 11:48:01 +0300 Subject: [PATCH 2/6] refactoring extract_geo_features --- anyway/parsers/location_extraction.py | 94 +++++++++++++++------------ anyway/parsers/news_flash.py | 6 +- 2 files changed, 54 insertions(+), 46 deletions(-) diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index d86464cf..9211caa8 100644 --- a/anyway/parsers/location_extraction.py +++ b/anyway/parsers/location_extraction.py @@ -15,7 +15,6 @@ from sqlalchemy.orm import load_only from datetime import date - INTEGER_FIELDS = ["road1", "road2", "road_segment_id", "yishuv_symbol", "street1", "street2"] @@ -62,7 +61,7 @@ def get_road_segment_by_name_and_road(road_segment_name: str, road: int) -> Road segments = db.session.query(RoadSegments).filter(RoadSegments.road == road).all() for segment in segments: if road_segment_name.startswith(segment.from_name) and road_segment_name.endswith( - segment.to_name + segment.to_name ): return segment err_msg = f"get_road_segment_by_name_and_road:{road_segment_name},{road}: not found" @@ -199,9 +198,9 @@ def get_db_matching_location(db, latitude, longitude, resolution, road_no=None): markers_orig = markers.copy() if resolution != "אחר": if ( - road_no is not None - and road_no > 0 - and ("road1" in relevant_fields or "road2" in relevant_fields) + road_no is not None + and road_no > 0 + and ("road1" in relevant_fields or "road2" in relevant_fields) ): markers = markers.loc[(markers["road1"] == road_no) | (markers["road2"] == road_no)] for field in relevant_fields: @@ -453,30 +452,30 @@ def extract_location_text(text): punc_after_ind = text.find(punc_to_try, forbid_ind) if punc_before_ind != -1 or punc_after_ind != -1: if punc_before_ind == -1: - text = text[(punc_after_ind + 1) :] + text = text[(punc_after_ind + 1):] elif punc_after_ind == -1: text = text[:punc_before_ind] else: - text = text[:punc_before_ind] + " " + text[(punc_after_ind + 1) :] + text = text[:punc_before_ind] + " " + text[(punc_after_ind + 1):] removed_punc = True break if (not removed_punc) and (forbid_word in hospital_words): for hospital_name in hospital_names: hospital_ind = text.find(hospital_name) if ( - hospital_ind == forbid_ind + len(forbid_word) + 1 - or hospital_ind == forbid_ind + len(forbid_word) + 2 + hospital_ind == forbid_ind + len(forbid_word) + 1 + or hospital_ind == forbid_ind + len(forbid_word) + 2 ): text = ( - text[:hospital_ind] + text[hospital_ind + len(hospital_name) + 1 :] + text[:hospital_ind] + text[hospital_ind + len(hospital_name) + 1:] ) forbid_ind = text.find(forbid_word) - text = text[:forbid_ind] + text[forbid_ind + len(forbid_word) + 1 :] + text = text[:forbid_ind] + text[forbid_ind + len(forbid_word) + 1:] found_hospital = True if (not found_hospital) and (not removed_punc): text = ( - text[:forbid_ind] - + text[text.find(" ", forbid_ind + len(forbid_word) + 2) :] + text[:forbid_ind] + + text[text.find(" ", forbid_ind + len(forbid_word) + 2):] ) except Exception as _: @@ -512,40 +511,49 @@ def extract_location_text(text): for token in near_tokens: i = text.find(token) if i >= 0: - text = text[:i] + token + text[i + len(token) :] + text = text[:i] + token + text[i + len(token):] return text -def extract_geo_features(db, newsflash: NewsFlash, update_cbs_location_only: bool) -> None: - location_from_db = None - if update_cbs_location_only: - if newsflash.resolution in ["כביש בינעירוני", "רחוב"]: - location_from_db = get_db_matching_location( - db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1 - ) - else: - newsflash.location = extract_location_text(newsflash.description) or extract_location_text( - newsflash.title +def update_coordinates_and_resolution_using_location_text(newsflash): + newsflash.location = extract_location_text(newsflash.description) or extract_location_text( + newsflash.title + ) + geo_location = geocode_extract(newsflash.location) + if geo_location is not None: + newsflash.lat = geo_location["geom"]["lat"] + newsflash.lon = geo_location["geom"]["lng"] + newsflash.road1 = geo_location["road_no"] + newsflash.resolution = set_accident_resolution(geo_location) + return geo_location is not None + + +def extract_geo_features(db, newsflash: NewsFlash, use_existing_coordinates_only: bool) -> None: + if not use_existing_coordinates_only: + update_coordinates_and_resolution_using_location_text(newsflash) + + if newsflash.resolution in [BE_CONST.ResolutionCategories.STREET, + BE_CONST.ResolutionCategories.SUBURBAN_ROAD]: + location_from_db = get_db_matching_location( + db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1 ) - geo_location = geocode_extract(newsflash.location) - if geo_location is not None: - newsflash.lat = geo_location["geom"]["lat"] - newsflash.lon = geo_location["geom"]["lng"] - newsflash.resolution = set_accident_resolution(geo_location) - location_from_db = get_db_matching_location( - db, newsflash.lat, newsflash.lon, newsflash.resolution, geo_location["road_no"] - ) - if location_from_db is not None: - for k, v in location_from_db.items(): - setattr(newsflash, k, v) - for field in RF.get_all_location_fields(): - if field not in location_from_db: - setattr(newsflash, field, None) - if ( - newsflash.road_segment_id is None - and newsflash.road_segment_name is not None - and newsflash.road1 is not None - ): + if location_from_db is not None: + update_location_fields(newsflash, location_from_db) + try_find_segment_id(newsflash) + + +def update_location_fields(newsflash, location_from_db): + for k, v in location_from_db.items(): + setattr(newsflash, k, v) + for field in RF.get_all_location_fields(): + if field not in location_from_db: + setattr(newsflash, field, None) + + +def try_find_segment_id(newsflash): + if (newsflash.road_segment_id is None + and newsflash.road_segment_name is not None + and newsflash.road1 is not None): try: seg = get_road_segment_by_name_and_road(newsflash.road_segment_name, newsflash.road1) newsflash.road_segment_id = seg.segment_id diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index ba902cea..a93dafd1 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -36,7 +36,7 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa newsflash.accident = classify(newsflash.title) if newsflash.accident: extract_geo_features( - db=db, newsflash=newsflash, update_cbs_location_only=update_cbs_location_only + db=db, newsflash=newsflash, use_existing_coordinates_only=update_cbs_location_only ) if i % 1000 == 0: db.commit() @@ -53,7 +53,7 @@ def scrape_extract_store_rss(site_name, db): newsflash.organization = classify_organization(site_name) if newsflash.accident: # FIX: No accident-accurate date extracted - extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) + extract_geo_features(db=db, newsflash=newsflash, use_existing_coordinates_only=False) db.insert_new_newsflash(newsflash) @@ -66,7 +66,7 @@ def scrape_extract_store_twitter(screen_name, db): newsflash.accident = classify_tweets(newsflash.description) newsflash.organization = classify_organization("twitter") if newsflash.accident: - extract_geo_features(db=db, newsflash=newsflash, update_cbs_location_only=False) + extract_geo_features(db=db, newsflash=newsflash, use_existing_coordinates_only=False) db.insert_new_newsflash(newsflash) From 429c395c144ccbd229b5cca20b32d6113dbb8c46 Mon Sep 17 00:00:00 2001 From: tkalir Date: Sun, 15 Sep 2024 16:25:33 +0300 Subject: [PATCH 3/6] black fix for location_extraction.py --- anyway/parsers/location_extraction.py | 40 +++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index 9211caa8..b3c8a308 100644 --- a/anyway/parsers/location_extraction.py +++ b/anyway/parsers/location_extraction.py @@ -61,7 +61,7 @@ def get_road_segment_by_name_and_road(road_segment_name: str, road: int) -> Road segments = db.session.query(RoadSegments).filter(RoadSegments.road == road).all() for segment in segments: if road_segment_name.startswith(segment.from_name) and road_segment_name.endswith( - segment.to_name + segment.to_name ): return segment err_msg = f"get_road_segment_by_name_and_road:{road_segment_name},{road}: not found" @@ -198,9 +198,9 @@ def get_db_matching_location(db, latitude, longitude, resolution, road_no=None): markers_orig = markers.copy() if resolution != "אחר": if ( - road_no is not None - and road_no > 0 - and ("road1" in relevant_fields or "road2" in relevant_fields) + road_no is not None + and road_no > 0 + and ("road1" in relevant_fields or "road2" in relevant_fields) ): markers = markers.loc[(markers["road1"] == road_no) | (markers["road2"] == road_no)] for field in relevant_fields: @@ -452,30 +452,30 @@ def extract_location_text(text): punc_after_ind = text.find(punc_to_try, forbid_ind) if punc_before_ind != -1 or punc_after_ind != -1: if punc_before_ind == -1: - text = text[(punc_after_ind + 1):] + text = text[(punc_after_ind + 1) :] elif punc_after_ind == -1: text = text[:punc_before_ind] else: - text = text[:punc_before_ind] + " " + text[(punc_after_ind + 1):] + text = text[:punc_before_ind] + " " + text[(punc_after_ind + 1) :] removed_punc = True break if (not removed_punc) and (forbid_word in hospital_words): for hospital_name in hospital_names: hospital_ind = text.find(hospital_name) if ( - hospital_ind == forbid_ind + len(forbid_word) + 1 - or hospital_ind == forbid_ind + len(forbid_word) + 2 + hospital_ind == forbid_ind + len(forbid_word) + 1 + or hospital_ind == forbid_ind + len(forbid_word) + 2 ): text = ( - text[:hospital_ind] + text[hospital_ind + len(hospital_name) + 1:] + text[:hospital_ind] + text[hospital_ind + len(hospital_name) + 1 :] ) forbid_ind = text.find(forbid_word) - text = text[:forbid_ind] + text[forbid_ind + len(forbid_word) + 1:] + text = text[:forbid_ind] + text[forbid_ind + len(forbid_word) + 1 :] found_hospital = True if (not found_hospital) and (not removed_punc): text = ( - text[:forbid_ind] - + text[text.find(" ", forbid_ind + len(forbid_word) + 2):] + text[:forbid_ind] + + text[text.find(" ", forbid_ind + len(forbid_word) + 2) :] ) except Exception as _: @@ -511,7 +511,7 @@ def extract_location_text(text): for token in near_tokens: i = text.find(token) if i >= 0: - text = text[:i] + token + text[i + len(token):] + text = text[:i] + token + text[i + len(token) :] return text @@ -532,8 +532,10 @@ def extract_geo_features(db, newsflash: NewsFlash, use_existing_coordinates_only if not use_existing_coordinates_only: update_coordinates_and_resolution_using_location_text(newsflash) - if newsflash.resolution in [BE_CONST.ResolutionCategories.STREET, - BE_CONST.ResolutionCategories.SUBURBAN_ROAD]: + if newsflash.resolution in [ + BE_CONST.ResolutionCategories.STREET, + BE_CONST.ResolutionCategories.SUBURBAN_ROAD, + ]: location_from_db = get_db_matching_location( db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1 ) @@ -551,9 +553,11 @@ def update_location_fields(newsflash, location_from_db): def try_find_segment_id(newsflash): - if (newsflash.road_segment_id is None - and newsflash.road_segment_name is not None - and newsflash.road1 is not None): + if ( + newsflash.road_segment_id is None + and newsflash.road_segment_name is not None + and newsflash.road1 is not None + ): try: seg = get_road_segment_by_name_and_road(newsflash.road_segment_name, newsflash.road1) newsflash.road_segment_id = seg.segment_id From c5aa1ff3e218a9f7c48ee25b29f2201760aec845 Mon Sep 17 00:00:00 2001 From: tkalir Date: Tue, 17 Sep 2024 14:35:55 +0300 Subject: [PATCH 4/6] minor refacoring in extract_geo_features --- anyway/parsers/location_extraction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/anyway/parsers/location_extraction.py b/anyway/parsers/location_extraction.py index b3c8a308..49574f00 100644 --- a/anyway/parsers/location_extraction.py +++ b/anyway/parsers/location_extraction.py @@ -532,10 +532,7 @@ def extract_geo_features(db, newsflash: NewsFlash, use_existing_coordinates_only if not use_existing_coordinates_only: update_coordinates_and_resolution_using_location_text(newsflash) - if newsflash.resolution in [ - BE_CONST.ResolutionCategories.STREET, - BE_CONST.ResolutionCategories.SUBURBAN_ROAD, - ]: + if newsflash.resolution in BE_CONST.SUPPORTED_RESOLUTIONS: location_from_db = get_db_matching_location( db, newsflash.lat, newsflash.lon, newsflash.resolution, newsflash.road1 ) From dda200aaa9b2379e3c1a6601ae27beb3c0c3abab Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Thu, 26 Sep 2024 16:40:54 +0300 Subject: [PATCH 5/6] remove update_cbs_location_only --- anyway/parsers/news_flash.py | 6 +++--- main.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index a93dafd1..fb85bfa1 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -15,7 +15,7 @@ news_flash_classifiers = {"ynet": classify_rss, "twitter": classify_tweets, "walla": classify_rss} -def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=False): +def update_all_in_db(source=None, newsflash_id=None, use_existing_coordinates_only=False): """ main function for newsflash updating. @@ -30,13 +30,13 @@ def update_all_in_db(source=None, newsflash_id=None, update_cbs_location_only=Fa newsflash_items = db.get_all_newsflash() for i, newsflash in enumerate(newsflash_items): logging.debug(f"Updating news-flash:{newsflash.id}") - if not update_cbs_location_only: + if not use_existing_coordinates_only: classify = news_flash_classifiers[newsflash.source] newsflash.organization = classify_organization(newsflash.source) newsflash.accident = classify(newsflash.title) if newsflash.accident: extract_geo_features( - db=db, newsflash=newsflash, use_existing_coordinates_only=update_cbs_location_only + db=db, newsflash=newsflash, use_existing_coordinates_only=use_existing_coordinates_only ) if i % 1000 == 0: db.commit() diff --git a/main.py b/main.py index 91c86874..1db1d813 100755 --- a/main.py +++ b/main.py @@ -67,15 +67,15 @@ def update_news_flash(): @update_news_flash.command() @click.option("--source", default="", type=str) @click.option("--news_flash_id", default="", type=str) -@click.option("--update_cbs_location_only", is_flag=True) -def update(source, news_flash_id, update_cbs_location_only): +@click.option("--use_existing_coordinates_only", is_flag=True) +def update(source, news_flash_id, use_existing_coordinates_only): from anyway.parsers import news_flash if not source: source = None if not news_flash_id: news_flash_id = None - return news_flash.update_all_in_db(source, news_flash_id, update_cbs_location_only) + return news_flash.update_all_in_db(source, news_flash_id, use_existing_coordinates_only) @update_news_flash.command() From 0b8414f69dea1a9fa6bbe5b0a270a4c9ffb2b904 Mon Sep 17 00:00:00 2001 From: Atalya Alon Date: Thu, 26 Sep 2024 16:51:37 +0300 Subject: [PATCH 6/6] black formatting --- anyway/parsers/news_flash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anyway/parsers/news_flash.py b/anyway/parsers/news_flash.py index fb85bfa1..4d2191dd 100644 --- a/anyway/parsers/news_flash.py +++ b/anyway/parsers/news_flash.py @@ -36,7 +36,9 @@ def update_all_in_db(source=None, newsflash_id=None, use_existing_coordinates_on newsflash.accident = classify(newsflash.title) if newsflash.accident: extract_geo_features( - db=db, newsflash=newsflash, use_existing_coordinates_only=use_existing_coordinates_only + db=db, + newsflash=newsflash, + use_existing_coordinates_only=use_existing_coordinates_only, ) if i % 1000 == 0: db.commit()