Skip to content

Commit

Permalink
Merge pull request #87 from hynky1999/cc_indexes_server_arg
Browse files (browse the repository at this point in the history)
🔥 Removal of cc indexes arg
  • Loading branch information
hynky1999 authored Nov 20, 2023
2 parents f630dbd + eda7acb commit 02d6c05
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 11 deletions.
7 changes: 2 additions & 5 deletions cmoncrawl/aggregator/athena_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
crawl_url_to_name,
prepare_athena_sql_query,
)
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import (
get_all_CC_indexes,
remove_bucket_prefix,
Expand Down Expand Up @@ -79,7 +80,6 @@ class AthenaAggregator(IAggregator):
def __init__(
self,
urls: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType = MatchType.EXACT,
cc_servers: Optional[List[str]] = None,
since: datetime = datetime.min,
Expand All @@ -97,7 +97,6 @@ def __init__(
table_name: str = "ccindex",
) -> None:
self.urls = urls
self.cc_indexes_server = cc_indexes_server
self.match_type = match_type
self.cc_servers = cc_servers
self.since = since
Expand Down Expand Up @@ -143,9 +142,7 @@ async def aopen(self) -> AthenaAggregator:
)
async with ClientSession() as client:
if not self.cc_servers:
self.cc_servers = await get_all_CC_indexes(
client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(client, CC_INDEXES_SERVER)
# create bucket if not exists
async with self.aws_client.client("s3") as s3:
# Check if bucket exists
Expand Down
7 changes: 2 additions & 5 deletions cmoncrawl/aggregator/gateway_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)

from cmoncrawl.aggregator.base import IAggregator
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import (
crawl_to_year,
get_all_CC_indexes,
Expand Down Expand Up @@ -67,7 +68,6 @@ class GatewayAggregator(IAggregator):
def __init__(
self,
urls: List[str],
cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json",
match_type: MatchType = MatchType.EXACT,
cc_servers: Optional[List[str]] = None,
since: datetime = datetime.min,
Expand All @@ -79,7 +79,6 @@ def __init__(
max_requests_per_second: int = 20,
) -> None:
self.urls = urls
self.cc_indexes_server = cc_indexes_server
self.cc_servers = cc_servers
self.since = since
self.to = to
Expand All @@ -96,9 +95,7 @@ async def aopen(self) -> GatewayAggregator:
await self.client.__aenter__()

if not self.cc_servers:
self.cc_servers = await get_all_CC_indexes(
self.client, self.cc_indexes_server
)
self.cc_servers = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
return self

async def __aenter__(self) -> GatewayAggregator:
Expand Down
1 change: 1 addition & 0 deletions cmoncrawl/aggregator/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Endpoint handed to get_all_CC_indexes by the aggregators' aopen() when no
# explicit cc_servers list was supplied (presumably returns the list of
# available Common Crawl index servers -- confirm against the helper).
CC_INDEXES_SERVER = "https://index.commoncrawl.org/collinfo.json"
3 changes: 2 additions & 1 deletion tests/gateway_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
MatchType,
)
from cmoncrawl.aggregator.gateway_query import GatewayAggregator
from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER
from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes, unify_url_id


Expand Down Expand Up @@ -38,7 +39,7 @@ async def test_indexer_num_pages(self):
self.assertEqual(num_pages, 14)

async def test_indexer_all_CC(self):
indexes = await get_all_CC_indexes(self.client, self.di.cc_indexes_server)
indexes = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER)
indexes = sorted(indexes)
indexes = indexes[
: indexes.index("https://index.commoncrawl.org/CC-MAIN-2022-27-index") + 1
Expand Down

0 comments on commit 02d6c05

Please sign in to comment.