-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
73 lines (64 loc) · 2.63 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
class DonorScrapper:
    """Scrape the month-wise report grid of the site at ``start_url``.

    A Chrome WebDriver session is opened on construction. ``start`` fills in
    the report form for each configured year/month and ``scrape_pages`` writes
    the paginated result grid of one month to ``data/<year>_<month>.csv``.
    """

    # Years to request, given as the visible text of the year dropdown.
    YEARS = ("2020",)
    # Months (1-12) to scrape per year; a year missing here gets all 12 months.
    MONTHS_BY_YEAR = {"2020": (3, 4)}

    # Form control names / element ids on the report page.
    # NOTE: despite what the control names suggest, this scraper uses the
    # first dropdown to pick the year and the second to pick the month.
    YEAR_SELECT_NAME = "ctl00$ContentPlaceHolder1$ddl_Trans_Status"
    MONTH_SELECT_NAME = "ctl00$ContentPlaceHolder1$ddl_paymnet_Mode"
    SUBMIT_BUTTON_NAME = "ctl00$ContentPlaceHolder1$btnshow"
    GRID_ID = "ContentPlaceHolder1_grid_View"
    NEXT_BUTTON_ID = "ContentPlaceHolder1_grid_View_LinkButton3"

    # Directory the per-month CSV files are written to.
    OUTPUT_DIR = "data"

    def __init__(self):
        """Store the report URL and open a Chrome WebDriver session."""
        self.start_url = (
            "https://ereceipt.tn.gov.in/cmprf/Interface/CMPRF/MonthWiseReport"
        )
        self.browser = webdriver.Chrome()

    def start(self):
        """Scrape every configured year/month, then shut the browser down.

        The browser is quit in a ``finally`` clause so the Chrome/driver
        processes are released even when scraping fails part-way through.
        """
        try:
            self.browser.get(self.start_url)
            for year in self.YEARS:
                # Decide which months to visit before touching the page, so
                # no element lookups are spent on months that are skipped.
                for month in self.MONTHS_BY_YEAR.get(year, range(1, 13)):
                    year_select = Select(
                        self.browser.find_element(By.NAME, self.YEAR_SELECT_NAME)
                    )
                    year_select.select_by_visible_text(year)

                    month_select = Select(
                        self.browser.find_element(By.NAME, self.MONTH_SELECT_NAME)
                    )
                    month_select.select_by_value(str(month))

                    self.browser.find_element(
                        By.NAME, self.SUBMIT_BUTTON_NAME
                    ).click()
                    self.scrape_pages(int(year), month)

                    # Navigate away and back before the next month.
                    # NOTE(review): presumably this forces a fresh form
                    # instead of a paginated/post-back state - confirm
                    # before replacing it with a plain reload.
                    self.browser.get("https://google.com")
                    self.browser.get(self.start_url)
        finally:
            # quit() ends the whole WebDriver session, not just the window.
            self.browser.quit()

    def scrape_pages(self, year, month):
        """Write every page of the currently displayed result grid to a CSV.

        Reads the grid, then keeps clicking the "next" link and reading the
        grid again until that link is no longer present on the page.

        Args:
            year: Year of the report; used in the output file name.
            month: Month number of the report; used in the output file name.

        Raises:
            NoSuchElementException: If the result grid itself is missing.
                Only a missing "next" link is treated as the end of the
                report, so a broken page is not saved as a truncated CSV.
        """
        print(f"Scrapping details for year: {year} month: {month}")
        outfile = os.path.join(self.OUTPUT_DIR, f"{year}_{month}.csv")

        pages = [self._read_grid()]
        while True:
            try:
                next_btn = self.browser.find_element(By.ID, self.NEXT_BUTTON_ID)
            except NoSuchElementException:
                # all pages done
                break
            next_btn.click()
            pages.append(self._read_grid())

        # Create the output directory on first use instead of failing.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        # Concatenate once at the end rather than growing a frame per page.
        pd.concat(pages, ignore_index=True).to_csv(outfile, index=False)

    def _read_grid(self):
        """Return the result grid on the current page, minus its last row."""
        grid = self.browser.find_element(By.ID, self.GRID_ID)
        # Wrap the markup in StringIO: newer pandas deprecates passing a
        # literal HTML string to read_html.
        frame = pd.read_html(StringIO(grid.get_attribute("outerHTML")))[0]
        # The last row is discarded on every page.
        # NOTE(review): presumably it is the pager/footer row - confirm.
        return frame.iloc[:-1]
if __name__ == "__main__":
    # Script entry point: build the scraper and run it.
    DonorScrapper().start()