diff --git a/posts/2024-12-10-cpcb-download.ipynb b/posts/2024-12-10-cpcb-download.ipynb
index 17af3d7..72d227c 100644
--- a/posts/2024-12-10-cpcb-download.ipynb
+++ b/posts/2024-12-10-cpcb-download.ipynb
@@ -18,10 +18,13 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
+ "import re\n",
+ "from glob import glob\n",
"import pandas as pd\n",
"from tqdm.notebook import tqdm\n",
"from selenium import webdriver\n",
@@ -30,79 +33,105 @@
"from selenium.webdriver.support.ui import Select, WebDriverWait\n",
"from selenium.webdriver.common.action_chains import ActionChains\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
+ "from selenium.webdriver.chrome.options import Options\n",
"from time import sleep\n",
"\n",
"HOME_URL = \"https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing\"\n",
+ "DOWNLOAD_OLD_DATA_URL = \"https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/caaqm-data-repository\"\n",
"DOWNLOAD_PAGE_URL = \"https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/data\""
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 2,
"metadata": {},
+ "outputs": [],
"source": [
- "## Dry run to get metadata"
+ "def click_it(driver, element):\n",
+ "    \"\"\"Click `element` by running a JavaScript click through `driver`.\"\"\"\n",
+ "    js_click = \"arguments[0].click();\"\n",
+ "    driver.execute_script(js_click, element)\n",
+ " \n",
+ "def find_it(element, option):\n",
+ "    \"\"\"Return the li entry inside `element` whose text is `option`.\n",
+ "\n",
+ "    The XPath starts with `.//` so the search is scoped to `element`; an\n",
+ "    expression starting with `//` searches the whole document even when it\n",
+ "    is evaluated on an element.\n",
+ "\n",
+ "    An exact (whitespace-normalised) match is tried first so that a name\n",
+ "    such as 'Noida' does not resolve to 'Greater Noida'. The substring\n",
+ "    match is kept as a fallback for labels that only contain `option`.\n",
+ "\n",
+ "    Raises whatever `find_element` raises when no li entry matches.\n",
+ "    NOTE(review): `option` is interpolated unescaped, so a name containing\n",
+ "    a single quote would produce an invalid XPath.\n",
+ "    \"\"\"\n",
+ "    exact = element.find_elements(By.XPATH, f\".//li[normalize-space(text())='{option}']\")\n",
+ "    if exact:\n",
+ "        return exact[0]\n",
+ "    return element.find_element(By.XPATH, f\".//li[contains(text(), '{option}')]\")\n",
+ "\n",
+ "def select_dropdown_option(driver, element, option):\n",
+ "    \"\"\"Click the dropdown `element`, then click its entry labelled `option`.\"\"\"\n",
+ "    element.click()\n",
+ "    click_it(driver, find_it(element, option))"
]
},
{
- "cell_type": "code",
- "execution_count": 219,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "driver = webdriver.Chrome()\n",
- "driver.get(HOME_URL)"
+ "## Dry run to get metadata"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 3,
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "Enter Captcha manually before moving ahead"
+ "# Run Chrome headless, i.e. without opening a visible browser window\n",
+ "options = Options()\n",
+ "options.add_argument(\"--headless\")\n",
+ "\n",
+ "# open the browser\n",
+ "driver = webdriver.Chrome(options=options)\n",
+ "\n",
+ "# open the website (the data-repository page, DOWNLOAD_OLD_DATA_URL)\n",
+ "driver.get(DOWNLOAD_OLD_DATA_URL)\n",
+ "\n",
+ "# Wait up to 10 seconds for `.select-box` elements to be present in the DOM;\n",
+ "# `dropdowns` is the list of matching elements.\n",
+ "# The cell ends with the count because the next cell unpacks exactly five of them.\n",
+ "dropdowns = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, \".select-box\")))\n",
+ "len(dropdowns)"
]
},
{
"cell_type": "code",
- "execution_count": 214,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
- "driver.get(DOWNLOAD_PAGE_URL)"
+ "# Unpacking raises ValueError unless `dropdowns` holds exactly five elements.\n",
+ "# NOTE(review): the names assume this is the on-page order of the dropdowns - confirm against the site.\n",
+ "drop_data_type, drop_frequency, drop_states, drop_cities, drop_stations = dropdowns"
]
},
{
"cell_type": "code",
- "execution_count": 215,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
- "ename": "AssertionError",
- "evalue": "0",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[215], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m dropdowns \u001b[38;5;241m=\u001b[39m driver\u001b[38;5;241m.\u001b[39mfind_elements(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcss selector\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.select-box\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(dropdowns) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m3\u001b[39m, \u001b[38;5;28mlen\u001b[39m(dropdowns)\n",
- "\u001b[0;31mAssertionError\u001b[0m: 0"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of states: 31\n"
]
}
],
"source": [
- "dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- "assert len(dropdowns) == 3, len(dropdowns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dropdowns[0].click() # Open\n",
- "states = dropdowns[0].text.replace(\"Select ...\\n▲\\n\", \"\").split(\"\\n\")\n",
- "dropdowns[0].click() # Close\n",
- "assert len(states) == 31\n",
- "for state in states:\n",
- " metadata_dict[state] = {}"
+ "# Select data type\n",
+ "select_dropdown_option(driver, drop_data_type, \"Raw data\")\n",
+ "\n",
+ "# Select frequency\n",
+ "select_dropdown_option(driver, drop_frequency, \"1 day\")\n",
+ "\n",
+ "# Get the states\n",
+ "# The names are read from the text of the opened dropdown: the ▲ marker line\n",
+ "# is stripped and the remaining text is split into one name per line.\n",
+ "drop_states.click() # Open the dropdown\n",
+ "states = drop_states.text.replace(\"▲\\n\", \"\").split(\"\\n\")\n",
+ "print(\"Number of states:\", len(states))\n",
+ "drop_states.click() # Close the dropdown"
]
},
{
@@ -113,419 +142,61 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "3a2d9b1eb2fa4322b3727b021b0b7629",
+ "model_id": "957f5ecd8a40427aa882ef39f65d90c7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
- " 0%| | 0/31 [00:00, ?it/s]"
+ " 0%| | 0/600 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "state='Andhra Pradesh'\n",
- "cities=['Amaravati', 'Anantapur', 'Chittoor', 'Kadapa', 'Rajamahendravaram', 'Tirupati', 'Vijayawada', 'Visakhapatnam']\n",
- "state='Andhra Pradesh', city='Amaravati'\n",
- "state='Andhra Pradesh', city='Anantapur'\n",
- "state='Andhra Pradesh', city='Chittoor'\n",
- "state='Andhra Pradesh', city='Kadapa'\n",
- "state='Andhra Pradesh', city='Rajamahendravaram'\n",
- "state='Andhra Pradesh', city='Tirupati'\n",
- "state='Andhra Pradesh', city='Vijayawada'\n",
- "state='Andhra Pradesh', city='Visakhapatnam'\n",
- "state='Arunachal Pradesh'\n",
- "cities=['Naharlagun']\n",
- "state='Arunachal Pradesh', city='Naharlagun'\n",
- "state='Assam'\n",
- "cities=['Byrnihat', 'Guwahati', 'Nagaon', 'Nalbari', 'Silchar', 'Sivasagar']\n",
- "state='Assam', city='Byrnihat'\n",
- "state='Assam', city='Guwahati'\n",
- "state='Assam', city='Nagaon'\n",
- "state='Assam', city='Nalbari'\n",
- "state='Assam', city='Silchar'\n",
- "state='Assam', city='Sivasagar'\n",
- "state='Bihar'\n",
- "cities=['Araria', 'Arrah', 'Aurangabad', 'Begusarai', 'Bettiah', 'Bhagalpur', 'Bihar Sharif', 'Buxar', 'Chhapra', 'Darbhanga', 'Gaya', 'Hajipur', 'Katihar', 'Kishanganj', 'Manguraha', 'Motihari', 'Munger', 'Muzaffarpur', 'Patna', 'Purnia', 'Rajgir', 'Saharsa', 'Samastipur', 'Sasaram', 'Siwan']\n",
- "state='Bihar', city='Araria'\n",
- "state='Bihar', city='Arrah'\n",
- "state='Bihar', city='Aurangabad'\n",
- "state='Bihar', city='Begusarai'\n",
- "state='Bihar', city='Bettiah'\n",
- "state='Bihar', city='Bhagalpur'\n",
- "state='Bihar', city='Bihar Sharif'\n",
- "state='Bihar', city='Buxar'\n",
- "state='Bihar', city='Chhapra'\n",
- "state='Bihar', city='Darbhanga'\n",
- "state='Bihar', city='Gaya'\n",
- "state='Bihar', city='Hajipur'\n",
- "state='Bihar', city='Katihar'\n",
- "state='Bihar', city='Kishanganj'\n",
- "state='Bihar', city='Manguraha'\n",
- "state='Bihar', city='Motihari'\n",
- "state='Bihar', city='Munger'\n",
- "state='Bihar', city='Muzaffarpur'\n",
- "state='Bihar', city='Patna'\n",
- "state='Bihar', city='Purnia'\n",
- "state='Bihar', city='Rajgir'\n",
- "state='Bihar', city='Saharsa'\n",
- "state='Bihar', city='Samastipur'\n",
- "state='Bihar', city='Sasaram'\n",
- "state='Bihar', city='Siwan'\n",
- "state='Chandigarh'\n",
- "cities=['Chandigarh']\n",
- "state='Chandigarh', city='Chandigarh'\n",
- "state='Chhattisgarh'\n",
- "cities=['Bhilai', 'Bilaspur', 'Chhal', 'Korba', 'Kunjemura', 'Milupara', 'Raipur', 'Tumidih']\n",
- "state='Chhattisgarh', city='Bhilai'\n",
- "state='Chhattisgarh', city='Bilaspur'\n",
- "state='Chhattisgarh', city='Chhal'\n",
- "state='Chhattisgarh', city='Korba'\n",
- "state='Chhattisgarh', city='Kunjemura'\n",
- "state='Chhattisgarh', city='Milupara'\n",
- "state='Chhattisgarh', city='Raipur'\n",
- "state='Chhattisgarh', city='Tumidih'\n",
- "state='Delhi'\n",
- "cities=['Delhi']\n",
- "state='Delhi', city='Delhi'\n",
- "state='Gujarat'\n",
- "cities=['Ahmedabad', 'Ankleshwar', 'Gandhinagar', 'Nandesari', 'Surat', 'Vapi', 'Vatva']\n",
- "state='Gujarat', city='Ahmedabad'\n",
- "state='Gujarat', city='Ankleshwar'\n",
- "state='Gujarat', city='Gandhinagar'\n",
- "state='Gujarat', city='Nandesari'\n",
- "state='Gujarat', city='Surat'\n",
- "state='Gujarat', city='Vapi'\n",
- "state='Gujarat', city='Vatva'\n",
- "state='Haryana'\n",
- "cities=['Ambala', 'Bahadurgarh', 'Ballabgarh', 'Bhiwani', 'Charkhi Dadri', 'Dharuhera', 'Faridabad', 'Fatehabad', 'Gurugram', 'Hisar', 'Jind', 'Kaithal', 'Karnal', 'Kurukshetra', 'Mandikhera', 'Manesar', 'Narnaul', 'Palwal', 'Panchkula', 'Panipat', 'Rohtak', 'Sirsa', 'Sonipat', 'Yamuna Nagar']\n",
- "state='Haryana', city='Ambala'\n",
- "state='Haryana', city='Bahadurgarh'\n",
- "state='Haryana', city='Ballabgarh'\n",
- "state='Haryana', city='Bhiwani'\n",
- "state='Haryana', city='Charkhi Dadri'\n",
- "state='Haryana', city='Dharuhera'\n",
- "state='Haryana', city='Faridabad'\n",
- "state='Haryana', city='Fatehabad'\n",
- "state='Haryana', city='Gurugram'\n",
- "state='Haryana', city='Hisar'\n",
- "state='Haryana', city='Jind'\n",
- "state='Haryana', city='Kaithal'\n",
- "state='Haryana', city='Karnal'\n",
- "state='Haryana', city='Kurukshetra'\n",
- "state='Haryana', city='Mandikhera'\n",
- "state='Haryana', city='Manesar'\n",
- "state='Haryana', city='Narnaul'\n",
- "state='Haryana', city='Palwal'\n",
- "state='Haryana', city='Panchkula'\n",
- "state='Haryana', city='Panipat'\n",
- "state='Haryana', city='Rohtak'\n",
- "state='Haryana', city='Sirsa'\n",
- "state='Haryana', city='Sonipat'\n",
- "state='Haryana', city='Yamuna Nagar'\n",
- "state='Himachal Pradesh'\n",
- "cities=['Baddi']\n",
- "state='Himachal Pradesh', city='Baddi'\n",
- "state='Jammu and Kashmir'\n",
- "cities=['Srinagar']\n",
- "state='Jammu and Kashmir', city='Srinagar'\n",
- "state='Jharkhand'\n",
- "cities=['Dhanbad', 'Jorapokhar', 'Pathardih']\n",
- "state='Jharkhand', city='Dhanbad'\n",
- "state='Jharkhand', city='Jorapokhar'\n",
- "state='Jharkhand', city='Pathardih'\n",
- "state='Karnataka'\n",
- "cities=['Bagalkot', 'Belgaum', 'Bengaluru', 'Bidar', 'Chamarajanagar', 'Chikkaballapur', 'Chikkamagaluru', 'Davanagere', 'Dharwad', 'Gadag', 'Hassan', 'Haveri', 'Hubballi', 'Kalaburagi', 'Karwar', 'Kolar', 'Koppal', 'Madikeri', 'Mangalore', 'Mysuru', 'Raichur', 'Ramanagara', 'Shivamogga', 'Tumakuru', 'Udupi', 'Vijayapura', 'Yadgir']\n",
- "state='Karnataka', city='Bagalkot'\n",
- "state='Karnataka', city='Belgaum'\n",
- "state='Karnataka', city='Bengaluru'\n",
- "state='Karnataka', city='Bidar'\n",
- "state='Karnataka', city='Chamarajanagar'\n",
- "state='Karnataka', city='Chikkaballapur'\n",
- "state='Karnataka', city='Chikkamagaluru'\n",
- "state='Karnataka', city='Davanagere'\n",
- "state='Karnataka', city='Dharwad'\n",
- "state='Karnataka', city='Gadag'\n",
- "state='Karnataka', city='Hassan'\n",
- "state='Karnataka', city='Haveri'\n",
- "state='Karnataka', city='Hubballi'\n",
- "state='Karnataka', city='Kalaburagi'\n",
- "state='Karnataka', city='Karwar'\n",
- "state='Karnataka', city='Kolar'\n",
- "state='Karnataka', city='Koppal'\n",
- "state='Karnataka', city='Madikeri'\n",
- "state='Karnataka', city='Mangalore'\n",
- "state='Karnataka', city='Mysuru'\n",
- "state='Karnataka', city='Raichur'\n",
- "state='Karnataka', city='Ramanagara'\n",
- "state='Karnataka', city='Shivamogga'\n",
- "state='Karnataka', city='Tumakuru'\n",
- "state='Karnataka', city='Udupi'\n",
- "state='Karnataka', city='Vijayapura'\n",
- "state='Karnataka', city='Yadgir'\n",
- "state='Kerala'\n",
- "cities=['Eloor', 'Ernakulam', 'Kannur', 'Kochi', 'Kollam', 'Kozhikode', 'Thiruvananthapuram', 'Thrissur']\n",
- "state='Kerala', city='Eloor'\n",
- "state='Kerala', city='Ernakulam'\n",
- "state='Kerala', city='Kannur'\n",
- "state='Kerala', city='Kochi'\n",
- "state='Kerala', city='Kollam'\n",
- "state='Kerala', city='Kozhikode'\n",
- "state='Kerala', city='Thiruvananthapuram'\n",
- "state='Kerala', city='Thrissur'\n",
- "state='Madhya Pradesh'\n",
- "cities=['Bhopal', 'Damoh', 'Dewas', 'Gwalior', 'Indore', 'Jabalpur', 'Katni', 'Maihar', 'Mandideep', 'Pithampur', 'Ratlam', 'Sagar', 'Satna', 'Singrauli', 'Ujjain']\n",
- "state='Madhya Pradesh', city='Bhopal'\n",
- "state='Madhya Pradesh', city='Damoh'\n",
- "state='Madhya Pradesh', city='Dewas'\n",
- "state='Madhya Pradesh', city='Gwalior'\n",
- "state='Madhya Pradesh', city='Indore'\n",
- "state='Madhya Pradesh', city='Jabalpur'\n",
- "state='Madhya Pradesh', city='Katni'\n",
- "state='Madhya Pradesh', city='Maihar'\n",
- "state='Madhya Pradesh', city='Mandideep'\n",
- "state='Madhya Pradesh', city='Pithampur'\n",
- "state='Madhya Pradesh', city='Ratlam'\n",
- "state='Madhya Pradesh', city='Sagar'\n",
- "state='Madhya Pradesh', city='Satna'\n",
- "state='Madhya Pradesh', city='Singrauli'\n",
- "state='Madhya Pradesh', city='Ujjain'\n",
- "state='Maharashtra'\n",
- "cities=['Ahmednagar', 'Akola', 'Amravati', 'Aurangabad', 'Badlapur', 'Belapur', 'Bhiwandi', 'Boisar', 'Chandrapur', 'Dhule', 'Jalgaon', 'Jalna', 'Kalyan', 'Kolhapur', 'Latur', 'Mahad', 'Malegaon', 'Mira-Bhayandar', 'Mumbai', 'Nagpur', 'Nanded', 'Nashik', 'Navi Mumbai', 'Parbhani', 'Pimpri-Chinchwad', 'Pune', 'Sangli', 'Solapur', 'Thane', 'Ulhasnagar', 'Virar']\n",
- "state='Maharashtra', city='Ahmednagar'\n",
- "state='Maharashtra', city='Akola'\n",
- "state='Maharashtra', city='Amravati'\n",
- "state='Maharashtra', city='Aurangabad'\n",
- "state='Maharashtra', city='Badlapur'\n",
- "state='Maharashtra', city='Belapur'\n",
- "state='Maharashtra', city='Bhiwandi'\n",
- "state='Maharashtra', city='Boisar'\n",
- "state='Maharashtra', city='Chandrapur'\n",
- "state='Maharashtra', city='Dhule'\n",
- "state='Maharashtra', city='Jalgaon'\n",
- "state='Maharashtra', city='Jalna'\n",
- "state='Maharashtra', city='Kalyan'\n",
- "state='Maharashtra', city='Kolhapur'\n",
- "state='Maharashtra', city='Latur'\n",
- "state='Maharashtra', city='Mahad'\n",
- "state='Maharashtra', city='Malegaon'\n",
- "state='Maharashtra', city='Mira-Bhayandar'\n",
- "state='Maharashtra', city='Mumbai'\n",
- "state='Maharashtra', city='Nagpur'\n",
- "state='Maharashtra', city='Nanded'\n",
- "state='Maharashtra', city='Nashik'\n",
- "state='Maharashtra', city='Navi Mumbai'\n",
- "state='Maharashtra', city='Parbhani'\n",
- "state='Maharashtra', city='Pimpri-Chinchwad'\n",
- "state='Maharashtra', city='Pune'\n",
- "state='Maharashtra', city='Sangli'\n",
- "state='Maharashtra', city='Solapur'\n",
- "state='Maharashtra', city='Thane'\n",
- "state='Maharashtra', city='Ulhasnagar'\n",
- "state='Maharashtra', city='Virar'\n",
- "state='Manipur'\n",
- "cities=['Imphal']\n",
- "state='Manipur', city='Imphal'\n",
- "state='Meghalaya'\n",
- "cities=['Shillong']\n",
- "state='Meghalaya', city='Shillong'\n",
- "state='Mizoram'\n",
- "cities=['Aizawl']\n",
- "state='Mizoram', city='Aizawl'\n",
- "state='Nagaland'\n",
- "cities=['Kohima']\n",
- "state='Nagaland', city='Kohima'\n",
- "state='Odisha'\n",
- "cities=['Angul', 'Balasore', 'Barbil', 'Baripada', 'Bhubaneswar', 'Bileipada', 'Brajrajnagar', 'Byasanagar', 'Cuttack', 'Keonjhar', 'Nayagarh', 'Rairangpur', 'Rourkela', 'Suakati', 'Talcher', 'Tensa']\n",
- "state='Odisha', city='Angul'\n",
- "state='Odisha', city='Balasore'\n",
- "state='Odisha', city='Barbil'\n",
- "state='Odisha', city='Baripada'\n",
- "state='Odisha', city='Bhubaneswar'\n",
- "state='Odisha', city='Bileipada'\n",
- "state='Odisha', city='Brajrajnagar'\n",
- "state='Odisha', city='Byasanagar'\n",
- "state='Odisha', city='Cuttack'\n",
- "state='Odisha', city='Keonjhar'\n",
- "state='Odisha', city='Nayagarh'\n",
- "state='Odisha', city='Rairangpur'\n",
- "state='Odisha', city='Rourkela'\n",
- "state='Odisha', city='Suakati'\n",
- "state='Odisha', city='Talcher'\n",
- "state='Odisha', city='Tensa'\n",
- "state='Puducherry'\n",
- "cities=['Puducherry']\n",
- "state='Puducherry', city='Puducherry'\n",
- "state='Punjab'\n",
- "cities=['Amritsar', 'Bathinda', 'Jalandhar', 'Khanna', 'Ludhiana', 'Mandi Gobindgarh', 'Patiala', 'Rupnagar']\n",
- "state='Punjab', city='Amritsar'\n",
- "state='Punjab', city='Bathinda'\n",
- "state='Punjab', city='Jalandhar'\n",
- "state='Punjab', city='Khanna'\n",
- "state='Punjab', city='Ludhiana'\n",
- "state='Punjab', city='Mandi Gobindgarh'\n",
- "state='Punjab', city='Patiala'\n",
- "state='Punjab', city='Rupnagar'\n",
- "state='Rajasthan'\n",
- "cities=['Ajmer', 'Alwar', 'Banswara', 'Baran', 'Barmer', 'Bharatpur', 'Bhilwara', 'Bhiwadi', 'Bikaner', 'Bundi', 'Chittorgarh', 'Churu', 'Dausa', 'Dholpur', 'Dungarpur', 'Hanumangarh', 'Jaipur', 'Jaisalmer', 'Jalore', 'Jhalawar', 'Jhunjhunu', 'Jodhpur', 'Karauli', 'Kota', 'Nagaur', 'Pali', 'Pratapgarh', 'Rajsamand', 'Sawai Madhopur', 'Sikar', 'Sirohi', 'Sri Ganganagar', 'Tonk', 'Udaipur']\n",
- "state='Rajasthan', city='Ajmer'\n",
- "state='Rajasthan', city='Alwar'\n",
- "state='Rajasthan', city='Banswara'\n",
- "state='Rajasthan', city='Baran'\n",
- "state='Rajasthan', city='Barmer'\n",
- "state='Rajasthan', city='Bharatpur'\n",
- "state='Rajasthan', city='Bhilwara'\n",
- "state='Rajasthan', city='Bhiwadi'\n",
- "state='Rajasthan', city='Bikaner'\n",
- "state='Rajasthan', city='Bundi'\n",
- "state='Rajasthan', city='Chittorgarh'\n",
- "state='Rajasthan', city='Churu'\n",
- "state='Rajasthan', city='Dausa'\n",
- "state='Rajasthan', city='Dholpur'\n",
- "state='Rajasthan', city='Dungarpur'\n",
- "state='Rajasthan', city='Hanumangarh'\n",
- "state='Rajasthan', city='Jaipur'\n",
- "state='Rajasthan', city='Jaisalmer'\n",
- "state='Rajasthan', city='Jalore'\n",
- "state='Rajasthan', city='Jhalawar'\n",
- "state='Rajasthan', city='Jhunjhunu'\n",
- "state='Rajasthan', city='Jodhpur'\n",
- "state='Rajasthan', city='Karauli'\n",
- "state='Rajasthan', city='Kota'\n",
- "state='Rajasthan', city='Nagaur'\n",
- "state='Rajasthan', city='Pali'\n",
- "state='Rajasthan', city='Pratapgarh'\n",
- "state='Rajasthan', city='Rajsamand'\n",
- "state='Rajasthan', city='Sawai Madhopur'\n",
- "state='Rajasthan', city='Sikar'\n",
- "state='Rajasthan', city='Sirohi'\n",
- "state='Rajasthan', city='Sri Ganganagar'\n",
- "state='Rajasthan', city='Tonk'\n",
- "state='Rajasthan', city='Udaipur'\n",
- "state='Sikkim'\n",
- "cities=['Gangtok']\n",
- "state='Sikkim', city='Gangtok'\n",
- "state='Tamil Nadu'\n",
- "cities=['Ariyalur', 'Chengalpattu', 'Chennai', 'Coimbatore', 'Cuddalore', 'Dindigul', 'Gummidipoondi', 'Hosur', 'Kanchipuram', 'Karur', 'Madurai', 'Nagapattinam', 'Ooty', 'Palkalaiperur', 'Perundurai', 'Pudukottai', 'Ramanathapuram', 'Ranipet', 'Salem', 'Thanjavur', 'Thoothukudi', 'Tiruchirappalli', 'Tirunelveli', 'Tirupur', 'Vellore', 'Virudhunagar']\n",
- "state='Tamil Nadu', city='Ariyalur'\n",
- "state='Tamil Nadu', city='Chengalpattu'\n",
- "state='Tamil Nadu', city='Chennai'\n",
- "state='Tamil Nadu', city='Coimbatore'\n",
- "state='Tamil Nadu', city='Cuddalore'\n",
- "state='Tamil Nadu', city='Dindigul'\n",
- "state='Tamil Nadu', city='Gummidipoondi'\n",
- "state='Tamil Nadu', city='Hosur'\n",
- "state='Tamil Nadu', city='Kanchipuram'\n",
- "state='Tamil Nadu', city='Karur'\n",
- "state='Tamil Nadu', city='Madurai'\n",
- "state='Tamil Nadu', city='Nagapattinam'\n",
- "state='Tamil Nadu', city='Ooty'\n",
- "state='Tamil Nadu', city='Palkalaiperur'\n",
- "state='Tamil Nadu', city='Perundurai'\n",
- "state='Tamil Nadu', city='Pudukottai'\n",
- "state='Tamil Nadu', city='Ramanathapuram'\n",
- "state='Tamil Nadu', city='Ranipet'\n",
- "state='Tamil Nadu', city='Salem'\n",
- "state='Tamil Nadu', city='Thanjavur'\n",
- "state='Tamil Nadu', city='Thoothukudi'\n",
- "state='Tamil Nadu', city='Tiruchirappalli'\n",
- "state='Tamil Nadu', city='Tirunelveli'\n",
- "state='Tamil Nadu', city='Tirupur'\n",
- "state='Tamil Nadu', city='Vellore'\n",
- "state='Tamil Nadu', city='Virudhunagar'\n",
- "state='Telangana'\n",
- "cities=['Hyderabad']\n",
- "state='Telangana', city='Hyderabad'\n",
- "state='Tripura'\n",
- "cities=['Agartala']\n",
- "state='Tripura', city='Agartala'\n",
- "state='Uttar Pradesh'\n",
- "cities=['Agra', 'Baghpat', 'Bareilly', 'Bulandshahr', 'Firozabad', 'Ghaziabad', 'Gorakhpur', 'Greater Noida', 'Hapur', 'Jhansi', 'Kanpur', 'Khurja', 'Lucknow', 'Meerut', 'Moradabad', 'Muzaffarnagar', 'Noida', 'Prayagraj', 'Varanasi', 'Vrindavan']\n",
- "state='Uttar Pradesh', city='Agra'\n",
- "state='Uttar Pradesh', city='Baghpat'\n",
- "state='Uttar Pradesh', city='Bareilly'\n",
- "state='Uttar Pradesh', city='Bulandshahr'\n",
- "state='Uttar Pradesh', city='Firozabad'\n",
- "state='Uttar Pradesh', city='Ghaziabad'\n",
- "state='Uttar Pradesh', city='Gorakhpur'\n",
- "state='Uttar Pradesh', city='Greater Noida'\n",
- "state='Uttar Pradesh', city='Hapur'\n",
- "state='Uttar Pradesh', city='Jhansi'\n",
- "state='Uttar Pradesh', city='Kanpur'\n",
- "state='Uttar Pradesh', city='Khurja'\n",
- "state='Uttar Pradesh', city='Lucknow'\n",
- "state='Uttar Pradesh', city='Meerut'\n",
- "state='Uttar Pradesh', city='Moradabad'\n",
- "state='Uttar Pradesh', city='Muzaffarnagar'\n",
- "state='Uttar Pradesh', city='Noida'\n",
- "state='Uttar Pradesh', city='Prayagraj'\n",
- "state='Uttar Pradesh', city='Varanasi'\n",
- "state='Uttar Pradesh', city='Vrindavan'\n",
- "state='Uttarakhand'\n",
- "cities=['Dehradun', 'Kashipur', 'Rishikesh']\n",
- "state='Uttarakhand', city='Dehradun'\n",
- "state='Uttarakhand', city='Kashipur'\n",
- "state='Uttarakhand', city='Rishikesh'\n",
- "state='West Bengal'\n",
- "cities=['Asansol', 'Barrackpore', 'Durgapur', 'Haldia', 'Howrah', 'Kolkata', 'Siliguri']\n",
- "state='West Bengal', city='Asansol'\n",
- "state='West Bengal', city='Barrackpore'\n",
- "state='West Bengal', city='Durgapur'\n",
- "state='West Bengal', city='Haldia'\n",
- "state='West Bengal', city='Howrah'\n",
- "state='West Bengal', city='Kolkata'\n",
- "state='West Bengal', city='Siliguri'\n"
- ]
}
],
"source": [
- "metadata_df = pd.DataFrame(columns=[\"State\", \"City\", \"Station\"])\n",
- "for state in tqdm(metadata_dict):\n",
- " print(f\"{state=}\")\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[0].click() # Open\n",
- " # select state\n",
- " option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{state}')]\")\n",
- " option.click() # Select and Close\n",
- " \n",
- " sleep(0.1)\n",
+ "metadata_df = pd.DataFrame(columns=[\"State\", \"City\", \"Station\", \"site_id\"])\n",
+ "\n",
+ "# Walk state -> city -> station through the dropdowns and append one row per\n",
+ "# station. Uses `states`, `driver` and the `drop_*` elements from the cells above.\n",
+ "# This loop took less than a minute to run\n",
+ "progress_bar = tqdm(total=600) # as of 2024, 560 stations. update this number if it changes\n",
+ "for state in states:\n",
+ "    select_dropdown_option(driver, drop_states, state)\n",
" \n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[1].click() # Open\n",
"    # Get all cities\n",
- " cities = dropdowns[1].text.replace(\"Select ...\\n▲\\n\", \"\").split(\"\\n\")\n",
- " print(f\"{cities=}\")\n",
- " metadata_dict[state] = {city: {} for city in cities}\n",
- " dropdowns[1].click() # Close\n",
- " \n",
- " sleep(0.1)\n",
+ "    drop_cities.click() # Open the dropdown\n",
+ "    cities = drop_cities.text.replace(\"▲\\n\", \"\").split(\"\\n\")\n",
+ "    drop_cities.click() # Close the dropdown\n",
" \n",
"    for city in cities:\n",
- " print(f\"{state=}, {city=}\")\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[1].click()\n",
- " option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{city}')]\")\n",
- " option.click() # Select and Close\n",
- " \n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
+ "        select_dropdown_option(driver, drop_cities, city)\n",
" \n",
- " sleep(0.1)\n",
- "\n",
"        # Get all stations\n",
- " dropdowns[2].click()\n",
- " stations = dropdowns[2].text.replace(\"Select ...\\n▲\\n\", \"\").split(\"\\n\")\n",
+ "        drop_stations.click() # Open the dropdown\n",
+ "        stations = drop_stations.text.replace(\"▲\\n\", \"\").split(\"\\n\")\n",
+ "        drop_stations.click() # Close the dropdown\n",
+ " \n",
"        for station in stations:\n",
- " metadata_df.loc[len(metadata_df)] = [state, city, station]\n",
- " sleep(0.1)"
+ "            # corner cases: the site ids of these two stations are hard-coded\n",
+ "            # instead of being read from the page\n",
+ "            if station == \"Municipal Corporation Office, Dharuhera - HSPCB\":\n",
+ "                site_id = \"site_5044\"\n",
+ "            elif station == \"Civil Lines, Ajmer - RSPCB\":\n",
+ "                site_id = \"site_1392\"\n",
+ "            else:\n",
+ "                try:\n",
+ "                    select_dropdown_option(driver, drop_stations, station)\n",
+ "                except Exception as err:\n",
+ "                    # Catch Exception rather than everything, so that\n",
+ "                    # KeyboardInterrupt / SystemExit still stop the loop,\n",
+ "                    # and report the error instead of hiding it.\n",
+ "                    print(\"Unable to select station\")\n",
+ "                    print(station)\n",
+ "                    print(drop_stations.text)\n",
+ "                    print(repr(err))\n",
+ "                    continue  # skipped stations get no row and no progress tick\n",
+ "                # After a successful selection the id is read from the dropdown's attribute\n",
+ "                site_id = drop_stations.get_attribute(\"ng-reflect-model\")\n",
+ "            metadata_df.loc[len(metadata_df)] = [state, city, station, site_id]\n",
+ "            progress_bar.update(1)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -534,7 +205,7 @@
"560"
]
},
- "execution_count": 31,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -545,7 +216,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -572,6 +243,7 @@
"
State | \n",
" City | \n",
" Station | \n",
+ " site_id | \n",
" \n",
" \n",
" \n",
@@ -580,30 +252,35 @@
" Andhra Pradesh | \n",
" Amaravati | \n",
" Secretariat, Amaravati - APPCB | \n",
+ " site_1406 | \n",
" \n",
" \n",
" 1 | \n",
" Andhra Pradesh | \n",
" Anantapur | \n",
" Gulzarpet, Anantapur - APPCB | \n",
+ " site_5632 | \n",
"
\n",
" \n",
" 2 | \n",
" Andhra Pradesh | \n",
" Chittoor | \n",
" Gangineni Cheruvu, Chittoor - APPCB | \n",
+ " site_5665 | \n",
"
\n",
" \n",
" 3 | \n",
" Andhra Pradesh | \n",
" Kadapa | \n",
" Yerramukkapalli, Kadapa - APPCB | \n",
+ " site_5693 | \n",
"
\n",
" \n",
" 4 | \n",
" Andhra Pradesh | \n",
" Rajamahendravaram | \n",
" Anand Kala Kshetram, Rajamahendravaram - APPCB | \n",
+ " site_1399 | \n",
"
\n",
" \n",
"\n",
@@ -617,15 +294,15 @@
"3 Andhra Pradesh Kadapa \n",
"4 Andhra Pradesh Rajamahendravaram \n",
"\n",
- " Station \n",
- "0 Secretariat, Amaravati - APPCB \n",
- "1 Gulzarpet, Anantapur - APPCB \n",
- "2 Gangineni Cheruvu, Chittoor - APPCB \n",
- "3 Yerramukkapalli, Kadapa - APPCB \n",
- "4 Anand Kala Kshetram, Rajamahendravaram - APPCB "
+ " Station site_id \n",
+ "0 Secretariat, Amaravati - APPCB site_1406 \n",
+ "1 Gulzarpet, Anantapur - APPCB site_5632 \n",
+ "2 Gangineni Cheruvu, Chittoor - APPCB site_5665 \n",
+ "3 Yerramukkapalli, Kadapa - APPCB site_5693 \n",
+ "4 Anand Kala Kshetram, Rajamahendravaram - APPCB site_1399 "
]
},
- "execution_count": 32,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -636,7 +313,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -663,6 +340,7 @@
" State | \n",
" City | \n",
" Station | \n",
+ " site_id | \n",
" \n",
" \n",
" \n",
@@ -670,46 +348,58 @@
" 555 | \n",
" West Bengal | \n",
" Kolkata | \n",
- " Jadavpur, Kolkata - WBPCB | \n",
+ " Rabindra Bharati University, Kolkata - WBPCB | \n",
+ " site_296 | \n",
" \n",
" \n",
" 556 | \n",
" West Bengal | \n",
" Kolkata | \n",
- " Rabindra Bharati University, Kolkata - WBPCB | \n",
+ " Fort William, Kolkata - WBPCB | \n",
+ " site_5110 | \n",
"
\n",
" \n",
" 557 | \n",
" West Bengal | \n",
" Kolkata | \n",
- " Rabindra Sarobar, Kolkata - WBPCB | \n",
+ " Victoria, Kolkata - WBPCB | \n",
+ " site_309 | \n",
"
\n",
" \n",
" 558 | \n",
" West Bengal | \n",
" Kolkata | \n",
- " Victoria, Kolkata - WBPCB | \n",
+ " Bidhannagar, Kolkata - WBPCB | \n",
+ " site_5129 | \n",
"
\n",
" \n",
" 559 | \n",
" West Bengal | \n",
" Siliguri | \n",
" Ward-32 Bapupara, Siliguri - WBPCB | \n",
+ " site_1419 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " State City Station\n",
- "555 West Bengal Kolkata Jadavpur, Kolkata - WBPCB\n",
- "556 West Bengal Kolkata Rabindra Bharati University, Kolkata - WBPCB\n",
- "557 West Bengal Kolkata Rabindra Sarobar, Kolkata - WBPCB\n",
- "558 West Bengal Kolkata Victoria, Kolkata - WBPCB\n",
- "559 West Bengal Siliguri Ward-32 Bapupara, Siliguri - WBPCB"
+ " State City Station \\\n",
+ "555 West Bengal Kolkata Rabindra Bharati University, Kolkata - WBPCB \n",
+ "556 West Bengal Kolkata Fort William, Kolkata - WBPCB \n",
+ "557 West Bengal Kolkata Victoria, Kolkata - WBPCB \n",
+ "558 West Bengal Kolkata Bidhannagar, Kolkata - WBPCB \n",
+ "559 West Bengal Siliguri Ward-32 Bapupara, Siliguri - WBPCB \n",
+ "\n",
+ " site_id \n",
+ "555 site_296 \n",
+ "556 site_5110 \n",
+ "557 site_309 \n",
+ "558 site_5129 \n",
+ "559 site_1419 "
]
},
- "execution_count": 33,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -720,453 +410,315 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "metadata_df.to_csv(\"metadata.csv\", index=False)"
- ]
- },
- {
- "cell_type": "markdown",
+ "execution_count": 33,
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " State City \\\n",
+ "498 Uttar Pradesh Greater Noida \n",
+ "525 Uttar Pradesh Noida \n",
+ "\n",
+ " Station site_id \n",
+ "498 Knowledge Park - III, Greater Noida - UPPCB site_1541 \n",
+ "525 Knowledge Park - III, Greater Noida - UPPCB site_1541 \n",
+ " State City Station site_id\n",
+ "25 Bihar Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788\n",
+ "254 Maharashtra Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788\n",
+ " State City Station site_id\n",
+ "26 Bihar Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198\n",
+ "255 Maharashtra Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198\n",
+ " State City Station site_id\n",
+ "27 Bihar Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544\n",
+ "256 Maharashtra Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544\n",
+ " State City Station site_id\n",
+ "28 Bihar Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789\n",
+ "257 Maharashtra Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789\n",
+ " State City Station \\\n",
+ "499 Uttar Pradesh Greater Noida Knowledge Park - V, Greater Noida - UPPCB \n",
+ "526 Uttar Pradesh Noida Knowledge Park - V, Greater Noida - UPPCB \n",
+ "\n",
+ " site_id \n",
+ "499 site_5121 \n",
+ "526 site_5121 \n"
+ ]
+ }
+ ],
"source": [
- "## Once metadata is saved, run from here"
+ "# Show every site_id that is listed on more than one metadata row, one group per print\n",
+ "occurrences = metadata_df.site_id.value_counts()\n",
+ "for site_id in occurrences[occurrences > 1].index:\n",
+ "    print(metadata_df[metadata_df.site_id == site_id])"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "560"
+ "554"
]
},
- "execution_count": 23,
+ "execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "metadata_df = pd.read_csv('metadata.csv')\n",
+ "# clean up: keep each duplicated site_id once by dropping the listing named here.\n",
+ "# Each entry is (column, value, station): the row where metadata_df[column] == value for that station.\n",
+ "wrong_listings = [(\"State\", \"Bihar\", \"MIDC Chilkalthana, Aurangabad - MPCB\"),\n",
+ "                  (\"City\", \"Noida\", \"Knowledge Park - III, Greater Noida - UPPCB\"),\n",
+ "                  (\"State\", \"Bihar\", \"More Chowk Waluj, Aurangabad - MPCB\"),\n",
+ "                  (\"State\", \"Maharashtra\", \"Gurdeo Nagar, Aurangabad - BSPCB\"),\n",
+ "                  (\"State\", \"Bihar\", \"Rachnakar Colony, Aurangabad - MPCB\"),\n",
+ "                  (\"City\", \"Noida\", \"Knowledge Park - V, Greater Noida - UPPCB\")]\n",
+ "\n",
+ "# .index.item() raises unless exactly one row matches, so a stale entry fails loudly\n",
+ "drop_items = [metadata_df[(metadata_df[column] == value) & (metadata_df.Station == station)].index.item()\n",
+ "              for column, value, station in wrong_listings]\n",
+ "\n",
+ "metadata_df.drop(drop_items, inplace=True)\n",
+ "len(metadata_df)"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
- "# add download directory\n",
- "options = webdriver.ChromeOptions()\n",
- "options.add_experimental_option(\"prefs\", {\n",
- " \"download.default_directory\": \"/Users/project561/blog/cpcb_downloads\"\n",
- "})\n",
- "\n",
- "driver = webdriver.Chrome(options=options)\n",
- "driver.get(HOME_URL)"
+ "assert set(metadata_df.site_id.value_counts()) == {1}  # after the clean-up every site_id must occur exactly once"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 48,
"metadata": {},
+ "outputs": [],
"source": [
- "Enter Captcha manually before moving ahead"
+ "metadata_df.to_csv(\"metadata.csv\", index=False)  # saved without the index; the download section reads this file back"
]
},
{
- "cell_type": "code",
- "execution_count": 15,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "driver.get(DOWNLOAD_PAGE_URL)"
+ "## Downloading data"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 56,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "state='Andhra Pradesh', city='Amaravati', station='Secretariat, Amaravati - APPCB'\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "i = 0\n",
- "entry = metadata_df.loc[i]\n",
- "state = entry[\"State\"]\n",
- "city = entry[\"City\"]\n",
- "station = entry[\"Station\"]\n",
- "print(f\"{state=}, {city=}, {station=}\")"
+ "# Defaults reproduce the original PM2.5 + PM10 report for 2024; override them as per your needs\n",
+ "def get_url(state, city, site_id,\n",
+ "            from_date=\"01-01-2024 T00:00:00Z\", to_date=\"11-12-2024 T16:45:59Z\",\n",
+ "            parameters=((\"PM2.5\", \"parameter_193\"), (\"PM10\", \"parameter_215\"))):\n",
+ "    \"\"\"Build the CPCB report URL for one station.\n",
+ "\n",
+ "    The last URL segment carries the report query: a JSON object dumped compactly,\n",
+ "    dumped again as a JSON string, then percent-encoded twice. With the default\n",
+ "    arguments, and names made of letters, digits and spaces, the result is identical\n",
+ "    to the previously hard-coded URL; other characters (e.g. '&') are now escaped too.\n",
+ "\n",
+ "    Args:\n",
+ "        state: state name to put in the query.\n",
+ "        city: city name to put in the query.\n",
+ "        site_id: station id such as \"site_1406\".\n",
+ "        from_date: start of the report window, same format as the default.\n",
+ "        to_date: end of the report window, same format as the default.\n",
+ "        parameters: (display name, parameter id) pairs to request.\n",
+ "\n",
+ "    Returns:\n",
+ "        The full report URL as a string.\n",
+ "    \"\"\"\n",
+ "    import json\n",
+ "    from urllib.parse import quote\n",
+ "\n",
+ "    query = {\n",
+ "        \"parameter_list\": [{\"id\": position, \"itemName\": name, \"itemValue\": code}\n",
+ "                           for position, (name, code) in enumerate(parameters)],\n",
+ "        \"criteria\": \"24 Hours\",\n",
+ "        \"reportFormat\": \"Tabular\",\n",
+ "        \"fromDate\": from_date,\n",
+ "        \"toDate\": to_date,\n",
+ "        \"state\": state,\n",
+ "        \"city\": city,\n",
+ "        \"station\": site_id,\n",
+ "        \"parameter\": [code for _, code in parameters],\n",
+ "        \"parameterNames\": [name for name, _ in parameters],\n",
+ "    }\n",
+ "    # Compact separators (no spaces) are needed to match the original URL byte-for-byte\n",
+ "    inner = json.dumps(query, separators=(\",\", \":\"))\n",
+ "    # safe=\"\" so that every reserved character is escaped on both passes\n",
+ "    encoded = quote(quote(json.dumps(inner), safe=\"\"), safe=\"\")\n",
+ "    return f\"https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-view-data-report/{encoded}\""
]
},
{
"cell_type": "code",
- "execution_count": 224,
+ "execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
- "# State selection\n",
- "dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- "dropdowns[0].click() # Open\n",
- "# select state\n",
- "option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{state}')]\")\n",
- "option.click() # Select and Close\n",
- "\n",
- "sleep(0.1)\n",
- "\n",
- "# City selection\n",
- "dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- "dropdowns[1].click() # Open\n",
- "\n",
- "sleep(0.1)\n",
- "\n",
- "# select city\n",
- "option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{city}')]\")\n",
- "option.click() # Select and Close\n",
- "\n",
- "sleep(0.1)\n",
- "\n",
- "# Station selection\n",
- "dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- "dropdowns[2].click() # Open\n",
- "\n",
- "sleep(0.1)\n",
+ "# Send Chrome's downloads to a dedicated folder instead of the browser default.\n",
+ "# NOTE(review): this absolute path is machine-specific and is repeated in the glob of the\n",
+ "# download-loop cell; keep the two in sync when changing it.\n",
+ "options = webdriver.ChromeOptions()\n",
+ "options.add_experimental_option(\"prefs\", {\n",
+ " \"download.default_directory\": \"/Users/project561/cpcb_downloads\"\n",
+ "})\n",
"\n",
- "# select station\n",
- "option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{station}')]\")\n",
- "option.click() # Select and Close"
+ "# Start Chrome with the download preferences above and open the dashboard landing page\n",
+ "driver = webdriver.Chrome(options=options)\n",
+ "driver.get(HOME_URL)"
]
},
{
- "cell_type": "code",
- "execution_count": 225,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "multi_select = driver.find_element(By.XPATH, \"//angular2-multiselect//div[@class='c-btn']\")\n",
- "multi_select.click() # Open\n",
- "sleep(0.1)\n",
- "\n",
- "pm25_checkbox = driver.find_element(By.XPATH, \"//label[text()='PM2.5']/preceding-sibling::input\")\n",
- "pm10_checkbox = driver.find_element(By.XPATH, \"//label[text()='PM10']/preceding-sibling::input\")\n",
- "actions = ActionChains(driver)\n",
- "if not pm25_checkbox.is_selected():\n",
- " actions.move_to_element(pm25_checkbox).click().perform()\n",
- " sleep(0.1)\n",
- "if not pm10_checkbox.is_selected():\n",
- " actions.move_to_element(pm10_checkbox).click().perform()\n",
- " sleep(0.1)\n",
- "\n",
- "multi_select.click() # Close"
+ "Enter the captcha manually in the Chrome window that just opened before running the cells below."
]
},
{
"cell_type": "code",
- "execution_count": 226,
+ "execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " State | \n",
+ " City | \n",
+ " Station | \n",
+ " site_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Andhra Pradesh | \n",
+ " Amaravati | \n",
+ " Secretariat, Amaravati - APPCB | \n",
+ " site_1406 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Andhra Pradesh | \n",
+ " Anantapur | \n",
+ " Gulzarpet, Anantapur - APPCB | \n",
+ " site_5632 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "5"
+ " State City Station site_id\n",
+ "0 Andhra Pradesh Amaravati Secretariat, Amaravati - APPCB site_1406\n",
+ "1 Andhra Pradesh Anantapur Gulzarpet, Anantapur - APPCB site_5632"
]
},
- "execution_count": 226,
+ "execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- "len(dropdowns)"
+ "# Reload the cleaned metadata from the CSV so the download steps can start from this cell\n",
+ "metadata_df = pd.read_csv(\"metadata.csv\")\n",
+ "metadata_df.head(2)"
]
},
{
"cell_type": "code",
- "execution_count": 227,
- "metadata": {},
- "outputs": [],
- "source": [
- "dropdowns[4].click() # Open\n",
- "option = driver.find_element(\"xpath\", \"//li[contains(text(), '15 Minute')]\")\n",
- "option.click() # Select and Close"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 228,
+ "execution_count": 128,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Already in December\n"
- ]
- }
- ],
- "source": [
- "date_pickers = driver.find_elements(By.CSS_SELECTOR, \".wc-date-container\")\n",
- "assert len(date_pickers) == 2, len(date_pickers)\n",
- "\n",
- "# Select start date\n",
- "date_pickers[0].click()\n",
- "sleep(0.5)\n",
- "desired_month = driver.find_element(By.CLASS_NAME, \"month-year\")\n",
- "desired_month.click()\n",
- "sleep(0.5)\n",
- "option = driver.find_element(By.ID, \"JAN\")\n",
- "option.click()\n",
- "sleep(0.5)\n",
- "try:\n",
- " desired_date = driver.find_element(By.XPATH, \"//td[@class='calendar-day']/span[text()='1']\")\n",
- " driver.execute_script(\"arguments[0].click();\", desired_date)\n",
- "except:\n",
- " print(\"Already selected\")\n",
- "\n",
- "date_pickers = driver.find_elements(By.CSS_SELECTOR, \".wc-date-container\")\n",
- "assert len(date_pickers) == 2, len(date_pickers)\n",
- "\n",
- "# Select end date\n",
- "date_pickers[1].click()\n",
- "sleep(0.5)\n",
- "desired_months = driver.find_elements(By.CLASS_NAME, \"month-year\")\n",
- "desired_months[1].click()\n",
- "sleep(0.5)\n",
- "\n",
- "try:\n",
- " option = driver.find_element(By.ID, \"DEC\")\n",
- " option.click()\n",
- "except:\n",
- " print(\"Already in December\")\n",
- "\n",
- "try:\n",
- " desired_date = driver.find_element(By.XPATH, \"//td[@class='calendar-day']/span[text()='1']\")\n",
- " driver.execute_script(\"arguments[0].click();\", desired_date)\n",
- "except:\n",
- " print(\"Already selected\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 229,
- "metadata": {},
- "outputs": [],
- "source": [
- "# click on submit\n",
- "submit_button = driver.find_element(By.XPATH, \"//button[text()='Submit']\")\n",
- "submit_button.click()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 231,
- "metadata": {},
- "outputs": [],
- "source": [
- "excel_button = WebDriverWait(driver, 20).until(\n",
- " EC.element_to_be_clickable((By.CLASS_NAME, \"fa-file-excel-o\"))\n",
- ")\n",
- "excel_button.click()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Full loop"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Downloading data for entry 44\n",
- "state='Bihar', city='Motihari', station='Gandak Colony, Motihari - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 45\n",
- "state='Bihar', city='Munger', station='Town Hall, Munger - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 46\n",
- "state='Bihar', city='Muzaffarpur', station='Buddha Colony, Muzaffarpur - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 47\n",
- "state='Bihar', city='Muzaffarpur', station='MIT-Daudpur Kothi, Muzaffarpur - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 48\n",
- "state='Bihar', city='Muzaffarpur', station='Muzaffarpur Collectorate, Muzaffarpur - BSPCB'\n",
- "PM10 not available\n",
- "Already in December\n",
- "Downloading data for entry 49\n",
- "state='Bihar', city='Patna', station='DRM Office Danapur, Patna - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 50\n",
- "state='Bihar', city='Patna', station='Govt. High School Shikarpur, Patna - BSPCB'\n",
- "Already in December\n",
- "Downloading data for entry 51\n",
- "state='Bihar', city='Patna', station='IGSC Planetarium Complex, Patna - BSPCB'\n"
+ "Number of files in the download directory: 204\n",
+ "Downloading 203 Karnataka Ramanagara Vijay Nagar, Ramanagara - KSPCB site_5255\n",
+ "Downloading 204 Karnataka Shivamogga Vinoba Nagara, Shivamogga - KSPCB site_5266\n",
+ "Downloading 205 Karnataka Tumakuru Thimmalapura, Tumakuru - KSPCB site_5682\n",
+ "Downloading 206 Karnataka Udupi Brahmagiri, Udupi - KSPCB site_5433\n",
+ "Downloading 207 Karnataka Vijayapura Ibrahimpur, Vijayapura - KSPCB site_5267\n",
+ "Downloading 208 Karnataka Yadgir Collector Office, Yadgir - KSPCB site_5254\n",
+ "Downloading 209 Kerala Eloor Udyogamandal, Eloor - Kerala PCB site_5105\n",
+ "Downloading 210 Kerala Ernakulam Kacheripady, Ernakulam - Kerala PCB site_5272\n",
+ "Downloading 211 Kerala Kannur Thavakkara, Kannur - Kerala PCB site_5276\n",
+ "Downloading 212 Kerala Kochi Vyttila, Kochi - Kerala PCB site_5270\n",
+ "Downloading 213 Kerala Kollam Polayathode, Kollam - Kerala PCB site_5334\n",
+ "Downloading 214 Kerala Kozhikode Palayam, Kozhikode - Kerala PCB site_5271\n",
+ "Downloading 215 Kerala Thiruvananthapuram Plammoodu, Thiruvananthapuram - Kerala PCB site_252\n",
+ "Downloading 216 Kerala Thiruvananthapuram Kariavattom, Thiruvananthapuram - Kerala PCB site_5331\n",
+ "Downloading 217 Kerala Thrissur Corporation Ground, Thrissur - Kerala PCB site_5390\n",
+ "Downloading 218 Madhya Pradesh Bhopal Idgah Hills, Bhopal - MPPCB site_5708\n",
+ "Downloading 219 Madhya Pradesh Bhopal T T Nagar, Bhopal - MPPCB site_5247\n",
+ "Downloading 220 Madhya Pradesh Bhopal Paryavaran Parisar, Bhopal - MPPCB site_5650\n",
+ "Downloading 221 Madhya Pradesh Damoh Shrivastav Colony, Damoh - MPPCB site_5040\n",
+ "Downloading 222 Madhya Pradesh Dewas Bhopal Chauraha, Dewas - MPPCB site_1404\n",
+ "Downloading 223 Madhya Pradesh Gwalior Maharaj Bada, Gwalior - MPPCB site_5661\n",
+ "Downloading 224 Madhya Pradesh Gwalior City Center, Gwalior - MPPCB site_5273\n",
+ "Downloading 225 Madhya Pradesh Gwalior Phool Bagh, Gwalior - Mondelez Ind. Food site_5275\n",
+ "Downloading 226 Madhya Pradesh Gwalior Deen Dayal Nagar, Gwalior - MPPCB site_5667\n",
+ "Downloading 227 Madhya Pradesh Indore Maguda Nagar, Indore - IMC site_5856\n",
+ "Downloading 228 Madhya Pradesh Indore Chhoti Gwaltoli, Indore - MPPCB site_5248\n",
+ "Downloading 229 Madhya Pradesh Indore Residency Area, Indore - IMC site_5854\n",
+ "Downloading 230 Madhya Pradesh Indore Vijay Nagar Scheme-78, Indore - Glenmark site_5709\n",
+ "Downloading 231 Madhya Pradesh Indore Regional Park, Indore - IMC site_5855\n",
+ "Downloading 232 Madhya Pradesh Indore Airport Area, Indore - IMC site_5857\n",
+ "Downloading 233 Madhya Pradesh Jabalpur Suhagi, Jabalpur - JMC site_5816\n",
+ "Downloading 234 Madhya Pradesh Jabalpur Gupteshwar, Jabalpur - JMC site_5817\n",
+ "Downloading 235 Madhya Pradesh Jabalpur Govindh Bhavan Colony, Jabalpur - JMC site_5965\n",
+ "Downloading 236 Madhya Pradesh Jabalpur Marhatal, Jabalpur - MPPCB site_5249\n",
+ "Downloading 237 Madhya Pradesh Katni Gole Bazar, Katni - MPPCB site_5250\n",
+ "Downloading 238 Madhya Pradesh Maihar Sahilara, Maihar - KJS Cements site_5068\n",
+ "Downloading 239 Madhya Pradesh Mandideep Sector-D Industrial Area, Mandideep - MPPCB site_1403\n",
+ "Downloading 240 Madhya Pradesh Pithampur Sector-2 Industrial Area, Pithampur - MPPCB site_1402\n",
+ "Downloading 241 Madhya Pradesh Ratlam Shasthri Nagar, Ratlam - IPCA Lab site_5042\n",
+ "Downloading 242 Madhya Pradesh Sagar Deen Dayal Nagar, Sagar - MPPCB site_5269\n",
+ "Downloading 243 Madhya Pradesh Sagar Civil Lines, Sagar - MPPCB site_5662\n",
+ "Downloading 244 Madhya Pradesh Satna Bandhavgar Colony, Satna - Birla Cement site_1433\n",
+ "Downloading 245 Madhya Pradesh Singrauli Suryakiran Bhawan NCL, Singrauli - MPPCB site_1401\n",
+ "Downloading 246 Madhya Pradesh Ujjain Mahakaleshwar Temple, Ujjain - MPPCB site_1400\n",
+ "Downloading 247 Maharashtra Ahmednagar Tarakpur, Ahmednagar - MPCB site_5785\n",
+ "Downloading 248 Maharashtra Akola Ramdaspeth, Akola - MPCB site_5784\n",
+ "Downloading 249 Maharashtra Amravati Shivneri Colony, Amravati - MPCB site_5786\n",
+ "Downloading 250 Maharashtra Amravati Shri Shivaji Science College, Amaravati - MPCB site_5787\n",
+ "Downloading 251 Maharashtra Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788\n",
+ "Downloading 252 Maharashtra Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198\n",
+ "Downloading 253 Maharashtra Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789\n"
]
},
{
"ename": "TimeoutException",
- "evalue": "Message: \n",
+ "evalue": "Message: \nStacktrace:\n0 chromedriver 0x0000000104d9baf0 cxxbridge1$str$ptr + 3651580\n1 chromedriver 0x0000000104d94340 cxxbridge1$str$ptr + 3620940\n2 chromedriver 0x00000001047fc4b4 cxxbridge1$string$len + 89224\n3 chromedriver 0x0000000104840898 cxxbridge1$string$len + 368748\n4 chromedriver 0x000000010487a0fc cxxbridge1$string$len + 604368\n5 chromedriver 0x00000001048350b0 cxxbridge1$string$len + 321668\n6 chromedriver 0x0000000104835d00 cxxbridge1$string$len + 324820\n7 chromedriver 0x0000000104d66e34 cxxbridge1$str$ptr + 3435328\n8 chromedriver 0x0000000104d6a14c cxxbridge1$str$ptr + 3448408\n9 chromedriver 0x0000000104d4e1a8 cxxbridge1$str$ptr + 3333812\n10 chromedriver 0x0000000104d6aa0c cxxbridge1$str$ptr + 3450648\n11 chromedriver 0x0000000104d3f9b4 cxxbridge1$str$ptr + 3274432\n12 chromedriver 0x0000000104d85120 cxxbridge1$str$ptr + 3558956\n13 chromedriver 0x0000000104d8529c cxxbridge1$str$ptr + 3559336\n14 chromedriver 0x0000000104d93fb4 cxxbridge1$str$ptr + 3620032\n15 libsystem_pthread.dylib 0x0000000195eb9f94 _pthread_start + 136\n16 libsystem_pthread.dylib 0x0000000195eb4d34 thread_start + 8\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTimeoutException\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[29], line 130\u001b[0m\n\u001b[1;32m 128\u001b[0m driver\u001b[38;5;241m.\u001b[39mexecute_script(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwindow.open(\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m);\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 129\u001b[0m driver\u001b[38;5;241m.\u001b[39mswitch_to\u001b[38;5;241m.\u001b[39mwindow(driver\u001b[38;5;241m.\u001b[39mwindow_handles[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m--> 130\u001b[0m \u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n",
- "Cell \u001b[0;32mIn[29], line 11\u001b[0m, in \u001b[0;36mdownload\u001b[0;34m(i)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstate\u001b[38;5;132;01m=}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcity\u001b[38;5;132;01m=}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstation\u001b[38;5;132;01m=}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# State selection\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m \u001b[43mWebDriverWait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdriver\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m60\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muntil\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mEC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpresence_of_all_elements_located\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCLASS_NAME\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mselect-box\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m dropdowns \u001b[38;5;241m=\u001b[39m driver\u001b[38;5;241m.\u001b[39mfind_elements(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcss selector\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.select-box\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 15\u001b[0m dropdowns[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mclick() \u001b[38;5;66;03m# Open\u001b[39;00m\n",
+ "Cell \u001b[0;32mIn[128], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m driver\u001b[38;5;241m.\u001b[39mswitch_to\u001b[38;5;241m.\u001b[39mwindow(driver\u001b[38;5;241m.\u001b[39mwindow_handles[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 21\u001b[0m driver\u001b[38;5;241m.\u001b[39mget(url)\n\u001b[0;32m---> 22\u001b[0m excel_button \u001b[38;5;241m=\u001b[39m \u001b[43mWebDriverWait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdriver\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m20\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muntil\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m\u001b[49m\u001b[43mEC\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43melement_to_be_clickable\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCLASS_NAME\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfa-file-excel-o\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m click_it(driver, excel_button)\n\u001b[1;32m 25\u001b[0m sleep(\u001b[38;5;241m1\u001b[39m)\n",
"File \u001b[0;32m/opt/miniconda3/lib/python3.12/site-packages/selenium/webdriver/support/wait.py:105\u001b[0m, in \u001b[0;36mWebDriverWait.until\u001b[0;34m(self, method, message)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time\u001b[38;5;241m.\u001b[39mmonotonic() \u001b[38;5;241m>\u001b[39m end_time:\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m--> 105\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m TimeoutException(message, screen, stacktrace)\n",
- "\u001b[0;31mTimeoutException\u001b[0m: Message: \n"
+ "\u001b[0;31mTimeoutException\u001b[0m: Message: \nStacktrace:\n0 chromedriver 0x0000000104d9baf0 cxxbridge1$str$ptr + 3651580\n1 chromedriver 0x0000000104d94340 cxxbridge1$str$ptr + 3620940\n2 chromedriver 0x00000001047fc4b4 cxxbridge1$string$len + 89224\n3 chromedriver 0x0000000104840898 cxxbridge1$string$len + 368748\n4 chromedriver 0x000000010487a0fc cxxbridge1$string$len + 604368\n5 chromedriver 0x00000001048350b0 cxxbridge1$string$len + 321668\n6 chromedriver 0x0000000104835d00 cxxbridge1$string$len + 324820\n7 chromedriver 0x0000000104d66e34 cxxbridge1$str$ptr + 3435328\n8 chromedriver 0x0000000104d6a14c cxxbridge1$str$ptr + 3448408\n9 chromedriver 0x0000000104d4e1a8 cxxbridge1$str$ptr + 3333812\n10 chromedriver 0x0000000104d6aa0c cxxbridge1$str$ptr + 3450648\n11 chromedriver 0x0000000104d3f9b4 cxxbridge1$str$ptr + 3274432\n12 chromedriver 0x0000000104d85120 cxxbridge1$str$ptr + 3558956\n13 chromedriver 0x0000000104d8529c cxxbridge1$str$ptr + 3559336\n14 chromedriver 0x0000000104d93fb4 cxxbridge1$str$ptr + 3620032\n15 libsystem_pthread.dylib 0x0000000195eb9f94 _pthread_start + 136\n16 libsystem_pthread.dylib 0x0000000195eb4d34 thread_start + 8\n"
]
}
],
"source": [
- "def download(i):\n",
- " print(\"Downloading data for entry\", i)\n",
- " driver.get(DOWNLOAD_PAGE_URL)\n",
- " entry = metadata_df.loc[i]\n",
- " state = entry[\"State\"]\n",
- " city = entry[\"City\"]\n",
- " station = entry[\"Station\"]\n",
- " print(f\"{state=}, {city=}, {station=}\")\n",
- "\n",
- " # State selection\n",
- " WebDriverWait(driver, 60).until(\n",
- " EC.presence_of_all_elements_located((By.CLASS_NAME, \"select-box\"))\n",
- " )\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[0].click() # Open\n",
- " # select state\n",
- " option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{state}')]\")\n",
- " option.click() # Select and Close\n",
- "\n",
- " sleep(0.1)\n",
- "\n",
- " # City selection\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[1].click() # Open\n",
- "\n",
- " sleep(0.1)\n",
- "\n",
- " # select city\n",
- " option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{city}')]\")\n",
- " option.click() # Select and Close\n",
- "\n",
- " sleep(0.1)\n",
- "\n",
- " # Station selection\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " dropdowns[2].click() # Open\n",
- "\n",
- " sleep(0.1)\n",
- "\n",
- " # select station\n",
- " option = driver.find_element(\"xpath\", f\"//li[contains(text(), '{station}')]\")\n",
- " option.click() # Select and Close\n",
- "\n",
- " multi_select = driver.find_element(By.XPATH, \"//angular2-multiselect//div[@class='c-btn']\")\n",
- " multi_select.click() # Open\n",
- " sleep(0.1)\n",
- "\n",
- " actions = ActionChains(driver)\n",
- " try:\n",
- " pm10_checkbox = driver.find_element(By.XPATH, \"//label[text()='PM10']/preceding-sibling::input\")\n",
- " if not pm10_checkbox.is_selected():\n",
- " actions.move_to_element(pm10_checkbox).click().perform()\n",
- " sleep(0.1)\n",
- " except:\n",
- " print(\"PM10 not available\")\n",
- " \n",
- " try:\n",
- " pm25_checkbox = driver.find_element(By.XPATH, \"//label[text()='PM2.5']/preceding-sibling::input\")\n",
- " if not pm25_checkbox.is_selected():\n",
- " actions.move_to_element(pm25_checkbox).click().perform()\n",
- " sleep(0.1)\n",
- " except:\n",
- " print(\"PM2.5 not available\") \n",
- "\n",
- " multi_select.click() # Close\n",
- "\n",
- " dropdowns = driver.find_elements(\"css selector\", \".select-box\")\n",
- " len(dropdowns)\n",
- "\n",
- " dropdowns[4].click() # Open\n",
- " option = driver.find_element(\"xpath\", \"//li[contains(text(), '15 Minute')]\")\n",
- " option.click() # Select and Close\n",
- "\n",
- " date_pickers = driver.find_elements(By.CSS_SELECTOR, \".wc-date-container\")\n",
- " assert len(date_pickers) == 2, len(date_pickers)\n",
- " \n",
- " # Select start date\n",
- " date_pickers[0].click()\n",
- " sleep(0.1)\n",
- " desired_month = driver.find_element(By.CLASS_NAME, \"month-year\")\n",
- " desired_month.click()\n",
- " sleep(0.1)\n",
- " try:\n",
- " option = driver.find_element(By.ID, \"JAN\")\n",
- " option.click()\n",
- " except:\n",
- " print(\"Already in January\")\n",
- " sleep(0.1)\n",
- " try:\n",
- " desired_date = driver.find_element(By.XPATH, \"//td[@class='calendar-day']/span[text()='1']\")\n",
- " driver.execute_script(\"arguments[0].click();\", desired_date)\n",
- " except:\n",
- " print(\"Already 1st selected\")\n",
- " \n",
- " date_pickers = driver.find_elements(By.CSS_SELECTOR, \".wc-date-container\")\n",
- " assert len(date_pickers) == 2, len(date_pickers)\n",
+ "from selenium.common.exceptions import TimeoutException\n",
+ "\n",
+ "# Stop once this many stations in a row time out: that points to a systematic failure\n",
+ "# rather than one bad station, and every further attempt would cost another 20 s wait.\n",
+ "MAX_CONSECUTIVE_TIMEOUTS = 3\n",
+ "\n",
+ "files = glob(\"/Users/project561/cpcb_downloads/*.xlsx\")\n",
+ "print(\"Number of files in the download directory:\", len(files))\n",
+ "# Recover the already-downloaded site ids from the file names: the lazy \\d+? stops at the\n",
+ "# first \"2024\" after the id and [:-4] strips that suffix again. Non-matching files are ignored.\n",
+ "matches = (re.search(r\"site_\\d+?2024\", file) for file in files)\n",
+ "site_ids = {match.group()[:-4] for match in matches if match}\n",
+ "\n",
+ "timed_out = []  # site ids whose report page never offered the Excel button\n",
+ "consecutive_timeouts = 0\n",
+ "\n",
+ "for i in range(len(metadata_df)):\n",
+ "    state, city, station, site_id = metadata_df.iloc[i]\n",
+ "    if site_id in site_ids:\n",
+ "        continue  # a file for this station is already in the download directory\n",
+ "    print(\"Downloading\", i, state, city, station, site_id)\n",
+ "    url = get_url(state, city, site_id)\n",
" \n",
- " # Select end date\n",
- " date_pickers[1].click()\n",
- " sleep(0.1)\n",
- " desired_months = driver.find_elements(By.CLASS_NAME, \"month-year\")\n",
- " desired_months[1].click()\n",
- " sleep(0.1)\n",
- " try:\n",
- " option = driver.find_element(By.ID, \"DEC\")\n",
- " option.click()\n",
- " except:\n",
- " print(\"Already in December\")\n",
- " \n",
- " try:\n",
- " desired_date = driver.find_element(By.XPATH, \"//td[@class='calendar-day']/span[text()='1']\")\n",
- " driver.execute_script(\"arguments[0].click();\", desired_date)\n",
- " except:\n",
- " print(\"Already 1st selected\")\n",
- " \n",
- " # click on submit\n",
- " submit_button = driver.find_element(By.XPATH, \"//button[text()='Submit']\")\n",
- " submit_button.click()\n",
- " \n",
- " excel_button = WebDriverWait(driver, 20).until(\n",
- " EC.element_to_be_clickable((By.CLASS_NAME, \"fa-file-excel-o\"))\n",
- " )\n",
- " excel_button.click()\n",
- " sleep(10)\n",
- " \n",
- "for i in range(44, len(metadata_df)):\n",
- " # open a new tab\n",
+ "    # open new tab\n",
"    driver.execute_script(\"window.open('');\")\n",
"    driver.switch_to.window(driver.window_handles[-1])\n",
- " download(i)"
+ "    driver.get(url)\n",
+ "    try:\n",
+ "        excel_button = WebDriverWait(driver, 20).until(\n",
+ "            EC.element_to_be_clickable((By.CLASS_NAME, \"fa-file-excel-o\")))\n",
+ "    except TimeoutException:\n",
+ "        # Skip this station instead of killing the whole run; it has no file yet,\n",
+ "        # so re-running the cell retries it.\n",
+ "        print(\"Timed out, skipping\", i, site_id)\n",
+ "        timed_out.append(site_id)\n",
+ "        consecutive_timeouts += 1\n",
+ "        if consecutive_timeouts >= MAX_CONSECUTIVE_TIMEOUTS:\n",
+ "            print(\"Stopping:\", consecutive_timeouts, \"timeouts in a row\")\n",
+ "            break\n",
+ "    else:\n",
+ "        consecutive_timeouts = 0\n",
+ "        click_it(driver, excel_button)\n",
+ "        sleep(1)\n",
+ "\n",
+ "    if len(driver.window_handles) > 10:\n",
+ "        # close first 9 windows\n",
+ "        for _ in range(9):\n",
+ "            driver.switch_to.window(driver.window_handles[0])\n",
+ "            driver.close()\n",
+ "\n",
+ "        # the window we were driving has just been closed; continue from the newest tab\n",
+ "        driver.switch_to.window(driver.window_handles[-1])\n",
+ "        sleep(1)\n",
+ "\n",
+ "print(\"Timed out stations:\", timed_out)"
]
}
],