From 6b3db6c7c863f0ce6501e6d234e49318af9a1cad Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Thu, 22 Feb 2024 14:25:54 +0100 Subject: [PATCH] add new integration for scrapegraphai --- .gitignore | 2 ++ main.py | 67 +++++++++++++++++++++++++++++++++++------------- requirements.txt | 1 + task.py | 19 ++++++++++++++ 4 files changed, 71 insertions(+), 18 deletions(-) create mode 100644 .gitignore create mode 100644 task.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bf1a22f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ + +__pycache__/task.cpython-311.pyc diff --git a/main.py b/main.py index bed1b2c..3ea6a8c 100644 --- a/main.py +++ b/main.py @@ -1,31 +1,62 @@ import base64 import streamlit as st +import json +import pandas as pd +from task import task with st.sidebar: - st.write("**Usage**") - st.write("Add the api key") - st.write("Example of prompt:") - st.write("- Given a link scrape the website") - st.write("- Given a link scrape the number of stars on github") + st.write("# Usage Examples") + st.write("## Prompt 1") + st.write("- Give me all the news with their abstracts") + st.write("## Prompt 2") + st.write("- Create a voice summary of the webpage") + st.write("## Prompt 3") + st.write("- List me all the images with their visual description") st.title("Scrapegraph-ai") -left_co, cent_co,last_co = st.columns(3) +left_co, cent_co, last_co = st.columns(3) with cent_co: st.image("assets/scrapegraphai_logo.png") +key = st.text_input("API key", type="password") +model = st.radio( + "Select the model", + ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-4"], + index=None, +) -key = st.text_input("API key") +link_to_scrape = st.text_input("Link to scrape") +prompt = st.text_input("Write the prompt") -link = st.text_input("Link to scrape") +if st.button("Run the program", type="primary"): + if not key or not model or not link_to_scrape or not prompt: + st.error("Please fill in all fields.") + else: + + st.write("Scraping phase started ...") + 
result = task(key, link_to_scrape, prompt, model) + st.write(result) -link = st.text_input("Write the prompt") + if result: + json_str = json.dumps(result, indent=4) + df = pd.DataFrame(result) -if st.button("Run th program", type="primary"): - st.write('DO something') -else: - st.write('') + st.download_button( + label="Download JSON", + data=json_str, + file_name="scraped_data.json", + mime="application/json" + ) -left_co2,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,cent_co2,last_co2 = st.columns([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]) + csv = df.to_csv(index=False) + st.download_button( + label="Download CSV", + data=csv, + file_name="scraped_data.csv", + mime="text/csv" + ) + +left_co2, *_, cent_co2, last_co2 = st.columns([1]*18) with cent_co2: discord_link = "https://discord.gg/DujC7HG8" @@ -38,11 +69,11 @@ ) with last_co2: - discord_link = "https://github.com/VinciGit00/Scrapegraph-ai" - discord_logo = base64.b64encode(open("assets/github.png", "rb").read()).decode() + github_link = "https://github.com/VinciGit00/Scrapegraph-ai" + github_logo = base64.b64encode(open("assets/github.png", "rb").read()).decode() st.markdown( - f""" - + f""" + """, unsafe_allow_html=True, ) diff --git a/requirements.txt b/requirements.txt index e801de4..71f968b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ +scrapegraphai==0.0.4 streamlit==1.26.0 diff --git a/task.py b/task.py new file mode 100644 index 0000000..acdf9a0 --- /dev/null +++ b/task.py @@ -0,0 +1,19 @@ +from scrapegraphai.graphs import SmartScraperGraph + +def task(key:str, url:str, prompt:str, model:str): + """ + Task that executes the scraping: + - key (str): API key of the model + - url (str): url to scrape + - prompt (str): prompt passed to the scraper graph + - model (str): name of the model + """ + openai_key = key + llm_config = { + "api_key": openai_key, + "model_name": model, + } + + smart_scraper_graph = SmartScraperGraph(prompt, url, llm_config) + + return smart_scraper_graph.run()