Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retrieve Full-Texts from Sinequa Dev Servers #1077

Merged
merged 21 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions .envs/.local/.django
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,17 @@ SINEQUA_CONFIGS_REPO_WEBAPP_PR_BRANCH='dummy_branch'
# Slack Webhook
# ------------------------------------------------------------------------------
SLACK_WEBHOOK_URL=''
LRM_USER=''
LRM_PASSWORD=''

# Server Credentials
# ------------------------------------------------------------------------------
LRM_DEV_USER=''
LRM_DEV_PASSWORD=''
XLI_USER=''
XLI_PASSWORD=''
LRM_QA_USER=''
LRM_QA_PASSWORD=''

# Server Tokens
# ------------------------------------------------------------------------------
LRM_DEV_TOKEN=''
XLI_TOKEN=''
6 changes: 4 additions & 2 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,9 @@
SLACK_WEBHOOK_URL = env("SLACK_WEBHOOK_URL")
XLI_USER = env("XLI_USER")
XLI_PASSWORD = env("XLI_PASSWORD")
LRM_USER = env("LRM_USER")
LRM_PASSWORD = env("LRM_PASSWORD")
LRM_DEV_USER = env("LRM_DEV_USER")
LRM_DEV_PASSWORD = env("LRM_DEV_PASSWORD")
LRM_QA_USER = env("LRM_QA_USER")
LRM_QA_PASSWORD = env("LRM_QA_PASSWORD")
LRM_DEV_TOKEN = env("LRM_DEV_TOKEN")
XLI_TOKEN = env("XLI_TOKEN")
35 changes: 26 additions & 9 deletions sde_collections/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,23 @@
from .models.candidate_url import CandidateURL, ResolvedTitle
from .models.collection import Collection, WorkflowHistory
from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
from .tasks import import_candidate_urls_from_api
from .tasks import fetch_and_update_full_text, import_candidate_urls_from_api


def fetch_and_update_text_for_server(modeladmin, request, queryset, server_name):
    """Queue a full-text fetch task for every selected collection and notify the admin user.

    Args:
        modeladmin: The ModelAdmin the action was triggered from; used to send the message.
        request: The current admin request.
        queryset: The collections selected in the admin changelist.
        server_name: Key of the server to pull from (e.g. "lrm_dev" or "xli").
    """
    for selected in queryset:
        # .delay() hands the work to the task queue instead of running it in the request.
        fetch_and_update_full_text.delay(selected.id, server_name)

    notice = f"Started importing URLs from {server_name.upper()} Server"
    modeladmin.message_user(request, notice)


@admin.action(description="Import candidate URLs from LRM Dev Server with Full Text")
def fetch_full_text_lrm_dev_action(modeladmin, request, queryset):
    """Admin action: start full-text imports from the LRM Dev server for the selected collections."""
    target_server = "lrm_dev"
    fetch_and_update_text_for_server(modeladmin, request, queryset, target_server)


@admin.action(description="Import candidate URLs from XLI Server with Full Text")
def fetch_full_text_lis_action(modeladmin, request, queryset):
    """Admin action: start full-text imports from the XLI server for the selected collections."""
    target_server = "xli"
    fetch_and_update_text_for_server(modeladmin, request, queryset, target_server)


@admin.action(description="Generate deployment message")
Expand Down Expand Up @@ -105,11 +121,10 @@ def import_candidate_urls_from_api_caller(modeladmin, request, queryset, server_
collection_ids=list(queryset.values_list("id", flat=True)),
server_name=server_name,
)
collection_names = ", ".join(queryset.values_list("name", flat=True))
messages.add_message(
request,
messages.INFO,
f"Started importing URLs from the API for: {collection_names} from {server_name.title()}",
f"Started importing URLs from {server_name.upper()} Server",
CarsonDavis marked this conversation as resolved.
Show resolved Hide resolved
)


Expand All @@ -133,19 +148,19 @@ def import_candidate_urls_secret_production(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "secret_production")


@admin.action(description="Import candidate URLs from Li's Server")
def import_candidate_urls_lis_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lis_server")
@admin.action(description="Import candidate URLs from XLI Server")
def import_candidate_urls_xli_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "xli")


@admin.action(description="Import candidate URLs from LRM Dev Server")
def import_candidate_urls_lrm_dev_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev_server")
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_dev")


@admin.action(description="Import candidate URLs from LRM QA Server")
def import_candidate_urls_lrm_qa_server(modeladmin, request, queryset):
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa_server")
import_candidate_urls_from_api_caller(modeladmin, request, queryset, "lrm_qa")


class ExportCsvMixin:
Expand Down Expand Up @@ -236,9 +251,11 @@ class CollectionAdmin(admin.ModelAdmin, ExportCsvMixin, UpdateConfigMixin):
import_candidate_urls_production,
import_candidate_urls_secret_test,
import_candidate_urls_secret_production,
import_candidate_urls_lis_server,
import_candidate_urls_xli_server,
import_candidate_urls_lrm_dev_server,
import_candidate_urls_lrm_qa_server,
fetch_full_text_lrm_dev_action,
fetch_full_text_lis_action,
]
ordering = ("cleaning_order",)

Expand Down
18 changes: 18 additions & 0 deletions sde_collections/migrations/0059_candidateurl_scraped_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.9 on 2024-10-21 23:10

from django.db import migrations, models


# Schema migration: adds the `scraped_text` text field to the `candidateurl` model.
class Migration(migrations.Migration):

# Must be applied after migration 0058 of the same app.
dependencies = [
("sde_collections", "0058_candidateurl_division_collection_is_multi_division_and_more"),
]

# The new field is created with blank=True and null=True, i.e. it may be left empty.
operations = [
migrations.AddField(
model_name="candidateurl",
name="scraped_text",
field=models.TextField(blank=True, null=True),
),
]
24 changes: 24 additions & 0 deletions sde_collections/migrations/0060_alter_candidateurl_scraped_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.9 on 2024-11-07 17:34

from django.db import migrations, models


# Schema migration: redefines `candidateurl.scraped_text` with a default, help text and verbose name.
class Migration(migrations.Migration):

# Must be applied after migration 0059, which created the field.
dependencies = [
("sde_collections", "0059_candidateurl_scraped_text"),
]

# The field stays blank=True / null=True; this step adds default="", help_text and verbose_name.
# NOTE(review): null=True together with default="" leaves two "empty" values (NULL and "") — confirm intended.
operations = [
migrations.AlterField(
model_name="candidateurl",
name="scraped_text",
field=models.TextField(
blank=True,
default="",
help_text="This is the text scraped by Sinequa",
null=True,
verbose_name="Scraped Text",
),
),
]
7 changes: 7 additions & 0 deletions sde_collections/models/candidate_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ class CandidateURL(models.Model):
blank=True,
help_text="This is the original title scraped by Sinequa",
)
scraped_text = models.TextField(
"Scraped Text",
default="",
null=True,
blank=True,
help_text="This is the text scraped by Sinequa",
)
generated_title = models.CharField(
"Generated Title",
default="",
Expand Down
63 changes: 42 additions & 21 deletions sde_collections/sinequa_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import Any

import requests
Expand Down Expand Up @@ -32,17 +33,17 @@
"query_name": "query-sde-primary",
"base_url": "https://sciencediscoveryengine.nasa.gov",
},
"lis_server": {
"xli": {
"app_name": "nasa-sba-smd",
"query_name": "query-smd-primary",
"base_url": "http://sde-xli.nasa-impact.net",
},
"lrm_dev_server": {
"lrm_dev": {
"app_name": "sde-init-check",
"query_name": "query-init-check",
"base_url": "https://sde-lrm.nasa-impact.net",
},
"lrm_qa_server": {
"lrm_qa": {
"app_name": "sde-init-check",
"query_name": "query-init-check",
"base_url": "https://sde-qa.nasa-impact.net",
CarsonDavis marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -53,15 +54,13 @@
class Api:
def __init__(self, server_name: str) -> None:
CarsonDavis marked this conversation as resolved.
Show resolved Hide resolved
self.server_name = server_name
self.app_name: str = server_configs[server_name]["app_name"]
self.query_name: str = server_configs[server_name]["query_name"]
self.base_url: str = server_configs[server_name]["base_url"]
self.xli_user = settings.XLI_USER
self.xli_password = settings.XLI_PASSWORD
self.lrm_user = settings.LRM_USER
self.lrm_password = settings.LRM_PASSWORD
self.lrm_qa_user = settings.LRM_QA_USER
self.lrm_qa_password = settings.LRM_QA_PASSWORD
config = server_configs[server_name]
self.app_name: str = config["app_name"]
self.query_name: str = config["query_name"]
self.base_url: str = config["base_url"]
self.user = getattr(settings, f"{server_name}_USER".upper(), None)
self.password = getattr(settings, f"{server_name}_PASSWORD".upper(), None)
self.token = getattr(settings, f"{server_name}_TOKEN".upper(), None)

def process_response(self, url: str, payload: dict[str, Any]) -> Any:
response = requests.post(url, headers={}, json=payload, verify=False)
Expand All @@ -74,14 +73,7 @@ def process_response(self, url: str, payload: dict[str, Any]) -> Any:
return meaningful_response

def query(self, page: int, collection_config_folder: str = "") -> Any:
if self.server_name == "lis_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.xli_password}&User={self.xli_user}"
elif self.server_name == "lrm_dev_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_password}&User={self.lrm_user}"
elif self.server_name == "lrm_qa_server":
url = f"{self.base_url}/api/v1/search.query?Password={self.lrm_qa_password}&User={self.lrm_qa_user}"
else:
url = f"{self.base_url}/api/v1/search.query"
url = f"{self.base_url}/api/v1/search.query?Password={self.password}&User={self.user}"
CarsonDavis marked this conversation as resolved.
Show resolved Hide resolved
payload = {
"app": self.app_name,
"query": {
Expand All @@ -94,11 +86,40 @@ def query(self, page: int, collection_config_folder: str = "") -> Any:
}

if collection_config_folder:
if self.server_name in ["lis_server", "lrm_dev_server", "lrm_qa_server"]:
if self.server_name in ["xli", "lrm_dev", "lrm_qa"]:
payload["query"]["advanced"]["collection"] = f"/scrapers/{collection_config_folder}/"
else:
payload["query"]["advanced"]["collection"] = f"/SDE/{collection_config_folder}/"

response = self.process_response(url, payload)

return response

def sql_query(self, sql: str) -> Any:
    """Execute an SQL statement against the server's ``engine.sql`` endpoint.

    Authentication uses the bearer token stored on the instance.

    Args:
        sql: The SQL statement to send.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If no token is configured for this server.
        Exception: If the HTTP request fails (connection error, timeout or
            error status); the underlying ``requests`` exception is attached
            as ``__cause__``.
    """
    if not self.token:
        raise ValueError("You must have a token to use the SQL endpoint")

    url = f"{self.base_url}/api/v1/engine.sql"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.token}"}
    payload = json.dumps(
        {
            "method": "engine.sql",
            "sql": sql,
            "pretty": True,
            "log": False,
            "output": "json",
            "resolveIndexList": "false",
            "engines": "default",
        }
    )
    try:
        response = requests.post(url, headers=headers, data=payload, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are reported like transport errors.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Chain the original error so the traceback keeps the real failure reason.
        raise Exception(f"API request failed: {str(e)}") from e

def get_full_texts(self, collection_config_folder: str, source: str = "SDE") -> Any:
    """Fetch url, text and title for every indexed document of one collection.

    Args:
        collection_config_folder: Config folder name of the collection; it forms
            the last segment of the collection path in the SQL filter.
        source: First segment of the collection path. Defaults to "SDE", the
            value that was previously hard-coded.
            NOTE(review): ``query()`` uses "/scrapers/" for the xli, lrm_dev and
            lrm_qa servers — confirm whether callers targeting those servers
            should pass ``source="scrapers"``.

    Returns:
        Whatever ``sql_query`` returns for the generated statement.
    """
    # NOTE(review): values are interpolated directly into the SQL text; callers must
    # only pass trusted folder names.
    sql = f"SELECT url1, text, title FROM sde_index WHERE collection = '/{source}/{collection_config_folder}/'"
    return self.sql_query(sql)
34 changes: 34 additions & 0 deletions sde_collections/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from django.core.management.commands import loaddata

from config import celery_app
from sde_collections.models.candidate_url import CandidateURL

from .models.collection import Collection, WorkflowStatusChoices
from .sinequa_api import Api
Expand Down Expand Up @@ -141,3 +142,36 @@ def resolve_title_pattern(title_pattern_id):
TitlePattern = apps.get_model("sde_collections", "TitlePattern")
title_pattern = TitlePattern.objects.get(id=title_pattern_id)
title_pattern.apply()


@celery_app.task
def fetch_and_update_full_text(collection_id, server_name):
    """
    Task to fetch full text and titles for a collection from a given server and
    store them on the matching CandidateURL rows.

    Rows with an empty url, text or title are skipped. For every other row a
    CandidateURL is updated, or created if none exists for that url and collection.

    Args:
        collection_id (int): The identifier for the collection in the database.
        server_name (str): The name of the server.

    Returns:
        str: A message with the number of records written to the database,
        or a message if no records were found.
    """
    collection = Collection.objects.get(id=collection_id)
    api = Api(server_name)
    full_texts = api.get_full_texts(collection.config_folder)

    records = full_texts.get("Rows", [])
    if not records:
        return "No records found in the response."

    # Count only rows that are actually written, so the message does not include skipped rows.
    processed = 0
    for url, full_text, title in records:
        if not (url and full_text and title):
            continue

        CandidateURL.objects.update_or_create(
            url=url, collection=collection, defaults={"scraped_text": full_text, "scraped_title": title}
        )
        processed += 1
    return f"Successfully processed {processed} records and updated the database."