-
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix javascript routing problem, add check for project to match URL, fix line endings, add validation tests
- Loading branch information
Showing
3 changed files
with
247 additions
and
221 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,78 @@ | ||
import logging | ||
import urllib | ||
|
||
import mwparserfromhell | ||
import requests | ||
import validators | ||
|
||
from wp1.constants import WP1_USER_AGENT | ||
from wp1.exceptions import Wp1FatalSelectionError | ||
from wp1.selection.abstract_builder import AbstractBuilder | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Builder(AbstractBuilder):
    """Selection builder that lists the pages linked from one wiki page.

    ``build`` downloads the wikitext of the page named in a URL through the
    MediaWiki API of the given project and returns the targets of its
    wikilinks. ``validate`` performs cheap, offline checks on the same
    parameters.
    """

    def build(self, content_type, **params):
        """Fetch the page behind ``params['url']`` and return its link targets.

        Args:
            content_type: Must be ``'text/tab-separated-values'``.
            **params: Must contain two string values: ``url``, which has to
                contain ``wiki/<page name>``, and ``project``, which is used
                as the host name of the API request.

        Returns:
            bytes: The UTF-8 encoded, newline-separated link targets, in
            order of first appearance, without duplicates and with spaces
            replaced by underscores.

        Raises:
            Wp1FatalSelectionError: If the content type is not recognized, a
                parameter is missing or not a string, the URL has no page
                name after ``wiki/``, the API answers with an error status,
                or the response carries no page content.
        """
        if content_type != 'text/tab-separated-values':
            raise Wp1FatalSelectionError('Unrecognized content type')
        for name in ('url', 'project'):
            if name not in params:
                raise Wp1FatalSelectionError(
                    'Missing required param: %s' % name)
        for name in ('url', 'project'):
            if not isinstance(params[name], str):
                raise Wp1FatalSelectionError('Param `%s` was not str' % name)

        # Everything after the first 'wiki/' is the page name; partition keeps
        # page names that themselves contain 'wiki/' intact and lets us detect
        # a URL without that segment instead of failing with IndexError.
        _, marker, book_name = params['url'].partition('wiki/')
        if not marker or not book_name:
            raise Wp1FatalSelectionError(
                'Param `url` does not contain a page name after "wiki/"')

        final_url = (
            'https://%s/w/api.php?'
            'action=query&prop=revisions&rvprop=content&format=json&rvslots=main'
            '&titles=%s' % (params['project'], book_name))

        resp = requests.get(final_url, headers={'User-Agent': WP1_USER_AGENT})
        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logger.exception('Error status received from Wikipedia API')
            raise Wp1FatalSelectionError(
                'Error status received from Wikipedia API') from e

        # A missing page (or any unexpected payload) has no revision content;
        # report that as a selection error rather than a bare lookup failure.
        try:
            pages = resp.json()['query']['pages']
            page = list(pages.values())[0]
            wikitext = page['revisions'][0]['slots']['main']['*']
        except (KeyError, IndexError, ValueError) as e:
            logger.exception('No page content in Wikipedia API response')
            raise Wp1FatalSelectionError(
                'No page content in Wikipedia API response') from e

        seen = set()
        titles = []
        for link in mwparserfromhell.parse(wikitext).filter_wikilinks():
            # Use the link target only: for a piped link such as
            # [[Target|label]] the label is not part of the page title.
            title = str(link.title).replace(' ', '_')
            if title not in seen:
                seen.add(title)
                titles.append(title)

        return '\n'.join(titles).encode('utf-8')

    def validate(self, **params):
        """Check the ``url`` and ``project`` parameters without network access.

        Args:
            **params: Expected to contain ``url`` and ``project``.

        Returns:
            tuple: A 3-tuple whose last element is a list of error messages
            (empty when validation passes). The second element is the
            rejected URL, or ``''`` when there is none to report; the first
            element is always ``''`` here. NOTE(review): presumably the
            layout is (valid values, invalid values, errors) - confirm
            against AbstractBuilder.
        """
        # `import urllib` alone does not guarantee the `parse` submodule is
        # loaded, so import the function explicitly.
        from urllib.parse import urlparse

        if 'url' not in params:
            # There is no URL to echo back; indexing params['url'] here would
            # raise KeyError instead of reporting the problem.
            return ('', '', ['Missing URL parameter'])
        url = params['url']

        if 'project' not in params:
            return ('', url, ['Missing project parameter'])
        project = params['project']

        if project not in url:
            return ('', url, [
                'The domain of your URL does not match your '
                'selected project (project is: %s, URL has: %s)' %
                (project, urlparse(url).netloc)
            ])

        if not validators.url(url):
            return ('', url, ['That doesn\'t look like a valid URL.'])

        return ('', '', [])
import logging | ||
import urllib | ||
|
||
import mwparserfromhell | ||
import requests | ||
import validators | ||
|
||
from wp1.constants import WP1_USER_AGENT | ||
from wp1.exceptions import Wp1FatalSelectionError | ||
from wp1.selection.abstract_builder import AbstractBuilder | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Builder(AbstractBuilder):
    """Selection builder that lists the pages linked from one wiki page.

    ``build`` downloads the wikitext of the page named in a URL through the
    MediaWiki API of the given project and returns the targets of its
    wikilinks. ``validate`` performs cheap, offline checks on the same
    parameters.
    """

    def build(self, content_type, **params):
        """Fetch the page behind ``params['url']`` and return its link targets.

        Args:
            content_type: Must be ``'text/tab-separated-values'``.
            **params: Must contain two string values: ``url``, which has to
                contain ``wiki/<page name>``, and ``project``, which is used
                as the host name of the API request.

        Returns:
            bytes: The UTF-8 encoded, newline-separated link targets, in
            order of first appearance, without duplicates and with spaces
            replaced by underscores.

        Raises:
            Wp1FatalSelectionError: If the content type is not recognized, a
                parameter is missing or not a string, the URL has no page
                name after ``wiki/``, the API answers with an error status,
                or the response carries no page content.
        """
        if content_type != 'text/tab-separated-values':
            raise Wp1FatalSelectionError('Unrecognized content type')
        for name in ('url', 'project'):
            if name not in params:
                raise Wp1FatalSelectionError(
                    'Missing required param: %s' % name)
        for name in ('url', 'project'):
            if not isinstance(params[name], str):
                raise Wp1FatalSelectionError('Param `%s` was not str' % name)

        # Everything after the first 'wiki/' is the page name; partition keeps
        # page names that themselves contain 'wiki/' intact and lets us detect
        # a URL without that segment instead of failing with IndexError.
        _, marker, book_name = params['url'].partition('wiki/')
        if not marker or not book_name:
            raise Wp1FatalSelectionError(
                'Param `url` does not contain a page name after "wiki/"')

        final_url = (
            'https://%s/w/api.php?'
            'action=query&prop=revisions&rvprop=content&format=json&rvslots=main'
            '&titles=%s' % (params['project'], book_name))

        resp = requests.get(final_url, headers={'User-Agent': WP1_USER_AGENT})
        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            logger.exception('Error status received from Wikipedia API')
            raise Wp1FatalSelectionError(
                'Error status received from Wikipedia API') from e

        # A missing page (or any unexpected payload) has no revision content;
        # report that as a selection error rather than a bare lookup failure.
        try:
            pages = resp.json()['query']['pages']
            page = list(pages.values())[0]
            wikitext = page['revisions'][0]['slots']['main']['*']
        except (KeyError, IndexError, ValueError) as e:
            logger.exception('No page content in Wikipedia API response')
            raise Wp1FatalSelectionError(
                'No page content in Wikipedia API response') from e

        seen = set()
        titles = []
        for link in mwparserfromhell.parse(wikitext).filter_wikilinks():
            # Use the link target only: for a piped link such as
            # [[Target|label]] the label is not part of the page title.
            title = str(link.title).replace(' ', '_')
            if title not in seen:
                seen.add(title)
                titles.append(title)

        return '\n'.join(titles).encode('utf-8')

    def validate(self, **params):
        """Check the ``url`` and ``project`` parameters without network access.

        Args:
            **params: Expected to contain ``url`` and ``project``.

        Returns:
            tuple: A 3-tuple whose last element is a list of error messages
            (empty when validation passes). The second element is the
            rejected URL, or ``''`` when there is none to report; the first
            element is always ``''`` here. NOTE(review): presumably the
            layout is (valid values, invalid values, errors) - confirm
            against AbstractBuilder.
        """
        # `import urllib` alone does not guarantee the `parse` submodule is
        # loaded, so import the function explicitly.
        from urllib.parse import urlparse

        if 'url' not in params:
            # No URL was supplied, so there is nothing to echo back.
            return ('', '', ['Missing URL parameter'])
        url = params['url']

        if 'project' not in params:
            return ('', url, ['Missing project parameter'])
        project = params['project']

        if project not in url:
            return ('', url, [
                'The domain of your URL does not match your '
                'selected project (project is: %s, URL has: %s)' %
                (project, urlparse(url).netloc)
            ])

        if not validators.url(url):
            return ('', url, ['That doesn\'t look like a valid URL.'])

        return ('', '', [])
Oops, something went wrong.