Skip to content

Commit

Permalink
Merge branch 'master' into pull-requests/co-uk-fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jmeosbn authored Dec 24, 2017
2 parents c987702 + 8f0a91a commit 1ec50dd
Showing 1 changed file with 47 additions and 16 deletions.
63 changes: 47 additions & 16 deletions Contents/Code/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@

def json_decode(output):
    """Parse *output* as JSON and return the decoded object.

    Returns None instead of raising when *output* is not valid JSON
    (or not a string at all), so callers can test the result directly.
    """
    try:
        # NOTE(review): the upstream diff passed encoding="utf-8" here; that
        # kwarg was ignored for str input and removed in Python 3.9, so it is
        # intentionally omitted. json.loads handles UTF-8 text natively.
        return json.loads(output)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON; TypeError covers non-string input
        # (e.g. None). Anything else is a real bug and should propagate.
        return None


# URLs
VERSION_NO = '1.2017.11.10.1'
VERSION_NO = '1.2017.12.21.1'

REQUEST_DELAY = 0 # Delay used when requesting HTML, may be good to have to prevent being banned from the site

Expand All @@ -31,10 +31,10 @@ def json_decode(output):
THREAD_MAX = 20

# Per-language Audible site settings: domain, the query parameter used for
# title searches ('urltitle'), and the localized page labels scraped for the
# release date ('rel_date', optional alternate 'rel_date2') and narrator
# ('nar_by', optional alternate 'nar_by2').
# NOTE(review): the diff overlay left each key defined twice (old line then
# new line); only the effective (new) entries are kept here, since with
# duplicate dict keys the later definition wins anyway.
intl_sites = {
    'en': {'url': 'www.audible.com', 'urltitle': u'title=',        'rel_date': u'Release date',          'nar_by': u'Narrated By',    'nar_by2': u'Narrated by'},
    'fr': {'url': 'www.audible.fr',  'urltitle': u'searchTitle=',  'rel_date': u'Date de publication',   'nar_by': u'Narrateur(s)',   'nar_by2': u'Lu par'},
    'de': {'url': 'www.audible.de',  'urltitle': u'searchTitle=',  'rel_date': u'Erscheinungsdatum',     'nar_by': u'Gesprochen von', 'rel_date2': u'Veröffentlicht'},
    'it': {'url': 'www.audible.it',  'urltitle': u'searchTitle=',  'rel_date': u'Data di Pubblicazione', 'nar_by': u'Narratore'},
    # 'jp': {'url': 'www.audible.co.jp', 'rel_date': u'N/A', 'nar_by': u'ナレーター'},  # untested
}

Expand All @@ -57,6 +57,7 @@ def SetupUrls(sitetype, base, lang='en'):
Log('Pulling language from sites array')
lang=sites_langs[base]['lang']
if lang in intl_sites :
urlsearchtitle=intl_sites[lang]['urltitle']
ctx['REL_DATE']=intl_sites[lang]['rel_date']
ctx['NAR_BY' ]=intl_sites[lang]['nar_by']
if 'rel_date2' in intl_sites[lang]:
Expand All @@ -68,7 +69,7 @@ def SetupUrls(sitetype, base, lang='en'):
else:
ctx['NAR_BY_INFO' ]=ctx['NAR_BY' ]
else:
ctx['REL_DATE' ]='Release Date'
ctx['REL_DATE' ]='Release date'
ctx['REL_DATE_INFO']=ctx['REL_DATE']
ctx['NAR_BY' ]='Narrated By'
ctx['NAR_BY_INFO' ]='Narrated by'
Expand All @@ -86,6 +87,7 @@ def SetupUrls(sitetype, base, lang='en'):
base='www.audible.com'
if lang in intl_sites :
base=intl_sites[lang]['url']
urlsearchtitle=intl_sites[lang]['urltitle']
ctx['REL_DATE']=intl_sites[lang]['rel_date']
ctx['NAR_BY' ]=intl_sites[lang]['nar_by']
if 'rel_date2' in intl_sites[lang]:
Expand All @@ -97,18 +99,19 @@ def SetupUrls(sitetype, base, lang='en'):
else:
ctx['NAR_BY_INFO' ]=ctx['NAR_BY' ]
else:
ctx['REL_DATE' ]='Release Date'
ctx['REL_DATE' ]='Release date'
ctx['REL_DATE_INFO']=ctx['REL_DATE']
ctx['NAR_BY' ]='Narrated By'
ctx['NAR_BY_INFO' ]='Narrated by'


AUD_BASE_URL='https://' + str(base) + '/'
AUD_TITLE_URL=urlsearchtitle
ctx['AUD_BOOK_INFO' ]=AUD_BASE_URL + 'pd/%s?ipRedirectOverride=true'
ctx['AUD_ARTIST_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchAuthor=%s&ipRedirectOverride=true'
ctx['AUD_ALBUM_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchTitle=%s&x=41&ipRedirectOverride=true'
ctx['AUD_ALBUM_SEARCH_URL' ]=AUD_BASE_URL + 'search?' + AUD_TITLE_URL + '%s&x=41&ipRedirectOverride=true'
ctx['AUD_KEYWORD_SEARCH_URL']=AUD_BASE_URL + 'search?filterby=field-keywords&advsearchKeywords=%s&x=41&ipRedirectOverride=true'
ctx['AUD_SEARCH_URL' ]=AUD_BASE_URL + 'search?searchTitle={0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
ctx['AUD_SEARCH_URL' ]=AUD_BASE_URL + 'search?' + AUD_TITLE_URL + '{0}&searchAuthor={1}&x=41&ipRedirectOverride=true'
return ctx


Expand Down Expand Up @@ -169,6 +172,7 @@ def doSearch(self, url, ctx):
html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY)

found = []

for r in html.xpath('//div[a/img[@class="yborder"]]'):
date = self.getDateFromString(self.getStringContentFromXPath(r, 'text()[1]'))
title = self.getStringContentFromXPath(r, 'a[2]')
Expand Down Expand Up @@ -264,7 +268,22 @@ def findDateInTitle(self, title):
def doSearch(self, url, ctx):
html = HTML.ElementFromURL(url, sleep=REQUEST_DELAY)
found = []

self.Log('-----------------------------------------just before new xpath line--------------------')
for r in html.xpath('//ul//li[contains(@class,"productListItem")]'):
datetext = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"releaseDateLabel")]/span'.decode('utf-8'))
datetext=re.sub(r'[^0-9\-]', '',datetext)
date=self.getDateFromString(datetext)
title = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul//a[1]')
murl = self.getAnchorUrlFromXPath(r, 'div/div/div/div/div/div/span/ul/li/h3//a[1]')
thumb = self.getImageUrlFromXPath(r, 'div/div/div/div/div/div/div[contains(@class,"responsive-product-square")]/div/a/img')
author = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"authorLabel")]/span/a[1]')
narrator = self.getStringContentFromXPath(r, 'div/div/div/div/div/div/span/ul/li[contains (@class,"narratorLabel")]/span//a[1]'.format(ctx['NAR_BY']).decode('utf-8'))
self.Log('---------------------------------------XPATH SEARCH HIT-----------------------------------------------')

found.append({'url': murl, 'title': title, 'date': date, 'thumb': thumb, 'author': author, 'narrator': narrator})

self.Log('-----------------------------------------just after new xpath line--------------------')

for r in html.xpath('//div[contains (@class, "adbl-search-result")]'):
date = self.getDateFromString(self.getStringContentFromXPath(r, 'div/div/ul/li[contains (., "{0}")]/span[2]//text()'.format(ctx['REL_DATE']).decode('utf-8')))
title = self.getStringContentFromXPath(r, 'div/div/div/div/a[1]')
Expand Down Expand Up @@ -351,7 +370,7 @@ def search(self, results, media, lang, manual):
self.Log('Found %s result(s) for query "%s"', len(found), normalizedName)
i = 1
for f in found:
self.Log(' %s. (title) %s (url)[%s] (date)(%s) (thumb){%s}', i, f['title'], f['url'], str(f['date']), f['thumb'])
self.Log(' %s. (title) %s (author) %s (url)[%s] (date)(%s) (thumb){%s}', i, f['title'], f['author'], f['url'], str(f['date']), f['thumb'])
i += 1

self.Log('-----------------------------------------------------------------------')
Expand All @@ -363,11 +382,17 @@ def search(self, results, media, lang, manual):
self.Log('URL For Breakdown: %s', url)

# Get the id
# for itemId in url.split('/') :
for itemId in url.split('/') :
if re.match(r'B0[0-9A-Z]{8,8}', itemId):
break
itemId=None

#New Search results contain question marks after the ID
for itemId in itemId.split('?') :
if re.match(r'B0[0-9A-Z]{8,8}', itemId):
break

if len(itemId) == 0:
Log('No Match: %s', url)
continue
Expand Down Expand Up @@ -446,10 +471,10 @@ def update(self, metadata, media, lang, force=False):
pass

date=None
rating=None
series=''
genre1=None
genre2=None
rating=0

for r in html.xpath('//div[contains (@id, "adbl_page_content")]'):
date = self.getDateFromString(self.getStringContentFromXPath(r, '//li[contains (., "{0}")]/span[2]//text()'.format(ctx['REL_DATE_INFO']).decode('utf-8')))
Expand All @@ -466,10 +491,16 @@ def update(self, metadata, media, lang, force=False):
self.Log('---------------------------------------XPATH SEARCH HIT-----------------------------------------------')

if date is None :
#for r in html.xpath('//div[contains (@class,"slot bottomSlot")]/script[contains (@type, "application/ld+json")]'):
for r in html.xpath('//script[contains (@type, "application/ld+json")]'):
page_content = r.text_content()
page_content = page_content.replace('\n', '') # Remove and new lines. JSON doesn't like them.
page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content) # Remove any backslashes that aren't escaping a character JSON needs escaped
page_content = page_content.replace('\n', '')
#page_content = page_content.replace('\'', '\\\'')
#page_content = re.sub(r'\\(?![bfnrtv\'\"\\])', '', page_content)
# Remove any backslashes that aren't escaping a character JSON needs escaped
remove_inv_json_esc=re.compile(r'([^\\])(\\(?![bfnrt\'\"\\/]|u[A-Fa-f0-9]{4}))')
page_content=remove_inv_json_esc.sub(r'\1\\\2', page_content)
self.Log(page_content)
json_data=json_decode(page_content)
for json_data in json_data:
if 'datePublished' in json_data:
Expand Down Expand Up @@ -499,7 +530,7 @@ def update(self, metadata, media, lang, force=False):
#for key in json_data:
# Log('{0}:{1}'.format(key, json_data[key]))
genre1=json_data['itemListElement'][1]['item']['name']
try: # Not all books have two genre tags.
try:
genre2=json_data['itemListElement'][2]['item']['name']
except:
continue
Expand Down

0 comments on commit 1ec50dd

Please sign in to comment.