-
Notifications
You must be signed in to change notification settings - Fork 55
/
image_wall.py
186 lines (172 loc) · 8.69 KB
/
image_wall.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import argparse
import os
import random
import re
import requests
import time
from bs4 import BeautifulSoup
from io import BytesIO
from item import Item, get_next_img_url
from PIL import Image, UnidentifiedImageError
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--id', type=str, required=True, help='User id, or a link containing it')
parser.add_argument('-o', '--offset', type=int, default=13,
help='In the first 4 rows, a larger image will occur once every {offset} cells.'
'The following rows repeat this pattern. Set to 0 to disable larger images.')
parser.add_argument('-c', '--col', type=int, default=5, help='Number of columns in the matrix')
parser.add_argument('-r', '--row', type=int, default=8, help='Number of rows in the matrix')
parser.add_argument('-w', '--width', type=int, default=540, help='Width of one cell. A value less than 600 is recommended.')
parser.add_argument('-ht', '--height', type=int, default=750, help='Height of one cell. A value less than 800 is recommended. Will be the same as cell width in music mode.')
parser.add_argument('-rd', '--random', type=bool, default=False, help='Whether to shuffle the images')
parser.add_argument('-rt', '--rating', type=int, default=0, help='Rating filter. Only integers between 1 (one star) and 5 (five stars) are meaningful')
parser.add_argument('-s', '--sort-by-time', type=bool, default=False,
help='Whether to sort chronologically (neareast first). Default is False, as default sort order is by rating.')
parser.add_argument('-m', '--mode', type=str, default='movie', choices=['book', 'movie', 'music'], help='"movie", "music" or "book". In "music" mode, cell height will be the same as cell width.')
parser.add_argument('-t', '--threshold', type=int, default=300,
help='Number of cached images to pertain. Once the number goes beyond this threshold, '
'unused cache gets removed. Set to 0 to immediately remove unused cache after a run. Set to negative to disable cache.')
parser.add_argument('-l', '--limit', type=int, default=200,
help='Limit of number of items to process. Default is 200. Raise this when COL x ROW > 200.')
parser.add_argument('--max-width', type=int, default=4000, help='Maximum width of the output in pixel number.')
parser.add_argument('--max-height', type=int, default=8000, help='Maximum height of the output in pixel number.')
args = parser.parse_args()
_MUSIC_MODE = args.mode == 'music'
_BOOK_MODE = args.mode == 'book'
_COLUMN_NUM = args.col
_ROW_NUM = args.row
_OFFSET = args.offset
_CELL_WIDTH = args.width
_CELL_HEIGHT = args.width if _MUSIC_MODE else args.height
_WIDTH = _CELL_WIDTH * _COLUMN_NUM
_HEIGHT = _CELL_HEIGHT * _ROW_NUM
_CACHE = 'cache/'
_OUTPUT = 'output/'
_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.128 Safari/537.36'
larger_image_index = []
if _OFFSET:
for x in range(0, _COLUMN_NUM * 3, _OFFSET):
for y in range(x, _COLUMN_NUM * _ROW_NUM, _COLUMN_NUM * 4):
if y % _COLUMN_NUM >= _COLUMN_NUM - 1:
print('Invalid offset. Try decrease or increase by 1.')
exit()
larger_image_index.append(y)
skip_image_index = [x for i in larger_image_index for x in (i + 1, i + _COLUMN_NUM, i + _COLUMN_NUM + 1) if x < _COLUMN_NUM * _ROW_NUM]
if any(x in larger_image_index for x in skip_image_index):
print('Invalid offset. Try decrease or increase by 1.')
exit()
if args.id.isnumeric() or 'douban.com' not in args.id:
id = args.id
else:
match = re.match('.*people/(.*)/.*', args.id)
if not match:
print('Invalid id.')
exit()
id = match[1]
start = range(0, args.limit, 15)
urls = (f'https://{args.mode}.douban.com/people/{id}/collect?'
f'start={x}&sort={"time" if args.sort_by_time else "rating"}'
f'&rating={"all" if args.rating<1 or args.rating>5 else args.rating}'
f'&filter=all&mode=grid&tags_sort=count'
for x in start)
headers = {'User-Agent': _UA}
items = []
enough_met = False
rating_span_regex = re.compile('rating(\d)-t')
for url in urls:
headers['Referer'] = url
response = requests.get(url, headers=headers).text
soup = BeautifulSoup(response, features='html.parser')
item_divs = soup.find_all('li', {'class': 'subject-item'}) if _BOOK_MODE else soup.find_all('div', {'class': 'item'})
if not item_divs:
break
for item_div in item_divs:
name = item_div.find('a', title=True)['title'] if _BOOK_MODE else item_div.find('em').text
img_url = item_div.find('a', {'class': 'nbg'}).find('img', recursive=False)['src']
rating_span = item_div.find('span', {'class': rating_span_regex})
rating = int(re.match(rating_span_regex, rating_span['class'][0])[1]) if rating_span else 0
items.append(Item(name, img_url, rating))
if len(items) >= _COLUMN_NUM * _ROW_NUM - len(skip_image_index):
enough_met = True
break
if enough_met:
break
time.sleep(5)
if len(items):
print(f'A total of {len(items)} images to process.')
else:
print('Error: No image to process. HTTP request may have been blocked.')
exit()
if args.random:
random.shuffle(items)
os.makedirs(_CACHE, exist_ok=True)
cache_imgs = {_CACHE + f: False for f in os.listdir(_CACHE) if os.path.isfile(_CACHE + f) and (f.lower().endswith('.jpg') or f.lower().endswith('.webp'))}
result = Image.new('RGB', (_WIDTH, _HEIGHT))
count = 0
for j in range(0, _HEIGHT, _CELL_HEIGHT):
for i in range(0, _WIDTH, _CELL_WIDTH):
if count in skip_image_index:
count += 1
continue
need_large = count in larger_image_index
image_url = get_next_img_url(items, args.sort_by_time, need_large)
if not image_url:
break
size = (_CELL_WIDTH, _CELL_HEIGHT)
if need_large:
size = (_CELL_WIDTH * 2, _CELL_HEIGHT * 2)
if _MUSIC_MODE or _BOOK_MODE:
image_url = image_url.replace('subject/s', 'subject/l')
else:
image_url = image_url.replace('s_ratio_poster', 'l_ratio_poster')
elif _MUSIC_MODE or _BOOK_MODE:
image_url = image_url.replace('subject/s', 'subject/m')
else:
image_url = image_url.replace('s_ratio_poster', 'm_ratio_poster')
count += 1
cache_img_name = _CACHE + image_url[image_url.rfind('/') + 1:]
need_retrieve = True
if cache_img_name in cache_imgs:
need_retrieve = False
print(f'Found cache for {image_url}: {cache_img_name}')
cache_imgs[cache_img_name] = True
img = Image.open(cache_img_name)
if need_large and img.width < 600:
print('...but size is smaller than needed. Will retrieve and overwrite.')
need_retrieve = True
if need_retrieve:
print('Retrieving ' + image_url)
response = requests.get(image_url, stream=True, headers=headers)
try:
img = Image.open(BytesIO(response.content))
except:
print('Failed to process. Trying again with webp...')
time.sleep(3)
image_url = image_url.replace('.jpg', '.webp')
response = requests.get(image_url, stream=True, headers=headers)
try:
img = Image.open(BytesIO(response.content))
except:
print('Still failing! Trying again with larger image size...')
time.sleep(3)
if _MUSIC_MODE or _BOOK_MODE:
image_url = image_url.replace('subject/l', 'subject/b').replace('subject/m', 'subject/l')
else:
image_url = image_url.replace('l_ratio_poster', 'b_ratio_poster').replace('m_ratio_poster', 'l_ratio_poster')
response = requests.get(image_url, stream=True, headers=headers)
try:
img = Image.open(BytesIO(response.content))
except:
print('Still failing! Giving up.')
continue
if args.threshold >= 0:
img.save(cache_img_name, optimize=True)
time.sleep(3)
result.paste(img.resize(size, Image.LANCZOS), (i, j))
result.thumbnail((args.max_width, args.max_height), Image.LANCZOS)
os.makedirs(_OUTPUT, exist_ok=True)
result.save(f'{_OUTPUT}/{id}_{args.mode}.jpg', optimize=True, quaility=80)
if len(cache_imgs) > args.threshold:
for cached_img, used in cache_imgs.items():
if not used or args.threshold < 0:
print(f'Removing unused cache {cached_img}...')
os.remove(cached_img)