-
Notifications
You must be signed in to change notification settings - Fork 0
/
Best_score(Ensembling)(0.64 score).py
362 lines (228 loc) · 53 KB
/
Best_score(Ensembling)(0.64 score).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from sklearn.model_selection import GridSearchCV
# In[2]:
trainset=pd.read_csv('train.csv')
testset=pd.read_csv('test.csv')
# ## Below are all the preprocessing steps we have done to clean and properly process our dataset
# - We are doing contraction mapping to reduce similar words into their common form . Abbreviated words are converted to their normal forms
# - Removing punctuations
# - Converting non-english words to english words
# - Tokenization
# - Lemmatization
# - Stopword removal
# - Spelling correction
# - Removing ASCII symbols
# In[3]:
words_to_english_map = {'隶书': 'Lishu', 'तपस्': 'Tapse', 'ให้': 'give', '宗教官': 'Religious officer', '没有神一样的对手': 'No god-like opponent', 'יהודיות': 'Jewish women', '我明明是中国人': 'I am obviously Chinese.', 'お早う': 'Good morning', 'मेरे': 'my', 'পরীক্ষা': 'Test', 'נעכטן': 'Yesterday', '中华民国': 'Republic of China', 'तकनीकी': 'Technical', 'यहाँ': 'here', '奇兵隊': 'Qibing', 'か゚': 'Or', '知识分子': 'Intellectuals', 'ごめなさい': "I'm sorry", '하기를': 'To', 'गाढवाचे': 'Donkey', 'इबादत': 'Worship', '千金': 'Thousand gold', 'ельзи': 'elzhi', '低端人口': 'Low-end population', '맛저': 'Mazur', 'दानि': 'Danias', 'юродство': 'foolishness', 'τον': 'him', '素质': 'Quality', '王晓菲': 'Wang Xiaofei', 'माहेर': 'Maher', 'لمصارى': 'For my advice', '送客!': 'see a visitor out!', '然而': 'however', 'जिग्यासा': 'Jigyaasa', '都市': 'city', 'ஜோடி': 'Pair', 'لاني': 'See', 'пельмени': 'dumplings', '请不要误会': "Please don't misunderstand", 'люис': 'Lewis', '不送!': 'Do not send!', 'के': 'Of', 'मस्का': 'Mascara', 'вoyѕ': 'voyce', 'बुन्ना': 'Bunna', '战略支援部队': 'Strategic support force', '平埔族': 'Pingpu', 'فهمت؟': 'I see?', 'कयामत': 'Apocalypse', 'ức': 'memory', 'ᗰeᑎ': 'ᗰe ᑎ', 'सेकना': 'Sexting', 'धकेलना': 'Shove', 'পারি': 'We can', 'مقام': 'Official', 'बेसति': 'Baseless', '歪果仁研究协会': 'Hazelnut Research Association', '短信': 'SMS', 'всегда': 'is always', '修身': 'Slim fit', 'إنَّ': 'that', '不是民国': 'Not the Republic of China', '写的好': 'Written well', 'õhtupoolik': 'afternoon', 'तालीन': 'Training', 'और': 'And', 'щит': 'shield', 'ᗩtteᑎtiᐯe': 'ᗩtte ᑎ ti ᐯ e', 'اسکوپیه': 'Script', 'çomar': 'fido', '和製英語': 'Japanglish', '吊着命': 'Hanging', 'много': 'much', 'समुराय': 'Samurai', 'भटकना': 'Wander', 'مقاومة': 'resistance', '싱관없어': 'I have no one.', '修身養性': 'Self-cultivation', 'मटका': 'pot', 'θιοψʼ': 'thiós', 'ㅎㅎㅎㅎ': 'Hehe', 'تساوى': 'Equal', 'बाट': 'From', '不地道啊。': 'Not authentic.', 'контакт': 'contact', '이런것도': 'This', 'तै': 'The', 'मेल': 'similarity', 'álvarez': 'Alvarez', 'नुक्स': 'Damage', '口訣': 'Mouth', 'масло': 'butter', 'परम्परा': 'Tradition', '学会': 'learn', 'کردن': 'Make up', 'öd': 'bile', 'टशन': 'Tashan', 'つらやましす': 'I am cheating', 'чего': 'what', '为什么总有人能在shoplex里免费获取iphone等重金商品?': 'Why do people always get free access to iphone and other heavy commodities in shoplex?', 'श्री': 'Mr', 'प्रेषक': 'Sender', 'خواندن': 'Read', 'बदलेंगे': 'Will change', 'बीएड': 'Breed', 'अदा': 'Paid', 'फैलाना': 'spread', 'ബുദ്ധിമാനായ': 'Intelligent', '谢谢六佬': 'Thank you, Liu Wei', 'উপস্থাপন': 'Present', 'بكلها': 'All of them', 'जाए': 'Go', '无可挑剔': 'Impeccable', 'आना': 'come', '太阴': 'lunar', 'القط': 'The cat', '있네': 'It is.', 'कर्म': 'Karma', 'आड़': 'Shroud', 'įit': 'IIT', 'जशने': 'Jashnay', 'से': 'From', 'åkesson': 'Åkesson', '乳名': 'Milk name', '我看起来像韩国人吗': 'Do I look like a Korean?', 'тнan': 'tn', 'العظام': 'Bones', '暹罗': 'Siam', '小肥羊鍋店': 'Little fat sheep pot shop', 'করেন': 'Do it', 'हरामी': 'bastard', 'डाले': 'Cast', 'استراحة': 'Break', '射道': 'Shooting', 'επιστήμη': 'science', 'χράομαι': "I'm glad", '脚踏车': 'bicycle', 'हिलना': 'To move', 'đa': 'multi', '標點符號': 'Punctuation', 'मैं': 'I', '不能化生水谷精微': "Can't transform the water valley", 'निवाला': 'Boar', 'दर्शनाभिलाषी': 'Spectacular', 'বাছাই': 'Picking', 'どくぜんてき': 'Dokedari', 'पीवें।': 'Pieve.', 'ʻoumuamua': 'Priority', 'में': 'In', 'चलाऊ': 'Run', 'निपटाना': 'To settle', 'ごはん': 'rice', 'चार्ज': 'Charge', 'शिथिल': 'Loosely', 'ястребиная': 'yastrebina', 'ложись': 'lie down', '李银河': 'Li Yinhe', 'へも': 'Also', 'हुलिया': 'Hulia', 'ऊब': 'Bored', 'छाछ': 'Buttermilk', 'عن': 'About', 'नहीं': 'No', 'जीव': 'creatures', 'जेहाद': 'Jehad', 'νερό': 'water', '열여섯': 'sixteen', 'मार': 'Kill', '早就': 'Long ago', '《齐天大圣》for': '"Qi Tian Da Sheng" for', 'الجنس': 'Sex', 'مع': 'With', 'रंगरलियाँ': 'Color palettes', 'जोल': 'Jol', 'बुद्धी': 'Intelligence', '罗思成': 'Luo Sicheng', '独善的': 'Self-righteousness', 'धर्मः': 'Religion', '中国大陆的同胞们': 'Chinese compatriots', '愛してない': 'I do not love you', '日本語が上手ですね': 'Japanese is good', 'ओठ': 'Lips', '如果你希望表達你的觀點': 'If you want to express your opinion', 'देशवाशी': 'Countryman', 'καί': 'and', '阮铭': 'Yu Ming', '跑步跑': 'Running', 'हो': 'Ho', '「褒められる」': '"Praised"', 'నా': 'My', '\x10new': '?new', 'ゆいがどくそん': 'A graduate student', '白语': 'White language', 'वह': 'She', '白い巨塔': 'White big tower', '로리이고': 'Lori.', 'परि': 'Circumcision', 'æj': 'Aw', 'انضيع': 'Lost', '平民苗字必称義務令': 'Civilian Miaozi must be called an obligation', 'ਸਿੰਘ': 'Singh', 'уже': 'already', 'đầu': 'head', '雨热同期': 'Rain and heat', 'घुलना': 'Dissolve', 'мис': 'Miss', 'て下さいませんか': 'Could it be?', '猫头鹰': 'owl', 'चढ़': 'Climbing', '漢音': 'Hanyin', 'यही': 'This only', '只有猪一样的队友': 'Only pig-like teammates', 'übersoldaten': 'about soldiers', 'αθ': 'Ath', 'ουδεις': 'no', '党国合一': 'Party and country', 'रेड': 'Red', 'ढोलना': 'Drift', 'शाक': 'Shake', '같이': 'together', '攻克': 'capture', 'özalan': 'exclusive', 'काम': 'work', 'चाहना': 'Wish', '坚持一个中国原则': 'Adhere to the one-China principle', '배우고': 'Learn', '柴棍': 'Firewood stick', 'उसे': 'To him', 'на': 'on', '無手': 'No hands', 'čvrsnica': 'Čvrsnica', 'ज़ार': 'Tsar', 'ˆo': 'O', '後宮甄嬛傳': 'Harem of the harem', '意音文字': 'Sound character', 'बहारा': 'Bahara', 'てみる': 'Try', '老铁': 'Old iron', '野比のび太': 'Nobita Nobita', 'याद': 'remember', 'پیشی': 'Surpass', 'توقعك': 'Your expectation', 'はたち': 'Slender', 'فزت': 'I won', '伊藤': 'Ito', 'कलेजे': 'Liver', 'αγεν': 'ruin', 'ìmprovement': 'Improvement', 'では': 'Then.', 'பார்ப்பது': 'Viewing', '只留清气满乾坤': 'Only stay clear and full of energy', '이정도쯤': 'About this', 'すてん': 'Sponge', 'ما': 'What', '海南人の日本': "Hainan people's Japan", '小鹿乱撞': 'very excited', 'ῥωμαῖοι': 'ῥomaşii', 'वारी': 'Vary', 'भोज': 'Feast', '陈庚': 'Chen Geng', 'कट्टरवादि': 'Fanatic', '凌霸': 'Lingba', 'पार': 'The cross', 'कुचलना': 'To crush', 'रहा': 'Stayed', 'हम': 'We', 'бойся': 'be afraid', '大刀': 'Large sword', 'ন্ন': 'Done', '汽车': 'car', 'কে': 'Who', 'الکعبه': 'Alkaebe', '网络安全法': 'Cybersecurity law', 'नेपाली': 'Nepali', 'পদের': 'Position', '\x92t': '??t', 'रास': 'Ras', 'لكل': 'for every', 'शूद्राणां': 'Shudran', '此问题只是针对外国友人的一次统计': 'This question is only a statistic for foreign friends.', 'ឃើញឯកសារអំពីប្រវត្តិស្ត្រនៃប្រាសាទអង្គរវត្ត': 'See the history of Angkor Wat', 'спасибо': 'thank', 'رب': 'God', 'वजूद': 'Non existent', 'पकड़': 'Hold', 'बरहा': 'Baraha', '雾草': 'Fog grass', 'мощный': 'powerful', 'বৃদ্ধাশ্রম': 'Old age', 'आई': 'Mother', 'खड़ी': 'Steep', '\x1aaùõ\x8d': '? aùõ ??', 'āto': 'standard', '在冰箱裡': 'in the refrigerator', 'đời': 'life', 'लड़का': 'The boy', 'τὸ': 'the', 'مدرسان': 'Instructors', 'பறை': 'Drum', 'índia': 'India', 'दिया': 'Gave', '小篆': 'Xiao Yan', '唐樂': 'Tang Le', '米国': 'USA', '文言文': 'Classical Chinese', 'उवाच': 'Uwach', 'هدف': 'Target', 'घेरा': 'Cordon', 'झूम': 'Zoom', 'εσσα': 'all the same', '和製漢語': 'And making Chinese', 'ऐलोपैथिक': 'Allopathic', 'создала': 'has created', 'إدمان': 'addiction', '游泳游': 'Swimming tour', '狮子头': 'Lion head', 'दिये': 'Given', 'पास': 'near', 'εντjα': 'in', 'まする': 'To worship', 'हाल': 'condition', '짱깨': 'Chin', 'জল': 'Water', 'चीज': 'thing', 'îmwe': 'one', '胡江南': 'Hu Jiangnan', 'तमाम': 'All', 'कट्टर': 'Hardcore', '魔法师': 'Magician', 'బుూ\u200c': 'Buu', 'चनचल': 'Chanchal', 'सीधा': 'Straightforward', '不要人夸颜色好': "Don't exaggerate the color", 'हिमालय': 'Himalaya', 'सिंह': 'Lion', 'خراميشو': 'Wastewater', 'мoѕт': 'the moment', 'কোথায়': 'Where?', 'نصف': 'Half', 'رائع': 'Wonderful', 'добрый': 'kind', 'ज़र्रा': 'Zarra', 'يعقوب': 'Yacoub', 'सम्झौता': 'Agreement', 'ān': 'yes', '했다': 'did', 'ἀριστοκράτης': 'aristocrat', 'çan': 'Bell', '太阳': 'sun', 'सर': 'head', 'गुरु': 'Master', '嘉定': 'Jiading', '故乡': 'home', '安倍晋三': 'Shinzo Abe', 'मच्छर': 'Mosquito', 'सभी': 'All', '成語': 'idiom', '唯我独尊': 'Only me', 'นะค่ะ': 'Yes', 'นั่นคุณกำลังทำอะไร': 'What are you doing', 'ın': 's', 'الشامبيونزليج': 'Shamballons', '抗日神剧': 'Anti-Japanese drama', '小妹': 'Little sister', 'çok': 'very', 'लोड': 'Load', 'साहित्यिक': 'Literary', 'लाल।': 'Red.', 'فى': 'in a', '絕不放過一人」之勢': 'Never let go of one person', '国家主席': 'National president', '异议': 'objection', 'अनुस्वार': 'Anuswasar', 'តើបងប្អូនមានមធ្យបាយអ្វីខ្លះដើម្បីរក': 'What are some ways to find out', '允≱ၑご搢': '≱ ≱ ≱ 搢', '黒髪': 'Black hair', '自戳双目': 'Self-poke binocular', 'βροχέως': 'rainy', 'ని': 'The', '一带一路': 'Belt and Road', 'śliwińska': 'Śliwińska', 'κατὰ': 'against', '打古人名': 'Hit the ancient name', 'குறை': 'Low', 'üjin': 'Uji', 'öppning': 'opening', 'अन्जल': 'Anjal', '이와': 'And', 'आविष्कार': 'Invention', 'غنج': 'Gnostic', 'рассвет': 'dawn', 'होना': 'Happen', 'বাড়ি': 'Home', '入定': 'Enter', 'भेड़': 'The sheep', 'देगा': 'Will give', 'ब्रह्मा': 'Brahma', 'बीन': 'Bean', 'χράω': "I'm glad", 'तीन': 'three', 'ضرار': 'Drar', 'विराजमान': 'Seated', 'सैलाब': 'Salab', 'συνηθείας': 'communication', '구경하고': 'To see', 'चुल्ल': 'Chool', '红宝书': 'Red book', '羡慕和嫉妒是不一样的。': 'Envy and jealousy are not the same.', 'मेरा': 'my', 'कलटि': 'Katiti', 'हिमाती': 'Himalayan', 'ಸಾಧು': 'Sadhu', 'عطيتها': 'Her gift', 'छान': 'Nice', 'เกาหลี': 'Korea', 'íntruments': 'instruments', 'يتحمل': 'Bear', 'रुस्तम': 'Rustam', 'बरात': 'Baraat', 'रंग': 'colour', '나이': 'age', 'פרפר': 'butterfly', '老乡': 'Hometown', '謝謝': 'Thank you', 'í‰lbƒ': '‰ in lbƒ', 'हरजाई': 'Harjai', 'পেতে': 'Get to', '「勉強': '"study', 'रड़कना': 'To cry', '清真诊所': 'Halal clinic', 'أفضل': 'Best', 'استطيع': 'I can', 'नाकतोडा': 'Nag', 'बड़ि': 'Elder', 'वैद्य': 'Vaidya', 'أنَّ': 'that', 'いたロリィis': 'There was Lory', 'ìn': 'print', '本人堅持一個中國的原則': 'I adhere to the one-China principle', 'океан': 'ocean', 'बाद': 'after', '禮儀': 'etiquette', 'सिषेविरे': 'Sisavir', 'अमृत': 'Honeydew', 'بدو': 'run', 'ὅς': 'that is', 'छोटा': 'small', 'स्वभावप्रभवैर्गुणै': 'Nature is bad', 'ענג': 'Tight', 'שלמה': 'Complete', 'डोरे': 'Dore', '我害怕得毛骨悚然': 'I am afraid of creeps.', 'झोलि': 'Jolly', 'единадесет': 'eleven', 'என்ன': 'What', 'आयुरवेद': 'Ayurveda', '堵天下悠悠之口': 'Blocking the mouth of the world', 'الجمعي': 'Collective', 'ángel': 'Angel', 'रोना': 'crying', 'हमराज़': 'Humraj', 'चिलमन': 'Drape', 'औषध': 'Medicine', 'निया': 'Nia', 'תעליעוכופור': 'Go up and go', '中国队大胜': 'Chinese team wins', 'δ##': 'd ##', 'चला': 'walked', 'पर': 'On', 'ἔργον': 'work', 'रंक': 'Rank', 'सिक्ताओ': 'Sixtao', '陳太宗': 'Chen Taizong', 'لومڤور': 'لوموور', 'விளையாட்டு': 'Sports', '的?': 'of?', '안녕하세요': 'Hi', 'נבאים': 'Prophets', 'ন্য': 'N.', 'şimdi': 'now', 'भरना': 'Fill', 'धरी': 'Clog', '漢字': 'Chinese character', 'इसि': 'This', 'आवर': 'Hour', 'काटना': 'cutting', '僕だけがいない街': 'City where I alone does not exist', 'जूठा': 'Lier', 'çekerdik': 'We Are', 'čaj': 'tea', 'నందొ': 'Nando', '核对无误': 'Check is correct', '無限': 'unlimited', 'समेटना': 'Crimp', 'żurek': 'sour soup', 'আছে': 'There are', 'مرتك': 'Committed', 'आपाद': 'Exorcism', 'चोरी': 'theft', 'బవిష్కతి': 'Baviskati', 'निकालना': 'removal', 'सीमा': 'Limit', '配信開始日': 'Distribution start date', '宝血': 'Blood', 'ग्यारह': 'Eleven', 'गरीब': 'poor', '있고': 'There', 'île': 'island', 'स्मि': 'Smile', 'енгел': 'engel', 'вы': 'you', 'οπότε': 'so', 'सूनापन': 'Desolation', 'áraszt': 'exudes', 'मारि': 'Marie', '서로를': 'To each other', 'घोपना': 'To announce', 'फितूर': 'Fitur', 'あからさまに機嫌悪いオーラ出してんのに話しかけてくるし': 'I will speak to the out-putting aura outright bad', '封柏荣': 'Feng Bairong', '「褒められたものではない」': '"It is not a praise"', '傳統文化已經失存了': 'Traditional culture has lost', '水木清华': 'Shuimu Tsinghua', 'पन्नी': 'Foil', 'ਰਾਜਬੀਰ': 'Rajbir', '煮立て': 'Boiling', '外国的月亮比较远': 'Foreign moon is far away', 'شغف': 'passion', 'గురించి': 'About', 'övp': 'ÖVP', '实名买票制': 'Real name ticket system', 'बिखरना': 'To scatter', 'التَعَمّقِ': 'Persecution', 'जीवें': 'The living', 'गई': 'Has been', '部頭合約': 'Head contract', 'دقیانوس': 'Precise', 'फंकी': 'Funky', '粵拼': 'Cantonese fight', 'ĉiohelpanto': 'all supporter', 'मारना': 'killing', 'मानजा': 'Manja', 'अष्टांगिक': 'Octagonal', 'への': 'To', 'रहना': 'live', 'खाना': 'food', 'ಕೋಕಿಲ': 'Kokila', 'చిన్న': 'Small', '金髪': 'Blond hair', '籍贯': 'Birthplace', 'उखाड़ना': 'Extirpate', 'α2': "A'2", 'להשתלט': 'take over control', '露西亚': 'Lucia', '大有「寧可錯殺一千': 'There is a lot of "I would rather kill a thousand', 'संस्कृत': 'Sanskrit', 'христо': 'Christo', 'लगाना': 'to set', '선배님': 'Seniors', 'उड़ीसा': 'Orissa', 'ஆய்த': 'Ayta', 'तेरे': 'Your', 'आकांक्षी': 'Aspiring', 'बाज़ार': 'The market', 'हर्ज़': 'Herz', 'だします': 'Will do.', 'चक्कर': 'affair', '없어': 'no', 'चम्पक': 'Champak', 'ताल': 'rythm', 'āgamas': 'hobby', 'योद्धा': 'Warrior', 'αλυπος': 'chain', 'बेड़ा': 'Fleet', 'बात': 'talk', '做莫你酱紫的?': 'Do you make your sauce purple?', '見逃し': 'Miss', '鰹節': 'Bonito', 'तथैव': 'Fact', 'आध्यात्मिकता': 'Spirituality', '正弦': 'Sine', 'लिया': 'took', 'îndrăgire': 'love', '我願意傾聽': 'I am willing to listen', '家乡': 'hometown', '大败': 'Big defeat', 'मए': 'Ma', 'జ్ఞ\u200cా': 'Sign', 'օօ': 'oo', 'खेमे': 'Camps', 'الشوكولاه': 'Chocolate', 'полностью': 'completely', '商品発売日': 'Product Release Date', '金継ぎ': 'Gold piecing', '烤全羊是多少人民币呢?': 'How much is the renminbi roasted in the whole sheep?', 'γолемата': 'golem', '孫瀛枚': 'Grandson', 'özlüyorum': 'I am missing', 'خلبوص': 'Libs', 'アンダージー': 'Undergar', '蝴蝶': 'butterfly', 'প্রশ্ন': 'The question', 'जरूरी': 'Necessary', 'बूरा': 'Bura', '有毒': 'poisonous', 'सान्त्वना': 'Comfort', '눈치': 'tact', '\ufeffwhat': 'what', 'çeşme': 'fountain', 'ごっめんなさい': 'Please, sorry.', 'விளக்கம்': 'Description', '罗西亚': 'Rosia', 'गोटा': 'Knot', 'लेखावलिपुस्तके': 'Accounting Tips', 'सम्भोग': 'Sexual intercourse', '慢走': 'Slow walking', 'तुम': 'you', 'लीला': 'Leela', 'ฉันจะทำให้ดีที่สุด': 'I will do my best', 'चम्पु': 'Shampoo', '視聽華語': 'Audiovisual Chinese', '比比皆是': 'Abound', 'धावा': 'Run', '娘娘': 'Goddess', 'पहाड़': 'the mountain', 'राजा': 'King', '茹西亚': 'Rusia', 'ब्राह्मणक्षत्रियविशां': 'Brahmin Constellations', '수업하니까': "I'm in class.", 'చెల్లెలు': 'Sister', 'साजे': 'Made', '支那人': 'Zhina', '团表': 'Group table', '讲得': 'Speak', 'へからねでて': 'From to', 'über': 'over', 'քʀɛʋɛռt': 'kʀɛʋɛr t', 'นเรศวร': 'Naresuan', '方言': 'dialect', 'पना': 'Find out', '怕樱': 'Afraid of cherry', 'घुल': 'Dissolve', '米帝': 'Midi', 'طيب': 'Ok', 'प्रेम': 'Love', 'पढ़ाई': 'studies', '他穿褲子': 'He wears pants', 'मेरी': 'Mine', 'উত্তর': 'Reply', 'स्थिति': 'Event', '\x1b\xadü': '?\xadü', 'तैश': 'Tachsh', '写得好': 'Well written', 'मॉलूम': 'Known', '创意梦工坊': 'Creative Dream Workshop', 'आपकी': 'your', 'मिलना': 'Get', '배웠어요': 'I learned it', '許自東': 'Xu Zidong', 'जाऊँ': 'Go', 'अहिंसा': 'Nonviolence', 'джоу': 'Joe', '金繕い': 'Gold patting', '好心没好报': 'No return on a good deed', 'çevapsiz': 'unanswered', 'मिल': 'The mill', '日本語': 'Japanese', 'اخذ': 'Get', '水军': 'Water army', 'बिना': 'without', 'बनाना': 'Make', 'التوبه': 'Repentance', '一个灵运行在我的家': 'a spirit running in my home', '한국어를': 'Korean', 'कि': 'That', 'εισα': 'import', 'लगना': 'feel', 'गपोड़िया': 'Gopodiya', 'ārūpyadhātu': 'exhyadadate', 'आए': 'Returns', '吃好吃金': 'Eat delicious gold', 'นั่น': 'that', 'बढ़ाना': 'raise up', '보니': 'Bonnie', '爱着': 'Love', '선배': 'Elder', '刷屏': 'Brush screen', '人性化': 'Humanize', 'خرسانة': 'Concrete', '麻辣乾鍋': 'Spicy dry pot', '\x10œø': '? œø', 'सतरंगी': 'Satarangi', '磨合': 'Run-in', 'को': 'To', 'شهادة': 'Degree', 'একটি': 'A', 'へと': 'To', 'يلي': 'Following', '光本': 'Light source', '褒め殺し': 'Praise kill', 'आँचल': 'Anchal', 'এর': 'Of it', 'पिटा': 'Pita', 'بديش': 'Badish', 'गंगु': 'Gangu', '可是': 'but', 'की': 'Of', '谢谢。台灣同胞': 'Thank you. Taiwan compatriots', 'दम': 'power', 'मैकशि': 'Macshi', 'రాజ': 'King', '玉蘭花': 'Magnolia', '江ノ島盾子': 'Enoshima Junko', 'ѕтυpιd': 'ѕtυpιd', 'जिसका': 'Whose', 'எழுத்து': 'Letter', '甲骨文': 'Oracle', 'चूरमा': 'Churma', 'चूलें': 'Chulen', 'प्रविभक्तानि': 'Interpretation', 'いる': 'To have', 'مقال': 'article', 'पाय': 'Feet', 'अतीत': 'Past', 'ármin': 'Armin', '東夷': 'Dongyi', 'आदमकद': 'Life expectancy', 'किये': 'Done', 'पतवार': 'Helm', '楽曲派アイドル': 'Song musical idols', 'डगमगाना': 'Waver', '북한': 'North Korea', '禮記': 'Book of Rites', '西魏': 'Xi Wei', '過労死': 'Death from overwork', 'बेबुनियाद': 'Unbounded', '仙人跳': 'Fairy jump', '港女': 'Hong Kong girl', '虞朝': 'Sui Dynasty', 'µ0': 'μ0', '字母词': 'Letter word', 'अर्धांगिनि': 'Arghangini', '真功夫': 'real Kong Fu', '飯糰': 'Rice ball', 'علم': 'Science', 'गुड़': 'Jaggery', 'гречку': 'buckwheat', '我方记账数字与贵方一致': 'Our billing figures are consistent with yours', 'आने': 'To arrive', 'कब': 'When', '宋楚瑜': 'James Soong', 'διητ': 'filter', 'राई': 'Rye', 'விரதம்': 'Fasting', 'बदल': 'change', '成语': 'idiom', 'ĺj': 'junk', 'какая': 'which one', '文翰': 'Wen Han', 'کـ': 'K', 'ʀɛċօʍʍɛռɖɛɖ': 'ʀɛċ oʍʍɛрɖɛɖ', 'दिलों': 'Hearts', '星期七': 'Sunday', '傳送': 'Transfer', '陳云根': 'Chen Yunen', 'ʿalaʾ': 'Allah', 'गुल': 'Gul', 'ख़बर': 'The news', 'मन्च': 'Manch', '国家知识产权局': 'State Intellectual Property Office', '행복하게': 'happily', 'बबीता': 'Babita', 'юродивый': 'holy fool', 'सफाई': 'clean', 'вода': 'water', 'लाठि': 'End', 'γὰρ': 'γσρ', '在日朝鮮人/韓国人': 'Koreans/Koreans in Japan', '学过': 'Learned', 'जमाना': 'Solidification', 'इकोनॉमिक्स': 'Economics', 'क्या': 'what', 'ドア': 'door', '中国的存在本身就是原罪': 'The existence of China itself is the original sin', 'मौसमि': 'Season', 'ठाठ': 'Chic', 'ἡμετέραν': 'another', '火了mean': 'Fired mean', 'せえの': 'Set out', 'ஆயுத': 'Arms', 'कंपनी': 'company', 'अहम्': 'Ego', 'भर': 'Filled', 'फिरना': 'To revolve', '高山族': 'Gaoshan', '王飞飞是一个哑巴的婊子。': 'Wang Feifei is a dumb nephew.', '反清復明': 'Anti-clearing', '关门放狗': 'Close the dog', '中国民族伟大复兴?i': 'The great revival of the Chinese nation? i', '漢名': 'Han name', 'ín': 'into the', '이름은': 'name is', '뽀비엠퍼러': 'Pobi Emperor', '堅決反對台獨言論': 'Resolutely oppose Taiwan independence speech', 'ड़': 'D', 'ادب': 'Literary', '〖2x〗': '〖〗 2x', 'في': 'in a', 'परन्तप': 'Parantap', '不正常人類研究中心': 'Abnormal human research center', 'метью': 'by the way', 'σε': 'in', '罗马炮架': 'Roman gun mount', 'đỗ': 'parked', '二哈': 'Erha', '中國': 'China', 'मुझे': 'me', 'бджа': 'bzha', 'छुटाना': 'To leave', 'সহ': 'Including', 'δίδυμος': 'twin', 'दौरा': 'Tour', 'आया': 'He Came', 'ľor': 'Lor', 'شاف': 'balmy', 'افشلك': 'I miss you', 'पता': 'Address', 'śląska': 'Silesian', '我还有几个条件呢。': 'I still have a few conditions.', '元寇': 'Yuan', 'सहायक': 'Assistant', 'टाइम': 'Time', 'समुदाय': 'Community', 'टिपटिपवा': 'Tiptipava', '五毛党': 'Wu Mao Party', 'दे': 'Give', '屠城': 'Slaughter city', 'कहने': 'To say', 'कलमूहा': 'Kalamuha', 'בפנים': 'in', 'कर्माणि': 'Creation', 'अथवा': 'Or', 'રાજ્ય': 'State', 'घसोना': 'Shed', '今そう言う冗談で笑える気分じゃないから一人にしてって言ったら何があったの?話聞くよって言われたんだけど': 'I do not feel like being laughing with what I say now, so what happened when I told you to be alone? I was told by listening and talking', 'परमो': 'Paramo', 'υφηιρειτω': 'I suppose', 'כתובים': 'Written', '能夠': 'were able', '고등학교는': 'High School', 'बजरि': 'Breeze', 'जानेदिल': 'Knowingly', '大胜': 'Big win', 'काल': 'period', 'すみません': "I'm sorry", 'हिमाकत': 'Snow plant', 'þîfû': 'iphone', 'よね': 'right', 'هالنحس': 'Halting', 'الحجازية': 'Hijaz', 'жизнь': 'a life', 'किया': 'Did', '沸騰する': 'To boil', 'นะครับ': 'Yes', 'پیش': 'Before', 'परदाफास': 'Bustard', 'वफाएं': 'Affection', 'सैनानि': 'Sanani', 'ölü': 'dead', 'हैं': 'Are', '宋美齡': 'Song Meiling', 'கவாடம்': 'Valve', '我是一名来自大陆的高中生': 'I am a high school student from the mainland.', 'اجمل': 'The most beautiful', '가용': 'Available', 'चरम': 'Extreme', '象形文字': 'Hieroglyphics', '男女授受不亲': 'Men and women don’t kiss', 'моя': 'my', '疑义': 'Doubt', '管中閔': 'In the tube', 'çekseydin': 'pulls you were', 'عم': 'uncle', 'ルージュ': 'Rouge', '永遠': 'forever and always', 'بيت': 'a house', '「褒められた」': '"I was praised"', 'कर्णजित्': 'Karnajit', 'あたまわるい': 'Bad headache', 'ε0': 'e0', 'şafak': 'dawn', 'जलाओ': 'Burn', '観世音菩薩': '観世音', 'पीत': 'Yellow', 'दारी': 'Dari', 'የሚያየኝን': 'What i see', 'هاپو': 'Dog', '发声点': 'Vocal point', 'être': 'be', 'மாண்பாளன்': 'Manpalan', '감겨드려유': 'You can wrap it.', '知音': 'Bosom friend', 'एक': 'One', '\x01jhó': '? Jho', 'かんぜおんぼさつ': 'Punctuation', 'ødegaard': 'Ødegaard', '得理也要让人': 'It’s ok to make people', 'مرة': 'Once', 'दो': 'two', 'ने': 'has', '用乡村包围城市': 'Surround the city with the countryside', 'مكتب': 'Office', 'řepa': 'beet', 'день': 'day', '江青': 'Jiang Qing', 'ángeles': 'angels', 'çonstant': 'constant', 'टिपटिपुआ': 'Tip Tip', 'ஆன்லைன்': 'Online', '盧麗安': "Lu Li'an", 'самый': 'most', 'からかってくる男に': 'To the coming men', 'بعرف': 'I know', '知乎': 'Know almost', 'पहेलि': 'Puzzles', 'बहार': 'spring', 'भंगिमाँ': 'Bhangima', 'くわんぜおんぼさつ': 'Honey', '河殤': 'River', 'شو': 'Shaw', 'محمد': 'Mohammed', 'спереди': 'in front', '没毛病': 'No problem', 'árbenz': 'Arbenz', 'लार्वा': 'Larvae', 'चढ़ा': 'Ascend', 'तो': 'so', 'يلعب': 'Play', 'घिसा': 'Ginger', 'रेखागड़ित': 'Sketchy', 'मुरदा': 'Murada', 'चित्र': 'picture', 'தாக்கல்': 'Filing', '大败美国队': 'Big defeat to the US team', 'искусственный': 'artificial', 'चाल': 'Trick', 'घुटता': 'Kneeling', 'बिगड़ना': 'Deteriorate', '您这口味奇特也就罢了': 'Your taste is strange.', 'लायक': 'Worth', '大篆': 'Daxie', 'खिलाना': 'To feed', 'ماهو': 'What is the', 'पहुँचनेके': 'To reach', '外来語': 'Foreign language', 'चटका': 'Click', 'کوالا': 'Koala bear', '不过': 'but', 'पड़ना': 'Fall', 'पानी': 'Water', '저의': 'my', '一生懸命': 'Hard', 'مش': 'not', 'लिखामि': 'Written', 'गया': 'Gaya', 'компания': 'company', 'तेरि': 'Teri', '茶髪': 'Brown hair', '하다': 'Do', '不是共和国;中华人民共和国和朝鲜民主主义人民共和国是共和国': 'Not a republic; the People’s Republic of China and the Democratic People’s Republic of Korea are Republic', '烤全羊多少人民币呢?': 'How much is the price of roast whole sheep?', 'घर': 'Home', 'पाला': 'Frost', '左右手': 'Left and right hand', 'کوالالامپور': 'Kuala Lumpur', 'āsanas': 'asanas', 'ոո': 'e', 'сегодня': 'Today', '저는': 'I am', 'русофил': 'blossoming', '甘蓝': 'Cabbage', '流浪到淡水': 'Wandering to fresh water', 'единайсет': 'eleven', 'परात': 'Underwear', '\x7fhow': '?how', '¡que': 'what', 'ἀχιλλεύς': 'Achilles', 'так': 'So', 'آداب': 'Rituals', '\x10i': '?i', 'मट्ठा': 'Whey', '平天下悠悠之口': 'The mouth of the world', 'लुन्डा': 'Lunda', 'ся': 'camping', 'вареники': 'Vareniks', 'δο': 'gt;', 'öyle': 'so', 'पुरे': 'Enough', '看他不顺眼': 'Seeing him not pleasing to the eye', '「o」': '"O"', 'とする': 'To', 'হচ্ছে': 'Being', 'लाखों': 'Millions', 'α1': "A'1", 'نهائي': 'Final', 'ɾ̃': 'ɾ', 'तिलक': 'Tilak', 'لا': 'No', 'صور': 'photo', '怒怼': 'Roar', 'மௌன': 'Mauna', 'परिस्थिति': 'Situation', '서로가': 'Mutually', '山進': 'Shanjin', '蝴蝶蛋': 'Butterfly egg', 'ксш': 'ksş', '饭可以乱吃': 'Rice can be eaten', '渡します': 'I will hand it over.', 'österreich': 'Austria', 'øÿ\x13': 'øÿ?', 'ممتاز': 'Excellent', '蝴蝶卵': 'Butterfly egg', 'ही': 'Only', 'ठोकरे': 'Knock', '湿婆': 'Shiva', 'обожаю': 'love', 'लस्सी': 'Lassi', '操你妈': 'Fuck your mother', 'फौलादि': 'Fauladi', 'жизни': 'of life', 'đổi': 'change', '阻天下悠悠之口': 'Block the mouth of the world', 'オメエだよオメエと話してんのが苦痛なんだよ。シネ。': "It's omn it is painful to talk to Oume. Cine.", 'खाट': 'The cot', '译文': 'Translation', 'सँवरना': 'To embellish', 'дванадесет': 'twelve', '陳雲': 'Chen Yun', 'дванайсет': 'twelve', 'आँखें': 'Eyes', 'पूछते': 'Inquires', 'भंगी': 'Posture', 'सूना': 'Deserted', 'प्याला': 'Cup', 'には': 'To', 'доктора': 'the doctors', 'देना': 'give', 'बिरेन्र्द': 'Forget', 'गला': 'throat', 'रखा': 'Kept', 'हाले': 'Haley', '欢迎入坑': 'Welcome to the pit', 'डलि': 'Dallie', 'ôš': 'OS', 'يوسف': 'Yousuf', 'छोंकना': 'Strain', 'пп': 'pp', 'उसपर': 'on that', 'υρολογιστών': 'computers', '新年快乐!学业进步!身体健康!谢谢您们读我的翻译篇章': 'happy New Year! Academic progress! Healthy body! Thank you for reading my translation chapter.', '人民': 'people', 'घंटे': 'Hours', 'شباط': 'February', '食べる': 'eat', 'صلاح': 'Salah', '土澳': 'Tuao', '干嘛天天跟我说韩语': 'Why do you speak Korean with me every day?', 'ōnogi': 'climate', 'صاحب': 'owner', 'اكل': 'ate', '大唐': 'Datang', 'مطالعه': 'Study', '养生': 'Health', '车子': 'Car', 'कचरा': 'Garbage', 'महरबानि': 'Mehraban', 'शुष्टि': 'Shutti', 'интеллект': 'intelligence', '阮鏐': '阮镠', '鲁玥': 'Reckless', '입니다': 'is', 'ῥιζὤματα': 'Threads', 'люблю': 'love', 'ɛxɛʀċɨsɛ': 'it is not', 'कपड़े': 'dresses', 'उड़न': 'Flying', '广电总局': 'SARFT', '骂人': 'curse', 'የየየኝን': 'What do i say', 'बेलना': 'Crib', 'பல்லாக்குப்': 'Pallakkup', 'बराबर': 'equal', 'ظرف': 'Dish', 'होनोपैथिक': 'Homeopathic', '君子': 'Gentleman', '河和湖': 'Kawahata lake', '精一杯': 'Utmost', 'है': 'is', '非要以此为依据对人家批判一番': 'I have to criticize others on this basis.', 'வச்சன்': 'Vaccan', 'நான்': 'I', 'का': 'Of', '三味線': 'Shamisen', 'šwejk': 'švejk', 'дурак': 'fool', '风琴': 'organ', 'हिंसा': 'Violence', 'βιον': 'bio', 'नारी': 'Woman', '知らない': 'Do not know', 'मुँह': 'The mouth', 'अपरम्पार': 'Unperturbed', '秋季新款': 'Autumn new style', 'আমার': 'Me', 'عبقري': 'genius', 'आहें': 'Ah', 'ਨਾਮ': 'Name', 'महफ़िल': 'Mehfil', 'बटेर': 'Quail', '林彪': 'Lin Wei', 'जाने': 'Know', 'डोंगरिचाल': 'Mountain move', 'εἰρήνη': 'Irene', 'प्रतिलेखनम्': 'Transcript', 'дп': 'dp', 'उसने': 'He', '몇시간': 'how many hours', 'नैया': 'Naiya', 'ἐξήλλακτο': 'inexpensive', '彩蛋': 'Egg', 'उलटफेर': 'Reverse', '台湾最美的风景是人': 'The most beautiful scenery in Taiwan is people.', '馄饨': 'ravioli', 'सदके': 'Shake', '饺子': 'Dumplings', 'भूमिका': 'role', '为什么说': 'Why do you say', '요즘': 'Nowadays', 'गिरि': 'Giri', '中國話': 'Chinese words', 'द्रोहि': 'Drohi', '中庸之道': 'The doctrine of the mean', 'хочу': 'want', 'ḵarasāna': 'ḵarasana', '走gǒ': 'Go gǒ', 'जमाल': 'Jamal', 'मन': 'The mind', 'तेलि': 'Oilseed', '非诚勿扰': 'You Are the One', 'атом': 'atom', '中华民国和大韩民国是民国': 'The Republic of China and the Republic of Korea are the Republic of China', 'नलि': 'Nile', 'हाथ': 'hand', 'खाजला': 'Itching', 'ōe': 'yes', '것이다': 'will be', 'şoųl': 'şoùl', 'तश्तरि': 'Cleverness', 'χρῆσιν': 'use', '民族': 'Nationality', 'الرياضيات': 'Mathematics', 'にじゅうさい': 'Twelve months', 'ὠς': 'as', '恋に落ちないからよく悲しい': 'It is often sad because it does not fall in love', 'ਸ਼ੀਂਹ': 'Lion', '黎氏玉英': 'Li Shiyuying', 'فبراير': 'February', '白濮': 'Chalk', 'অধীনে': 'Under', 'प्रशंसा': 'appreciation', 'ขอพระเจ้าอยู่ด้วย': 'May God be with you', 'अडला': 'Bent', 'ده': 'Ten', 'पसीजना': 'Exudate', 'कोई': 'someone', 'கருக்குமட்டை': 'Karukkumattai', 'कन्नी': 'Kanni', 'यात्रा': 'journey', '白酒': 'Liquor', 'τὴν': 't', 'करना': 'do', 'उल': 'ul', '俄罗斯': 'Russia', 'உனக்கு': 'You', 'जौहर': 'Johar', 'ಸ್ವರಕ್ಷರಗಳು': 'Self-defense', '论语': 'Analects', 'šakotis': 'branching', '儿臣惶恐': 'Childier fear', '讲的?': 'Said?', 'প্রতিনিয়ত': 'Every day', 'சன்னல்': 'Sill', '蠢的像猪一样': 'Stupid like a pig', 'رح': 'Please', 'بس': 'Yes', 'εὔιδον': 'you see', 'सदबुद्धि': 'Good sense', 'भगवान': 'God', '사랑해': 'I love you', 'בתוך': 'Inside', 'čeferin': 'чеферин', '民国': 'Republic of China', '但是': 'but', 'सकता': 'can', 'घाट': 'Wharf', 'čechy': 'Bohemia', '抹黑': 'Smear', 'γλαυκῶπις': 'greyhounds', 'నీకెందుకు': 'Nikenduku', 'चमत्कार': 'Miracle', 'दुनिया': 'world', 'یہاں': 'Here', 'اللي': 'Elly', 'খামার': 'The farm', '一呼百诺': 'One call', 'വിഡ്ഢി': 'Stupid', 'दिल': 'heart', 'тeenage': 'trinage', '皇上': 'emperor', 'टटोलना': 'Grope', '犬子': 'Dog', '我希望有一天你沒有公王病': 'I hope that one day you don’t have a king', '并无分裂中国的意图': 'No intention to split China', 'óscar': 'oscar', 'ኤልሮኢ': 'Alright', 'ŷhat': 'hhat', '천사': 'Angel', 'दी।': 'Given', 'रड़क': 'Raze', 'कानी': 'Kani', '江之島盾子': 'Enokima Shiko', '老生常谈': 'Old talk', 'των': 'of', '星期日': 'on Sunday', 'पैन्डा': 'Panda', 'マリも仲直りしました': 'Mari also made up.', 'ты': 'you', 'देश': 'Country', 'ठंडक': 'Coolness', 'নামল': 'Get down', 'जर्मनी': 'Germany', 'шли': 'walked', 'たべる': 'To eat', 'लिए': 'for', '鸡汤文': 'Chicken soup', 'มวยไทย': 'Thai boxing', '簡訊': 'Newsletter', 'منزل': 'Home', 'कर': 'Tax', '生女眞': 'Daughter-in-law', 'ек': 'ek', 'સંઘ': 'Union', '\u200bsalarpuria': 'Salphary', 'ţara': 'the country', 'नही': 'No', 'मगज': 'Mercury', 'অক্ষয়': 'Akshay', 'حال': 'Now', 'चिन्दी': 'Chindee', 'τῆς': 'her', '福哒柄': 'Good fortune', '청하': 'Qinghai', '越人': 'Yueren', 'なかなかに謎だな': "It's quite a mystery.", 'रखना': 'keep', 'பரை': 'Parai', 'करके': 'By doing', 'فِي': 'F', 'गुथना': 'Knit', '话不可以乱讲': "Can't talk nonsense", 'वैकल्पिक': 'Alternative', 'ਨਾਮੁ': 'Name', '你别高兴得太早': "Don't be too happy too early", '煎餅': 'Pancake', '한다': 'do', 'सबब': 'Cause', 'বিষয়টি': 'Matter', 'कोसना': 'To crack', 'ㅜㅜ': 'ㅜ', 'অক্সয়': 'Akshay', 'الدوالي': 'Varicose veins', 'பயிர்ப்பு': 'Yields', 'अजा': 'SC', 'あの色々': 'That kind of variety', 'емеля': 'emel', 'मेवा': 'Meva', 'जलवाफरोज़': 'Jalwa Phoroz', '中庸': 'Moderate', 'उसके': 'his', 'अहम': 'Important', 'वहम': 'Vanity', 'ís': 'ice', 'कलात्मक': 'Artistic', 'ἀχιλῆος': 'Achilles', '民族罪人': 'National sinner', 'मुकाम': 'Peer', 'ão': 'to', '한국': 'Korea', 'ادهر': 'Idir', '一長': 'One long', 'てくれませんか': 'Would you please', 'ブレイク': 'break', 'शहनाई': 'the clarinet', 'तीरन्दाज़ि': 'Arrows', 'रूढ़ीवादि': 'Conservative', 'झाँकी': 'Peeping', 'सत्यवादी': 'Truthful', '郑琳': 'Zheng Lin', 'युनानी': 'Unani', 'φώνας': 'light', 'जमना': 'Solidify', '〖plg〗': '〖Plg〗', 'हरी': 'Green', 'بطولة': 'championship', '这些词怎么读?这些词怎么说?这些词怎么念?which': 'How do you read these words? What do these words say? How do you read these words? Which', 'алтерман': 'alterman', 'اَلبَحْثِ': 'البحث', 'موجود': 'Available', 'कश्ति': 'Power', 'اسم': 'Noun', 'लाज़मि': 'Shameful', 'तुमने': 'you', '공화국': 'republic', 'लुक्का': 'Lukka', '史记': 'Historical record', '포경수술': 'Circumcised', '高端大气上档次into': 'High-end atmospheric grade into', 'что': 'what', 'योध': 'Rift', 'धर्म': 'religion', 'दरदर': 'Tariff', '訓読み': 'Kun Readings', 'впереди': 'in front', '민국': 'Republic of Korea', 'εντσα': 'in', '我搜了这本小说': 'I searched this novel.', '杨皎滢?': 'Yang Wei?', 'भारतीयों': 'Indians', '巴蜀': 'Bayu', '\x02tñ\x7f': '? tñ?', 'αβtαβ': 'aba', 'जल': 'water', 'बाध्य': 'Bound', 'মহাবিশ্ব': 'Universe', 'প্রসারিত': 'Stretch', 'अन्जाम': 'Anjaam', 'जीतना': 'win', 'कड़ा': 'Hard', '刁民': 'Untouchable', 'ขอให้พระเจ้าอยู่ด้วย': 'May God live too.', '油腻': 'Greasy', 'ᗯoᗰeᑎ': 'ᗯoᗰe ᑎ', 'להתראות': 'Goodbye', 'वाले': 'Ones', 'አየሁ': 'I saw', 'ओर': 'And', 'ずand': 'Without', 'निगोड़ा': 'Nigoda', 'эй': 'Hey'}
lm=nltk.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon'}
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
"could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
"don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
"he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
"how's": "how is", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
"i'm": "i am", "i've": "i have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
"i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
"it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is",
"let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
"mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
"needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
"oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
"she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
"she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
"so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have",
"that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is",
"here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will",
"they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
"wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
"we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
"what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is",
"what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
"where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
"who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
"won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
"wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
"y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would",
"you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are",
"you've": "you have" }
nts={' b\\w ':' between ',' & ':' and ',' e.g. ':' example ',' e.g ':' example ',' eg ':' example ',
'gimme':'give me','^':' to the power of ',"it'll've": "it will have","it's": "it is", "let's": "let us",
"ma'am": "madam", "mayn't": "may not",
"might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
"mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
"o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
"sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
"she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not",
"shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would",
"that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have",
"there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will",
"they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not",
"we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
"what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
"where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
"who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
"won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
"y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
"y'all've": "you all have","you'd": "you would","you'd've": "you would have", "you'll": "you will",
"you'll've": "you will have", "you're": "you are", "you've": "you have" }
# In[4]:
#functions for preprocessing!
def extras(text): #text = string form
for word, initial in nts.items():
text = text.replace(word.lower(), initial)
for word, initial in contraction_mapping.items(): #contractions included
text = text.replace(word.lower(), initial)
return text
def remove_punctuation(text): #here text is in string form
new_text="".join([ch for ch in text if ch not in string.punctuation])
return new_text
def languages(text): #text = string form
for word, initial in words_to_english_map.items():
text = text.replace(word, initial)
return text
def tokenize(text): #text in string form
new_text=re.split('\W+',text)
return new_text
def lemmatizing(text): #text = tokenized
new_text=[lm.lemmatize(word) for word in text]
return new_text
def rem_stopwords(text): #text = tokenized
new_text=[word for word in text if word not in stopwords ]
return new_text
def correct_spelling(text): #text = tokenized
for word, initial in mispell_dict.items():
text = text.replace(word, initial)
return text
def contractions(text): #text = list form, returns in string form
new_text=""
for word in text:
if word in contraction_mapping:
new_text=new_text+contraction_mapping[word]+" "
else:
new_text=new_text+word+" "
return new_text
def listtostring(text): # text= list form
mystr=' '.join([str(elem) for elem in text])
return mystr
def rem_single_char(text):
text=' '.join( [w for w in text.split() if len(w)>1] )
return text
# # Visualizing our output from various preprocessing steps
# ### From the output we can see how our input text can been processed and changed to a great degree so that our model gives better results
# In[5]:
#checking for correct order of preprocessing
text="what were lyanna stark’s and elia martell’s they'd 隶书 titles after they wed prince rhaegar?"
text=text.lower()
text=extras(text)
text=correct_spelling(text)
text=languages(text)
text=text.lower()
text=text.encode("ascii","replace")
text=text.decode()
text=tokenize(text)
text=lemmatizing(text)
text=listtostring(text)
text=rem_single_char(text)
text=remove_punctuation(text)
text
# In[6]:
trainset['question_text']=trainset['question_text'].str.lower()
testset['question_text']=testset['question_text'].str.lower()
# In[7]:
trainset['question_text']=trainset['question_text'].apply(lambda x:extras(x))
testset['question_text']=testset['question_text'].apply(lambda x:extras(x))
# In[8]:
trainset['question_text']=trainset['question_text'].apply(lambda x:correct_spelling(x))
testset['question_text']=testset['question_text'].apply(lambda x:correct_spelling(x))
# In[9]:
trainset['question_text']=trainset['question_text'].apply(lambda x:languages(x))
testset['question_text']=testset['question_text'].apply(lambda x:languages(x))
# In[10]:
trainset['question_text']=trainset['question_text'].apply(lambda x: x.encode("ascii","replace").decode())
testset['question_text']=testset['question_text'].apply(lambda x:x.encode("ascii","replace").decode())
# In[11]:
trainset['question_text']=trainset['question_text'].apply(lambda x:tokenize(x))
testset['question_text']=testset['question_text'].apply(lambda x:tokenize(x))
# In[12]:
trainset['question_text']=trainset['question_text'].apply(lambda x:lemmatizing(x))
testset['question_text']=testset['question_text'].apply(lambda x:lemmatizing(x))
# In[13]:
trainset['question_text']=trainset['question_text'].apply(lambda x:listtostring(x))
testset['question_text']=testset['question_text'].apply(lambda x:listtostring(x))
# In[14]:
trainset['question_text']=trainset['question_text'].apply(lambda x:rem_single_char(x))
testset['question_text']=testset['question_text'].apply(lambda x:rem_single_char(x))
# ### Creating the train and test arrays
# In[15]:
X_train=trainset.iloc[:,[1]].values
X_test=testset.iloc[:,[1]].values
y_train=trainset.iloc[:,[-1]].values
y_test=testset.iloc[:,[-1]].values
# In[16]:
X_train=X_train.flatten()
X_test=X_test.flatten()
X_train
# ## Count Vectorizer with ngram range (1,4)
# In[17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,4))
vectorizer.fit(X_train)
X_trainc=vectorizer.transform(X_train)
X_testc=vectorizer.transform(X_test)
# ## TFIDF Vectorizer
# In[18]:
from sklearn.feature_extraction.text import TfidfVectorizer #token_pattern=r'\w{1,}',stop_words='english',
vectorizer2 = TfidfVectorizer(
sublinear_tf=True,
strip_accents='unicode',
analyzer='char_wb',
ngram_range=(1, 4),
max_features=50000)
vectorizer2.fit(X_train)
vectorizer2.fit(X_test)
X_traint=vectorizer2.transform(X_train)
X_testt=vectorizer2.transform(X_test)
# ## LOGISTIC REGRESSION for out Count Vectorizer output
# This Logistic Regression classifier is specifically for our Count Vectorizer output matrix
# In[19]:
from sklearn.linear_model import LogisticRegression
lrc=LogisticRegression(solver='liblinear',penalty='l2')
lrc.fit(X_trainc,y_train)
# ## LOGISTIC REGRESSION for out TFIDF Vectorizer output
# This Logistic Regression classifier is specifically for our TFIDF Vectorizer output matrix
# In[20]:
from sklearn.linear_model import LogisticRegression
lrt=LogisticRegression()
lrt.fit(X_traint,y_train)
# ### Getting our predicted probabilities with both our models
# In[21]:
Yc=lrc.predict_proba(X_testc)
Yc=Yc[:,1]
# In[22]:
Yt=lrt.predict_proba(X_testt)
Yt=Yt[:,1]
# ### Combining the two model's predicted probabilities in 3:7 ratio ( 3 parts for TFIDF model + 7 parts for Count Vectorizer model)
# In[33]:
Y=0.3*Yt+0.7*Yc
# ## Getting our final classified output setting threshold as 0.19
# This threshold ensures that our final output is marked as 1 if the predicted probability of 1 is anything above 0.19
# In[34]:
pred_testY= (Y>0.19).astype(np.int)
pred_testY
# In[30]:
X_submission=testset.iloc[:,0:1]
# In[36]:
submission = pd.DataFrame({'qid':X_submission['qid'], 'target':pred_testY})
print(submission)
# ### Writing our final output to a CSV file
# In[37]:
submission.to_csv('.\ensemblingbest.csv', index=False)