Merge pull request #101 from mtybadger/main

Spruce Campbell Will Hathaway Cross-Lingual Fine-Tuning for Multilingual Text Embeddings
deep-learning-mit · Dec 16, 2023 · 0fe85c0 · 0fe85c0
2 parents cf791d1 + d705cff
commit 0fe85c0
Show file tree

Hide file tree

Showing 19 changed files with 772 additions and 0 deletions.
diff --git a/_posts/2023-11-09-multilingual-representations-in-embeddings-models.md b/_posts/2023-11-09-multilingual-representations-in-embeddings-models.md
diff --git a/assets/bibliography/2023-11-09-multilingual-representations-in-embeddings-models.bib b/assets/bibliography/2023-11-09-multilingual-representations-in-embeddings-models.bib
@@ -0,0 +1,323 @@
+@article{rag,
+  title      = {Retrieval-Augmented Generation for Knowledge-Intensive {NLP} Tasks},
+  author     = {Patrick S. H. Lewis and
+                Ethan Perez and
+                Aleksandra Piktus and
+                Fabio Petroni and
+                Vladimir Karpukhin and
+                Naman Goyal and
+                Heinrich K{\"{u}}ttler and
+                Mike Lewis and
+                Wen{-}tau Yih and
+                Tim Rockt{\"{a}}schel and
+                Sebastian Riedel and
+                Douwe Kiela},
+  journal    = {CoRR},
+  volume     = {abs/2005.11401},
+  year       = {2020},
+  eprinttype = {arXiv},
+  eprint     = {2005.11401},
+  url        = {https://arxiv.org/abs/2005.11401},
+  timestamp  = {Fri, 29 May 2020 09:57:22 +0200},
+  biburl     = {https://dblp.org/rec/journals/corr/abs-2005-11401.bib},
+  bibsource  = {dblp computer science bibliography, https://dblp.org}
+}
+
+@misc{mteb,
+  author        = {Niklas Muennighoff and Nouamane Tazi and Loïc Magne and Nils Reimers},
+  title         = {{MTEB}: Massive Text Embedding Benchmark},
+  year          = {2023},
+  eprint        = {2210.07316},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{sbert,
+  author        = {Nils Reimers and Iryna Gurevych},
+  title         = {{Sentence-BERT}: Sentence Embeddings using {Siamese} {BERT-Networks}},
+  year          = {2019},
+  eprint        = {1908.10084},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{attn,
+  author        = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
+  title         = {Attention Is All You Need},
+  year          = {2023},
+  archivePrefix = {arXiv},
+  eprint        = {1706.03762},
+  primaryClass  = {cs.CL}
+}
+
+@article{nmt,
+  author   = {Haifeng Wang and Hua Wu and Zhongjun He and Liang Huang and Kenneth Ward Church},
+  title    = {Progress in Machine Translation},
+  journal  = {Engineering},
+  volume   = {18},
+  pages    = {143--153},
+  year     = {2022},
+  issn     = {2095-8099},
+  doi      = {10.1016/j.eng.2021.03.023},
+  url      = {https://www.sciencedirect.com/science/article/pii/S2095809921002745},
+  keywords = {Machine translation, Neural machine translation, Simultaneous translation},
+  abstract = {After more than 70 years of evolution, great achievements have been made in machine translation. Especially in recent years, translation quality has been greatly improved with the emergence of neural machine translation (NMT). In this article, we first review the history of machine translation from rule-based machine translation to example-based machine translation and statistical machine translation. We then introduce NMT in more detail, including the basic framework and the current dominant framework, Transformer, as well as multilingual translation models to deal with the data sparseness problem. In addition, we introduce cutting-edge simultaneous translation methods that achieve a balance between translation quality and latency. We then describe various products and applications of machine translation. At the end of this article, we briefly discuss challenges and future research directions in this field.}
+}
+
+@techreport{gpt1,
+  author      = {Alec Radford and Karthik Narasimhan and Tim Salimans and Ilya Sutskever},
+  title       = {Improving Language Understanding by Generative Pre-Training},
+  institution = {OpenAI},
+  year        = {2018},
+  url         = {https://api.semanticscholar.org/CorpusID:49313245}
+}
+
+@techreport{gpt2,
+  author      = {Alec Radford and Jeff Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever},
+  title       = {Language Models are Unsupervised Multitask Learners},
+  institution = {OpenAI},
+  year        = {2019},
+  url         = {https://api.semanticscholar.org/CorpusID:160025533}
+}
+
+@misc{rmae,
+  author        = {Shitao Xiao and Zheng Liu and Yingxia Shao and Zhao Cao},
+  title         = {{RetroMAE}: Pre-Training Retrieval-oriented Language Models Via Masked Auto-Encoder},
+  year          = {2022},
+  eprint        = {2205.12035},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{w2v,
+  author        = {Tomas Mikolov and Kai Chen and Greg Corrado and Jeffrey Dean},
+  title         = {Efficient Estimation of Word Representations in Vector Space},
+  year          = {2013},
+  archivePrefix = {arXiv},
+  eprint        = {1301.3781},
+  primaryClass  = {cs.CL}
+}
+
+@misc{sent,
+  author       = {Carremans, Bert},
+  title        = {Word embeddings for sentiment analysis},
+  howpublished = {Towards Data Science (Medium)},
+  url          = {https://towardsdatascience.com/word-embeddings-for-sentiment-analysis-65f42ea5d26e}
+}
+
+@misc{sgpt,
+  author        = {Niklas Muennighoff},
+  title         = {{SGPT}: {GPT} Sentence Embeddings for Semantic Search},
+  year          = {2022},
+  eprint        = {2202.08904},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{lyst,
+  author        = {Maciej Kula},
+  title         = {Metadata Embeddings for User and Item Cold-start Recommendations},
+  year          = {2015},
+  archivePrefix = {arXiv},
+  eprint        = {1507.08439},
+  primaryClass  = {cs.IR}
+}
+
+@inproceedings{kmeans,
+  author    = {J. MacQueen},
+  title     = {Some methods for classification and analysis of multivariate observations},
+  booktitle = {Proceedings of the Fifth {Berkeley} Symposium on Mathematical Statistics and Probability, Volume 1: Statistics},
+  pages     = {281--297},
+  publisher = {University of California Press},
+  year      = {1967},
+  url       = {https://api.semanticscholar.org/CorpusID:6278891}
+}
+
+@misc{rerank,
+  author        = {Rodrigo Nogueira and Kyunghyun Cho},
+  title         = {Passage Re-ranking with {BERT}},
+  year          = {2020},
+  eprint        = {1901.04085},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.IR}
+}
+
+@misc{beir,
+  author        = {Nandan Thakur and Nils Reimers and Andreas Rücklé and Abhishek Srivastava and Iryna Gurevych},
+  title         = {{BEIR}: A Heterogenous Benchmark for Zero-shot Evaluation of Information Retrieval Models},
+  year          = {2021},
+  eprint        = {2104.08663},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.IR}
+}
+
+@article{bm25,
+  author     = {Robertson, Stephen and Zaragoza, Hugo},
+  title      = {The Probabilistic Relevance Framework: {BM25} and Beyond},
+  journal    = {Foundations and Trends in Information Retrieval},
+  volume     = {3},
+  number     = {4},
+  pages      = {333--389},
+  numpages   = {57},
+  month      = apr,
+  year       = {2009},
+  issue_date = {April 2009},
+  publisher  = {Now Publishers Inc.},
+  address    = {Hanover, MA, USA},
+  issn       = {1554-0669},
+  doi        = {10.1561/1500000019},
+  url        = {https://doi.org/10.1561/1500000019},
+  abstract   = {The Probabilistic Relevance Framework (PRF) is a formal framework for document retrieval, grounded in work done in the 1970--1980s, which led to the development of one of the most successful text-retrieval algorithms, BM25. In recent years, research in the PRF has yielded new retrieval models capable of taking into account document meta-data (especially structure and link-graph information). Again, this has led to one of the most successful Web-search and corporate-search algorithms, BM25F. This work presents the PRF from a conceptual point of view, describing the probabilistic modelling assumptions behind the framework and the different ranking algorithms that result from its application: the binary independence model, relevance feedback models, BM25 and BM25F. It also discusses the relation between the PRF and other statistical models for IR, and covers some related topics, such as the use of non-textual features, and parameter optimisation for models with free parameters.}
+}
+
+@inproceedings{glove,
+  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
+  title     = {{GloVe}: Global Vectors for Word Representation},
+  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
+  pages     = {1532--1543},
+  month     = oct,
+  year      = {2014},
+  doi       = {10.3115/v1/D14-1162}
+}
+
+@misc{mssg,
+  author        = {Arvind Neelakantan and Jeevan Shankar and Alexandre Passos and Andrew McCallum},
+  title         = {Efficient Non-parametric Estimation of Multiple Embeddings per Word in Vector Space},
+  year          = {2015},
+  archivePrefix = {arXiv},
+  eprint        = {1504.06654},
+  primaryClass  = {cs.CL}
+}
+
+@misc{bert,
+  author        = {Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
+  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
+  year          = {2019},
+  eprint        = {1810.04805},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{berthater,
+  author        = {Hyunjin Choi and Judong Kim and Seongho Joe and Youngjune Gwon},
+  title         = {Evaluation of {BERT} and {ALBERT} Sentence Embedding Performance on Downstream {NLP} Tasks},
+  year          = {2021},
+  eprint        = {2101.10642},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{roberta,
+  author        = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
+  title         = {{RoBERTa}: A Robustly Optimized {BERT} Pretraining Approach},
+  year          = {2019},
+  eprint        = {1907.11692},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{gte,
+  author        = {Zehan Li and Xin Zhang and Yanzhao Zhang and Dingkun Long and Pengjun Xie and Meishan Zhang},
+  title         = {Towards General Text Embeddings with Multi-stage Contrastive Learning},
+  year          = {2023},
+  archivePrefix = {arXiv},
+  eprint        = {2308.03281},
+  primaryClass  = {cs.CL}
+}
+
+@misc{e5,
+  author        = {Liang Wang and Nan Yang and Xiaolong Huang and Binxing Jiao and Linjun Yang and Daxin Jiang and Rangan Majumder and Furu Wei},
+  title         = {Text Embeddings by Weakly-Supervised Contrastive Pre-training},
+  year          = {2022},
+  archivePrefix = {arXiv},
+  eprint        = {2212.03533},
+  primaryClass  = {cs.CL}
+}
+
+@misc{bge,
+  author        = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
+  title         = {{C-Pack}: Packaged Resources To Advance General {Chinese} Embedding},
+  year          = {2023},
+  eprint        = {2309.07597},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{st5,
+  author        = {Jianmo Ni and Hernández Ábrego, Gustavo and Noah Constant and Ji Ma and Keith B. Hall and Daniel Cer and Yinfei Yang},
+  title         = {{Sentence-T5}: Scalable Sentence Encoders from Pre-trained Text-to-Text Models},
+  year          = {2021},
+  eprint        = {2108.08877},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{chinchilla,
+  author        = {Jordan Hoffmann and Sebastian Borgeaud and Arthur Mensch and Elena Buchatskaya and Trevor Cai and Eliza Rutherford and Diego de Las Casas and Lisa Anne Hendricks and Johannes Welbl and Aidan Clark and Tom Hennigan and Eric Noland and Katie Millican and George van den Driessche and Bogdan Damoc and Aurelia Guy and Simon Osindero and Karen Simonyan and Erich Elsen and Jack W. Rae and Oriol Vinyals and Laurent Sifre},
+  title         = {Training Compute-Optimal Large Language Models},
+  year          = {2022},
+  archivePrefix = {arXiv},
+  eprint        = {2203.15556},
+  primaryClass  = {cs.CL}
+}
+
+@misc{persimmon,
+  title  = {Releasing {Persimmon-8B}},
+  author = {Elsen, Erich and Odena, Augustus and Nye, Maxwell and Taşırlar, Sağnak and Dao, Tri and Hawthorne, Curtis and Moparthi, Deepak and Somani, Arushi},
+  year   = {2023},
+  url    = {https://www.adept.ai/blog/persimmon-8b}
+}
+
+@misc{gnmt,
+  author        = {Melvin Johnson and Mike Schuster and Quoc V. Le and Maxim Krikun and Yonghui Wu and Zhifeng Chen and Nikhil Thorat and Fernanda Viégas and Martin Wattenberg and Greg Corrado and Macduff Hughes and Jeffrey Dean},
+  title         = {Google's Multilingual Neural Machine Translation System: Enabling Zero-Shot Translation},
+  year          = {2017},
+  archivePrefix = {arXiv},
+  eprint        = {1611.04558},
+  primaryClass  = {cs.CL}
+}
+
+@misc{gtr,
+  author        = {Jianmo Ni and Chen Qu and Jing Lu and Zhuyun Dai and Hernández Ábrego, Gustavo and Ji Ma and Vincent Y. Zhao and Yi Luan and Keith B. Hall and Ming-Wei Chang and Yinfei Yang},
+  title         = {Large Dual Encoders Are Generalizable Retrievers},
+  year          = {2021},
+  eprint        = {2112.07899},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.IR}
+}
+
+@misc{msmarco,
+  author        = {Payal Bajaj and Daniel Campos and Nick Craswell and Li Deng and Jianfeng Gao and Xiaodong Liu and Rangan Majumder and Andrew McNamara and Bhaskar Mitra and Tri Nguyen and Mir Rosenberg and Xia Song and Alina Stoica and Saurabh Tiwary and Tong Wang},
+  title         = {{MS MARCO}: A Human Generated {MAchine Reading COmprehension} Dataset},
+  year          = {2018},
+  eprint        = {1611.09268},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{fever,
+  author        = {James Thorne and Andreas Vlachos and Christos Christodoulopoulos and Arpit Mittal},
+  title         = {{FEVER}: a large-scale dataset for {Fact Extraction and VERification}},
+  year          = {2018},
+  eprint        = {1803.05355},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{mt5,
+  author        = {Linting Xue and Noah Constant and Adam Roberts and Mihir Kale and Rami Al-Rfou and Aditya Siddhant and Aditya Barua and Colin Raffel},
+  title         = {{mT5}: A massively multilingual pre-trained text-to-text transformer},
+  year          = {2021},
+  eprint        = {2010.11934},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{nllb,
+  author        = {{NLLB Team} and Marta R. Costa-jussà and James Cross and Onur Çelebi and Maha Elbayad and Kenneth Heafield and Kevin Heffernan and Elahe Kalbassi and Janice Lam and others},
+  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
+  year          = {2022},
+  eprint        = {2207.04672},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+
+@misc{infonce,
+  author        = {Aaron van den Oord and Yazhe Li and Oriol Vinyals},
+  title         = {Representation Learning with Contrastive Predictive Coding},
+  year          = {2019},
+  archivePrefix = {arXiv},
+  eprint        = {1807.03748},
+  primaryClass  = {cs.LG}
+}
diff --git a/assets/html/2023-11-09-multilingual-representations-in-embeddings-models/special_demo.html b/assets/html/2023-11-09-multilingual-representations-in-embeddings-models/special_demo.html
diff --git a/assets/html/2023-11-09-multilingual-representations-in-embeddings-models/word2vec_demo.html b/assets/html/2023-11-09-multilingual-representations-in-embeddings-models/word2vec_demo.html
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/base.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/base.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/bert.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/bert.png
diff --git a/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/english.jpg b/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/english.jpg
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/langs.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/langs.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/mlm.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/mlm.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/mteb.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/mteb.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/multi.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/multi.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/nsp.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/nsp.png
diff --git a/...g/2023-11-09-multilingual-representations-in-embeddings-models/openai_embed.png b/...g/2023-11-09-multilingual-representations-in-embeddings-models/openai_embed.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/sbert.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/sbert.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/scale.png b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/scale.png
diff --git a/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/scaling.png b/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/scaling.png
diff --git a/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/vs_base.png b/...ts/img/2023-11-09-multilingual-representations-in-embeddings-models/vs_base.png
diff --git a/...s/img/2023-11-09-multilingual-representations-in-embeddings-models/vs_multi.png b/...s/img/2023-11-09-multilingual-representations-in-embeddings-models/vs_multi.png
diff --git a/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/words.jpg b/assets/img/2023-11-09-multilingual-representations-in-embeddings-models/words.jpg