# lexsemtm.py

"""
Created by: Andrew Bennett
Last updated: July, 2016

Provides class for accessing LexSemTM
"""

import csv
import json
import os
import subprocess

# Module-level debug switch. Nothing in the visible part of this file reads
# it -- NOTE(review): confirm whether other modules import it before removing.
DEBUG = True

def get_reader(lexsemtm_path):
    """
    Build a reader for the LexSemTM resource stored at the given location

    This is a convenience factory; it simply instantiates LexSemTMReader
    and performs no I/O itself.

    :param lexsemtm_path: directory containing LexSemTM data and index files
    :return: LexSemTMReader object bound to that directory
    """
    reader = LexSemTMReader(lexsemtm_dir=lexsemtm_path)
    return reader


class LexSemTMReader:
    """
    Class for accessing LexSemTM

    Lemma metadata, vocabulary indices and sense distributions are loaded
    lazily on first request and cached on the instance. Lemma metadata and
    sense distributions are keyed by the tuple (lang, which_version);
    vocabularies are keyed by lang only. Topic models are not cached: each
    request extracts the model from the data archive using the external
    ``tar`` and ``gunzip`` commands.
    """
    def __init__(self, lexsemtm_dir):
        """
        :param lexsemtm_dir: directory containing LexSemTM data and index files
        """
        self.lexsemtm_dir = lexsemtm_dir
        # caches keyed by (lang, which_version)
        self.all_lemmas = {}
        self.all_lemma_indices = {}
        self.all_lemma_freqs = {}
        self.all_sense_dists = {}
        # kept for backward compatibility; not populated by this class
        self.all_topic_models = {}
        # cache keyed by lang
        self.vocab_lists = {}

    def get_lemma_names(self, lang="en", which_version="s"):
        """
        Obtain list of lemma names in LexSemTM

        :param lang: which language to obtain lemmas from (default: "en")
        :param which_version: which version of LexSemTM to obtain lemmas from
            (default: "s")
        :return: list of lemma names, in the order they appear in the lemma
            info file
        """
        key = (lang, which_version)
        if key not in self.all_lemmas:
            self._load_lemma_info_file(lang, which_version)
        return self.all_lemmas[key]

    def get_lemma_freq(self, lemma, lang="en", which_version="s"):
        """
        Obtain number of usages of given lemma used to train LexSemTM topic
        model (i.e. frequency of lemma in LexSemTM)

        :param lemma: lemma to obtain frequency of
        :param lang: which language lemma belongs to (default: "en")
        :param which_version: which version of LexSemTM lemma belongs to
            (default: "s")
        :return: frequency of lemma
        :raises KeyError: if lemma is not listed in the lemma info file
        """
        key = (lang, which_version)
        if key not in self.all_lemma_freqs:
            self._load_lemma_info_file(lang, which_version)
        return self.all_lemma_freqs[key][lemma]

    def get_sense_dist(self, lemma, lang="en", which_version="s"):
        """
        Obtain LexSemTM sense distribution for given lemma

        :param lemma: lemma to obtain sense distribution of
        :param lang: which language lemma belongs to (default: "en")
        :param which_version: which version of LexSemTM lemma belongs to
            (default: "s")
        :return: sense distribution (of type dict, mapping sense name
            to probability)
        :raises KeyError: if no sense distribution was found for lemma
        """
        key = (lang, which_version)
        if key not in self.all_sense_dists:
            self._load_all_sense_dists(lang, which_version)
        return self.all_sense_dists[key][lemma]

    def get_topic_model(self, lemma, lang="en", which_version="s"):
        """
        Obtain LexSemTM topic model for given lemma

        :param lemma: lemma to obtain topic model of
        :param lang: which language lemma belongs to (default: "en")
        :param which_version: which version of LexSemTM lemma belongs to
            (default: "s")
        :return: topic model output, consisting of dict containing
            doc-topic counts and topic-word counts; if the extracted data
            cannot be parsed as JSON, the raw extracted output is returned
            unchanged instead
        :raises KeyError: if lemma is not listed in the lemma info file
        """
        # extract topic model raw string from archive
        key = (lang, which_version)
        if key not in self.all_lemma_indices:
            self._load_lemma_info_file(lang, which_version)
        lemma_id = self.all_lemma_indices[key][lemma]
        tm_fname = "%s.%s.%08d.tm.json.gz" % (lang, which_version, lemma_id)
        tar_path = os.path.join(self.lexsemtm_dir, "%s.%s.data.tar" % key)
        extract_cmd_1 = ["tar", "-xOf", tar_path, tm_fname]
        extract_cmd_2 = ["gunzip"]
        p1 = subprocess.Popen(extract_cmd_1, stdout=subprocess.PIPE)
        p2 = subprocess.Popen(extract_cmd_2, stdout=subprocess.PIPE,
                              stdin=p1.stdout)
        # drop the parent's copy of the pipe so that gunzip is the only
        # reader and tar is notified if gunzip exits early
        p1.stdout.close()
        tm_json_str = p2.communicate()[0]
        # reap tar as well so that no zombie process is left behind
        p1.wait()

        # convert json string to usable tm object
        # (doc-topic counts and topic-word counts)
        try:
            # subprocess output is bytes on Python 3 and str on Python 2;
            # decoding errors are ValueErrors too, so they take the same
            # fallback path as malformed JSON
            if isinstance(tm_json_str, str):
                tm_json_text = tm_json_str
            else:
                tm_json_text = tm_json_str.decode("utf-8")
            tm_json = json.loads(tm_json_text)
        except ValueError:
            return tm_json_str
        if lang not in self.vocab_lists:
            self._load_vocab_file(lang)
        vocab_list = self.vocab_lists[lang]
        doc_topic_counts = {}
        for d, topic_counts in enumerate(tm_json["doc_topic_counts"]):
            doc_topic_counts["d_%06d" % d] = topic_counts
        topic_word_counts = {}
        # items() rather than iteritems() so this runs on Python 2 and 3
        for t, word_counts in tm_json["topic_word_counts"].items():
            topic_word_counts[t] = {vocab_list[w]: c
                                    for w, c in zip(word_counts["word_ids"],
                                                    word_counts["counts"])}
        return {"doc_topic_counts": doc_topic_counts,
                "topic_word_counts": topic_word_counts}

    def _load_lemma_info_file(self, lang, which_version):
        """
        Load LexSemTM lemma metadata for given language/version combination

        Reads the tab-separated file "<lang>.<which_version>.lemmas.tab" and
        fills the lemma list, lemma index and lemma frequency caches from its
        "lemma", "lemma-id" and "num-usages" columns.

        :param lang: language from which to load metadata
        :param which_version: version of LexSemTM from which to load metadata
        :return: None
        """
        key = (lang, which_version)
        lemma_info_fname = "%s.%s.lemmas.tab" % key
        lemma_info_path = os.path.join(self.lexsemtm_dir, lemma_info_fname)
        lemmas = []
        lemma_indices = {}
        lemma_freqs = {}
        # the context manager closes the file even if a row fails to parse
        with open(lemma_info_path) as fp:
            reader = csv.DictReader(fp, delimiter="\t",
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                lemma = row["lemma"]
                lemmas.append(lemma)
                lemma_indices[lemma] = int(row["lemma-id"])
                lemma_freqs[lemma] = int(row["num-usages"])
        # publish only after the whole file parsed successfully, so a failed
        # load does not leave a partially filled cache behind
        self.all_lemma_indices[key] = lemma_indices
        self.all_lemma_freqs[key] = lemma_freqs
        self.all_lemmas[key] = lemmas

    def _load_vocab_file(self, lang):
        """
        Load LexSemTM vocabulary index for given language

        Reads the tab-separated file "<lang>.vocab.tab" and caches a dict
        mapping each integer "token-id" to its "token".

        :param lang: language from which to load vocabulary index
        :return: None
        """
        vocab_fname = "%s.vocab.tab" % lang
        vocab_path = os.path.join(self.lexsemtm_dir, vocab_fname)
        vocab_map = {}
        with open(vocab_path) as fp:
            reader = csv.DictReader(fp, delimiter="\t",
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                vocab_map[int(row["token-id"])] = row["token"]
        self.vocab_lists[lang] = vocab_map

    @staticmethod
    def _parse_sense_dist_lines(lines):
        """
        Parse the concatenated contents of sense distribution files

        Each data line holds a sense name and a probability separated by
        whitespace. Header lines (first field "sense-name") and blank lines
        are skipped. The lemma of a sense is its name with the final
        "."-separated component removed.

        :param lines: iterable of text lines
        :return: dict mapping lemma to dict mapping sense name to probability
        :raises ValueError: if a non-blank line does not consist of exactly
            two fields, or the probability is not a valid float
        """
        sense_dists = {}
        for line in lines:
            fields = line.split()
            if not fields:
                # a blank line must not end parsing: more files may follow
                continue
            sense_id, prob = fields
            if sense_id == "sense-name":
                continue
            lemma = sense_id.rpartition(".")[0]
            sense_dists.setdefault(lemma, {})[sense_id] = float(prob)
        return sense_dists

    def _load_all_sense_dists(self, lang, which_version):
        """
        Parse LexSemTM data file to obtain all sense distributions for given
        language/version combination

        :param lang: language from which to obtain sense distributions
        :param which_version: version of LexSemTM from which to load sense
            distributions
        :return: None
        """
        key = (lang, which_version)
        tar_path = os.path.join(self.lexsemtm_dir, "%s.%s.data.tar" % key)
        sdist_pattern = "%s.%s.*.sdist.tab" % key
        extract_cmd = ["tar", "--wildcards", "-xOf", tar_path, sdist_pattern]
        p = subprocess.Popen(extract_cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains stdout and stderr together (so tar can never
        # block on a full stderr pipe) and waits for the process to exit
        output = p.communicate()[0]
        # subprocess output is bytes on Python 3 and str on Python 2
        if not isinstance(output, str):
            output = output.decode("utf-8")
        self.all_sense_dists[key] = self._parse_sense_dist_lines(
            output.splitlines())