Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

using pyLDAvis with turicreate #181

Open
rohanrajpal opened this issue Nov 2, 2020 · 2 comments
Open

using pyLDAvis with turicreate #181

rohanrajpal opened this issue Nov 2, 2020 · 2 comments

Comments

@rohanrajpal
Copy link

Graphlab is now turicreate, so I modified the functions a bit to make it work

from __future__ import absolute_import

import funcy as fp
import numpy as np
import pandas as pd
import turicreate as tc
import pyLDAvis


def _topics_as_df(topic_model):
    tdf = topic_model.topics.to_dataframe()
    return pd.DataFrame(np.vstack(tdf['topic_probabilities'].values), index=tdf['vocabulary'])


def _sum_sarray_dicts(sarray):
    counts_sf = tc.SFrame({
        'count_dicts': sarray}).stack('count_dicts').groupby(
        key_column_names='X1',
        operations={'count': tc.aggregate.SUM('X2')})
    return list(counts_sf.unstack(column_names=['X1', 'count'])[0].values())[0]


def _extract_doc_data(docs):
    doc_lengths = list(docs.apply(lambda d: np.array(d.values()).sum()))
    term_freqs_dict = _sum_sarray_dicts(docs)

    vocab = term_freqs_dict.keys()
    term_freqs = term_freqs_dict.values()

    return {'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}


def _extract_model_data(topic_model, docs, vocab):
    doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities'))
    doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)[:, None]

    topics = _topics_as_df(topic_model)
    topic_term_dists = topics.T[vocab].values
    topic_term_dists = topic_term_dists / topic_term_dists.sum(axis=1)[:, None]

    print(doc_topic_dists.shape)
    print(topic_term_dists.shape)
    return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists}


def _extract_data(topic_model, docs):
    doc_data = _extract_doc_data(docs)
    model_data = _extract_model_data(topic_model, docs, doc_data['vocab'])
    return fp.merge(doc_data, model_data)


def prepare(topic_model, docs, **kargs):
    """Transforms the GraphLab TopicModel and related corpus data into
    the data structures needed for the visualization.
    Parameters
    ----------
    topic_model : graphlab.toolkits.topic_model.topic_model.TopicModel
        An already trained GraphLab topic model.
    docs : SArray of dicts
        The corpus in bag of word form, the same docs used to train the model.
    **kwargs :
        additional keyword arguments are passed through to :func:`pyldavis.prepare`.
    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization
    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/GraphLab.ipynb
    """
    opts = fp.merge(_extract_data(topic_model, docs), kargs)
    return pyLDAvis.prepare(**opts)

prepare(model,docs)

But running the above code gives me the error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-098d23e5b0bf> in <module>
     73     return pyLDAvis.prepare(**opts)
     74 
---> 75 prepare(model,docs)

<ipython-input-11-098d23e5b0bf> in prepare(topic_model, docs, **kargs)
     71     """
     72     opts = fp.merge(_extract_data(topic_model, docs), kargs)
---> 73     return pyLDAvis.prepare(**opts)
     74 
     75 prepare(model,docs)

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pyLDAvis/_prepare.py in prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics)
    377    print(doc_topic_dists.shape)
    378    print(doc_lengths.shape)
--> 379    topic_freq       = (doc_topic_dists.T * doc_lengths).T.sum()
    380    # topic_freq       = np.dot(doc_topic_dists.T, doc_lengths)
    381    if (sort_topics):

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
    773             pass_op = op if axis in [0, "columns", None] else na_op
    774             pass_op = pass_op if not is_logical else op
--> 775             return _combine_series_frame(
    776                 self, other, pass_op, fill_value=fill_value, axis=axis, level=level
    777             )

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in _combine_series_frame(self, other, func, fill_value, axis, level)
    618         new_data = left._combine_match_index(right, func)
    619     else:
--> 620         new_data = dispatch_to_series(left, right, func, axis="columns")
    621 
    622     return left._construct_result(new_data)

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, str_rep, axis)
    417         raise NotImplementedError(right)
    418 
--> 419     new_data = expressions.evaluate(column_op, str_rep, left, right)
    420     return new_data
    421 

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, use_numexpr)
    206     use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
    207     if use_numexpr:
--> 208         return _evaluate(op, op_str, a, b)
    209     return _evaluate_standard(op, op_str, a, b)
    210 

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
    119 
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b)
    122 
    123     return result

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b)
     68         _store_test_result(False)
     69     with np.errstate(all="ignore"):
---> 70         return op(a, b)
     71 
     72 

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in column_op(a, b)
    405 
    406             def column_op(a, b):
--> 407                 return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))}
    408 
    409     elif isinstance(right, ABCSeries):

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in <dictcomp>(.0)
    405 
    406             def column_op(a, b):
--> 407                 return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))}
    408 
    409     elif isinstance(right, ABCSeries):

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/common.py in new_method(self, other)
     62         other = item_from_zerodim(other)
     63 
---> 64         return method(self, other)
     65 
     66     return new_method

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/__init__.py in wrapper(left, right)
    501         lvalues = extract_array(left, extract_numpy=True)
    502         rvalues = extract_array(right, extract_numpy=True)
--> 503         result = arithmetic_op(lvalues, rvalues, op, str_rep)
    504 
    505         return _construct_result(left, result, index=left.index, name=res_name)

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in arithmetic_op(left, right, op, str_rep)
    195     else:
    196         with np.errstate(all="ignore"):
--> 197             res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep)
    198 
    199     return res_values

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, str_rep)
    147 
    148     try:
--> 149         result = expressions.evaluate(op, str_rep, left, right)
    150     except TypeError:
    151         result = masked_arith_op(left, right, op)

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, use_numexpr)
    206     use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
    207     if use_numexpr:
--> 208         return _evaluate(op, op_str, a, b)
    209     return _evaluate_standard(op, op_str, a, b)
    210 

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
    119 
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b)
    122 
    123     return result

~/anaconda3/envs/namoenv/lib/python3.8/site-packages/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b)
     68         _store_test_result(False)
     69     with np.errstate(all="ignore"):
---> 70         return op(a, b)
     71 
     72 

ValueError: operands could not be broadcast together with shapes (30,) (39,)

On printing the shape of the matrices about to be multiplied

(1000, 30)
(1000,)

Everything seems fine, can anyone help find me the exact issue?

@jcharlet
Copy link

Hi @rohanrajpal did you manage to solve this?

@rohanrajpal
Copy link
Author

Hi @rohanrajpal did you manage to solve this?

Nah man :(

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants