Multivariate #114
base: master
Changes from all commits: 19afcac, e0ba62c, 5012c4f, 366adc3, 043f7e1, 75e1e0e, c9f3691
New file: EigenDenseBaseAddons.h
@@ -0,0 +1,39 @@
/*
 * File: EigenDenseBaseAddons.h
 * Author: as1986
 *
 * Created on September 16, 2014, 10:22 PM
 */

#ifndef EIGEN_DENSE_BASE_ADDONS_H_
#define EIGEN_DENSE_BASE_ADDONS_H_

friend class boost::serialization::access;

template<class Archive>
void save(Archive & ar, const unsigned int version) const {
  derived().eval();
  const Index rows = derived().rows(), cols = derived().cols();
  ar & rows;
  ar & cols;
  for (Index j = 0; j < cols; ++j)
    for (Index i = 0; i < rows; ++i)
      ar & derived().coeff(i, j);
}

template<class Archive>
void load(Archive & ar, const unsigned int version) {
  Index rows, cols;
  ar & rows;
  ar & cols;
  if (rows != derived().rows() || cols != derived().cols())
    derived().resize(rows, cols);
  ar & boost::serialization::make_array(derived().data(), derived().size());
}

template<class Archive>
void serialize(Archive & ar, const unsigned int file_version) {
  boost::serialization::split_member(ar, *this, file_version);
}

#endif // EIGEN_DENSE_BASE_ADDONS_H_
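For context, this header is not meant to compile on its own: it is an Eigen plugin, spliced into the body of Eigen::DenseBase when the EIGEN_DENSEBASE_PLUGIN macro points at it, which is how the friend declaration and the save/load/serialize members become part of every dense Eigen matrix. Below is a minimal usage sketch, not part of the PR; it assumes the header sits on the include path under this name, and the binary archive type is an arbitrary choice.

// sketch: the plugin macro must be defined before the first Eigen include
#include <boost/serialization/split_member.hpp>
#include <boost/serialization/array.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#define EIGEN_DENSEBASE_PLUGIN "EigenDenseBaseAddons.h"
#include <Eigen/Dense>
#include <fstream>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 4);
  {
    std::ofstream ofs("m.bin", std::ios::binary);
    boost::archive::binary_oarchive oa(ofs);
    oa & m;        // dispatches to the save() injected into DenseBase
  }
  Eigen::MatrixXd restored;
  std::ifstream ifs("m.bin", std::ios::binary);
  boost::archive::binary_iarchive ia(ifs);
  ia & restored;   // dispatches to load(), which resizes to 3x4
  return restored.isApprox(m) ? 0 : 1;
}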
@@ -39,6 +39,20 @@ HmmModel2::HmmModel2(const string &textFilename, | |
  // populate the observations vector with encoded sentences
  vocabEncoder.Read(textFilename, observations);

  if(!learningInfo.neuralRepFilename.empty()) {
    neuralRep.clear();
    LatentCrfModel::readNeuralRep(learningInfo.neuralRepFilename, neuralRep);
Reviewer comment: Please move the implementation of readNeuralRep() to the chucheng-utils directory.
    for(auto sentence:neuralRep) {
Reviewer comment: please remove this for loop.
cerr << "sen length:\t" << sentence.size() << endl; | ||
for(auto emb:sentence) { | ||
cerr << emb.mean() << " "; | ||
} | ||
cerr << endl; | ||
} | ||
} | ||
|
||
// initialize theta and gamma parameters | ||
InitParams(); | ||
|
@@ -68,6 +82,16 @@ void HmmModel2::InitParams(){
    cerr << "nlogGamma params: " << endl;
    nlogGamma.PrintParams();
  }

  if(!learningInfo->neuralRepFilename.empty()) {
    assert(neuralMean.size()==0);
    assert(neuralVar.size()==0);
    for(auto y: yDomain) {
      neuralMean[y].setRandom(Eigen::NEURAL_SIZE,1);
      neuralVar[y].setIdentity();
    }
    cerr << "initialized neural means\n";
  }
}

void HmmModel2::PersistParams(string &prefix) {
@@ -81,9 +105,19 @@ void HmmModel2::PersistParams(string &prefix) {
}

// builds the lattice of all possible label sequences
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst) {
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst, unsigned sentId) {
Reviewer comment (on the added sentId parameter): please remove.
  // arcs represent a particular choice of y_i at time step i
  // arc weights are - log \theta_{x_i|y_i} - log \gamma_{y_i|y_{i-1}}

  if(sentId==55665566) {
Reviewer comment (on the sentId==55665566 check): please remove.
    assert(learningInfo->neuralRepFilename.empty());
  }

  vector<Eigen::VectorNeural> neural;
  if(! learningInfo->neuralRepFilename.empty()) {
    neural = GetNeuralSequence(sentId);
Reviewer comment: as discussed in the code review, we are going to store the embeddings per word type, rather than per sentence id. This will reduce the memory requirements because some words are repeated many times in different sentence ids. We will have the utility function return a map from word string to eigen vector, and use that to populate a map<int64_t, EigenVector> as a member of the HmmModel2 class.
  }

  assert(fst.NumStates() == 0);
  int startState = fst.AddState();
  fst.SetStart(startState);
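The sketch below illustrates, with hypothetical names that are not code from this repository, the per-word-type storage the reviewer describes above: a utility returns one embedding per word string, and the model re-keys it by the integer word ids it already uses, so lookups no longer go through sentId.

#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <Eigen/Dense>

// hypothetical utility (a candidate for chucheng-utils): one vector per word type,
// e.g. parsed from lines of the form "word v1 v2 ... vd"
std::map<std::string, Eigen::VectorXd> ReadWordEmbeddings(const std::string &filename);

// re-key the embeddings by the integer word id used throughout the model
std::map<int64_t, Eigen::VectorXd> KeyEmbeddingsByWordId(
    const std::map<std::string, Eigen::VectorXd> &byString,
    const std::function<int64_t(const std::string &)> &encodeWord) {
  std::map<int64_t, Eigen::VectorXd> byId;
  for (const auto &kv : byString) {
    byId[encodeWord(kv.first)] = kv.second;
  }
  return byId;
}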
@@ -118,7 +152,12 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
      }

      // compute arc weight
      double arcWeight = nlogGamma[yIM1][yI] + nlogTheta[yI][x[i]];
      double arcWeight = nlogGamma[yIM1][yI];
      if(!learningInfo->neuralRepFilename.empty()) {
        arcWeight += getGaussianPDF(yI, neural[i]);
Reviewer comment: call a utility function instead of a member function here. Think about the appropriate function signature.
      } else {
        arcWeight += nlogTheta[yI][x[i]];
      }
      if(arcWeight < 0 || std::isinf(arcWeight) || std::isnan(arcWeight)) {
        cerr << "FATAL ERROR: arcWeight = " << arcWeight << endl << "will terminate." << endl;
      }
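One possible shape for the utility function requested in the comment above, sketched with hypothetical names: it takes the label's mean explicitly rather than reading HmmModel2 state, and returns the same negative log-density (identity covariance) that the member getGaussianPDF further down in this diff computes.

#include <cmath>
#include <Eigen/Dense>

// negative log-density of an isotropic Gaussian with identity covariance
inline double NegLogGaussianDensity(const Eigen::VectorXd &embedding,
                                    const Eigen::VectorXd &mean) {
  const double d = static_cast<double>(embedding.size());
  const double constantTerm = 0.5 * d * std::log(2.0 * M_PI);
  return constantTerm + 0.5 * (embedding - mean).squaredNorm();
}

// the call site above would then read something like:
//   arcWeight += NegLogGaussianDensity(neural[i], neuralMean[yI]);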
@@ -148,7 +187,8 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
void HmmModel2::BuildThetaGammaFst(unsigned sentId, VectorFst<FstUtils::LogArc> &fst, vector<FstUtils::LogWeight> &alphas, vector<FstUtils::LogWeight> &betas) {

  // first, build the lattice
  BuildThetaGammaFst(observations[sentId], fst);
  BuildThetaGammaFst(observations[sentId], fst, sentId);

  // then compute forward/backward state potentials
  assert(alphas.size() == 0);
@@ -221,11 +261,18 @@ void HmmModel2::Train(){
  // expectation
  double nloglikelihood = 0;
  ConditionalMultinomialParam<int64_t> thetaMle, gammaMle;
  boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > meanPerLabel;
  boost::unordered_map< int64_t, std::vector<LogVal<double >>> nNormalizingConstant;
  for(unsigned sentId = 0; sentId < observations.size(); sentId++) {
    VectorFst<FstUtils::LogArc> fst;
    vector<FstUtils::LogWeight> alphas, betas;
    BuildThetaGammaFst(sentId, fst, alphas, betas);
    UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
    if(!learningInfo->neuralRepFilename.empty()) {
      // cerr << "training with word embeddings\n";
      UpdateMle(sentId, fst, alphas, betas, meanPerLabel, nNormalizingConstant, gammaMle);
    } else {
      UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
    }
    double sentNlogProb = betas[0].Value();
    if(sentNlogProb < -0.01) {
      cerr << "FATAL ERROR: sentNlogProb = " << sentNlogProb << " in sent #" << sentId << endl << "will terminate." << endl;
@@ -240,11 +287,16 @@ void HmmModel2::Train(){
    nloglikelihood += sentNlogProb;
  }

  // maximization
  MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
  if(!learningInfo->neuralRepFilename.empty()) {
    NormalizeMleMeanAndUpdateMean(meanPerLabel,
                                  nNormalizingConstant);
  } else {
    // maximization
    MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
                                       false, true,
                                       learningInfo->variationalInferenceOfMultinomials);
    nlogTheta = thetaMle;
    nlogTheta = thetaMle;
  }
  MultinomialParams::NormalizeParams(gammaMle, learningInfo->multinomialSymmetricDirichletAlpha,
                                     false, true,
                                     learningInfo->variationalInferenceOfMultinomials);
@@ -264,7 +316,19 @@ void HmmModel2::Train(){
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
  //cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
  VectorFst<FstUtils::LogArc> fst;
  BuildThetaGammaFst(tokens, fst);
  BuildThetaGammaFst(tokens, fst, 55665566);

  VectorFst<FstUtils::StdArc> fst2, shortestPath;
  fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
  fst::ShortestPath(fst2, &shortestPath);
  std::vector<int> dummy;
  FstUtils::LinearFstToVector(shortestPath, dummy, labels);
  assert(labels.size() == tokens.size());
}
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels, unsigned sentId) {
  //cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
  VectorFst<FstUtils::LogArc> fst;
  BuildThetaGammaFst(tokens, fst, sentId);

  VectorFst<FstUtils::StdArc> fst2, shortestPath;
  fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
@@ -273,3 +337,140 @@ void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
  FstUtils::LinearFstToVector(shortestPath, dummy, labels);
  assert(labels.size() == tokens.size());
}

void HmmModel2::Label(vector<vector<string> > &tokens, vector<vector<int> > &labels) {
  assert(labels.size() == 0);
  labels.resize(tokens.size());
  if(!learningInfo->neuralRepFilename.empty()) {
    cerr << "labeling with word embeddings\n";
    for(unsigned i = 0 ; i <tokens.size(); i++) {
      Label(observations[i], labels[i], i);
    }
  }
  else {
    for(unsigned i = 0 ; i <tokens.size(); i++) {
      UnsupervisedSequenceTaggingModel::Label(tokens[i], labels[i]);
    }
  }
}

void HmmModel2::Label(string &inputFilename, string &outputFilename) {
  std::vector<std::vector<std::string> > tokens;
  StringUtils::ReadTokens(inputFilename, tokens);
  vector<vector<int> > labels;
  Label(tokens, labels);
  StringUtils::WriteTokens(outputFilename, labels);
}

double HmmModel2::getGaussianPDF(int64_t yi, const Eigen::VectorNeural& zi) {
  if(zi.isConstant(Eigen::NONE)) {
    return 0;
  }

  const auto c = -0.5 * Eigen::NEURAL_SIZE * log(2 * M_PI);
  const auto& mean = neuralMean[yi];
  const auto& diff = zi - mean;

  //double inner_product = diff.transpose() * var_inverse * diff;
  double inner_product = diff.squaredNorm();
  if(std::isinf(inner_product)) {
    cerr << "inner product inf!\n";
    assert(false);
    return 2.0e100;
  }
  double log_pdf = c - 0.5 * inner_product;

  return -log_pdf;
}
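Note that, despite its name, getGaussianPDF returns a negative log-density, which is why it can be added directly to the arc weight. With the identity covariance used here it evaluates to

-\log \mathcal{N}(z_i \mid \mu_{y_i}, I) = \frac{d}{2}\log(2\pi) + \frac{1}{2}\lVert z_i - \mu_{y_i} \rVert^2, \qquad d = \text{Eigen::NEURAL\_SIZE},

and the commented-out line shows the general quadratic form diff^T * var_inverse * diff that a non-identity covariance (neuralVar) would require.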

const vector<Eigen::VectorNeural>& HmmModel2::GetNeuralSequence(int exampleId) {
  assert(exampleId < neuralRep.size());
  return neuralRep[exampleId];
}

void HmmModel2::UpdateMle(const unsigned sentId,
                          const VectorFst<FstUtils::LogArc> &fst,
                          const vector<FstUtils::LogWeight> &alphas,
                          const vector<FstUtils::LogWeight> &betas,
                          boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > &meanPerLabel,
                          boost::unordered_map< int64_t, std::vector<LogVal<double >>> &nNormalizingConstant,
                          ConditionalMultinomialParam<int64_t> &gammaMle){

  // schedule for visiting states such that we know the timestep for each arc
  set<int> iStates, iP1States;
  iStates.insert(fst.Start());
  const vector<Eigen::VectorNeural>&x = GetNeuralSequence(sentId);
  const auto zeros = Eigen::VectorNeural::Zero(Eigen::NEURAL_SIZE, 1);
  // for each timestep
  for(int i = 0; i < x.size(); i++) {
    auto xI = x[i];

    // from each state at timestep i
    for(set<int>::const_iterator iStatesIter = iStates.begin();
        iStatesIter != iStates.end();
        iStatesIter++) {
      int fromState = *iStatesIter;

      // for each arc leaving this state
      for(ArcIterator< VectorFst<FstUtils::LogArc> > aiter(fst, fromState); !aiter.Done(); aiter.Next()) {
        const FstUtils::LogArc &arc = aiter.Value();
        int yIM1 = arc.ilabel;
        int yI = arc.olabel;
        const FstUtils::LogWeight &arcWeight = arc.weight;
        int toState = arc.nextstate;

        // compute marginal weight of passing on this arc
        const FstUtils::LogWeight nlogWeight(fst::Times(arcWeight , fst::Times(betas[toState], alphas[fromState])));
        double nlogProb = fst::Divide(nlogWeight, betas[0]).Value();
        if(nlogProb < -1.0 || std::isinf(nlogProb) || std::isnan(nlogProb)) {
          cerr << "FATAL ERROR: nlogProb = " << nlogProb << " = alpha + arcWeight + beta - betas[0] = " << alphas[fromState].Value() << " + " << arcWeight.Value() << " + " << betas[toState].Value() << " - " << betas[0].Value() << endl << "will terminate." << endl;
          assert(false);
        }
        // fix precision problems
        if(nlogProb < 0) {
          nlogProb = 0;
        }
        double prob = MultinomialParams::nExp(nlogProb);
        meanPerLabel[yI].push_back(xI);
        nNormalizingConstant[yI].push_back(LogVal<double>(-nlogProb, init_lnx()));
        gammaMle[yIM1][yI] += prob;

        // prepare the schedule for visiting states in the next timestep
        iP1States.insert(toState);
      }
    }

    // prepare for next timestep
    iStates = iP1States;
    iP1States.clear();
  }
}
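For reference, the per-arc quantity accumulated above is the standard forward-backward arc posterior, computed in negative log space (fst::Times adds negative logs, fst::Divide subtracts them):

-\log p(\text{arc}) = \alpha(\text{from}) + w(\text{arc}) + \beta(\text{to}) - \beta(\text{start}),

so betas[0] plays the role of the sentence's partition function. nExp converts the result back to a probability for the gammaMle counts, while nNormalizingConstant keeps the log-space value for the later mean update.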

void HmmModel2::NormalizeMleMeanAndUpdateMean(boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> >& means,
                                              boost::unordered_map< int64_t, std::vector<LogVal<double>>>& nNormalizingConstant) {

  boost::unordered_map<int64_t, LogVal<double>> sum;
  // init
  for(auto y:yDomain) {
    sum[y] = LogVal<double>::Zero();
  }

  // sum
  for (const auto& t : nNormalizingConstant) {
    sum[t.first] += std::accumulate(t.second.begin(), t.second.end(), LogVal<double>::Zero());
  }

  // clear
  for(auto y:yDomain) {
    neuralMean[y].setZero(Eigen::NEURAL_SIZE,1);
  }

  for (auto y : yDomain) {
    for (auto i = 0; i < means[y].size(); i++) {
      const auto weight = (nNormalizingConstant[y][i] / sum[y]).as_float();
      neuralMean[y] += weight * means[y][i];
    }
  }
}
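In other words, the final loop implements a posterior-weighted mean of the embeddings collected for each label during the E-step:

\mu_y = \sum_i w_{y,i}\, x_{y,i}, \qquad w_{y,i} = \frac{p_{y,i}}{\sum_j p_{y,j}},

where p_{y,i} is the arc posterior stored in nNormalizingConstant[y][i] and x_{y,i} the matching embedding in means[y][i]; the normalization is done in log space via LogVal before each weight is converted to a float.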
Reviewer comment: Please remove the additional parameter. We can read the number of dimensions of the word embeddings directly in the code rather than at compile time.
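A hypothetical sketch of that suggestion (all names illustrative, not code from this PR): use a dynamically sized vector type and take the embedding dimensionality from the loaded data at runtime, so no NEURAL_SIZE compile-time constant is needed.

#include <cassert>
#include <cstdint>
#include <map>
#include <set>
#include <Eigen/Dense>

using NeuralVector = Eigen::VectorXd;  // dynamic size instead of a fixed NEURAL_SIZE

// initialize one random mean per label, with the dimension read from the data
void InitNeuralMeans(const std::map<int64_t, NeuralVector> &wordEmbeddings,
                     const std::set<int64_t> &yDomain,
                     std::map<int64_t, NeuralVector> &neuralMean) {
  assert(!wordEmbeddings.empty());
  const auto dim = wordEmbeddings.begin()->second.size();  // read at runtime
  for (int64_t y : yDomain) {
    neuralMean[y] = NeuralVector::Random(dim);
  }
}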