Multivariate #114

Open · wants to merge 7 commits into base: master
2 changes: 1 addition & 1 deletion Makefile-latentCrfPosTagger
@@ -3,7 +3,7 @@ SINGLE=-c
BEFORE=-x c++ -std=c++11
LIBS=-lfst -ldl -lboost_mpi -lboost_program_options -lboost_serialization -lboost_thread -lboost_system -lrt
#-llbfgs
OPT=-O3 -DNDEBUG
OPT=-O3 -DDEBUG -DNEURALCONST=${NEURALCONST}
Member commented:
Please remove the additional parameter. We can read the number of dimensions of the word embeddings directly in the code rather than at compile time.
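
As a follow-up sketch of this suggestion (purely illustrative; the function name and the file format are assumptions, not part of this PR), the dimensionality could be inferred from the embeddings file at startup:

```cpp
#include <fstream>
#include <sstream>
#include <string>

// Illustrative only: infer the embedding dimensionality at runtime from the
// first non-empty line of the embeddings file, instead of baking it in with
// -DNEURALCONST at compile time. Assumes one whitespace-separated vector per
// line; if each line begins with the word itself, subtract one from the count.
inline int InferEmbeddingDim(const std::string &neuralRepFilename) {
  std::ifstream in(neuralRepFilename);
  std::string line;
  while (std::getline(in, line)) {
    std::istringstream ss(line);
    std::string tok;
    int dim = 0;
    while (ss >> tok) { ++dim; }
    if (dim > 0) return dim;
  }
  return 0;  // empty or unreadable file
}
```

With the dimensionality known at runtime, the compile-time Eigen::NEURAL_SIZE constant could give way to dynamically sized Eigen::VectorXd objects.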

# -g
all: train-latentCrfPosTagger

6 changes: 3 additions & 3 deletions alignment/LatentCrfAligner.cc
@@ -171,13 +171,13 @@ void LatentCrfAligner::InitTheta() {

void LatentCrfAligner::PrepareExample(unsigned exampleId) {
yDomain.clear();
this->yDomain.insert(LatentCrfAligner::START_OF_SENTENCE_Y_VALUE); // always insert the conceptual yValue of word at position -1 in a sentence
this->yDomain.push_back(LatentCrfAligner::START_OF_SENTENCE_Y_VALUE); // always insert the conceptual yValue of word at position -1 in a sentence
// if null alignments are enabled, this length will include the null token that was inserted at the beginning of all source sentences
unsigned srcSentLength = testingMode? testSrcSents[exampleId].size() : srcSents[exampleId].size();
// each position in the src sentence, including null, should have an entry in yDomain
unsigned firstPossibleYValue = learningInfo.allowNullAlignments? NULL_POSITION : NULL_POSITION + 1;
for(unsigned i = firstPossibleYValue; i < firstPossibleYValue + srcSentLength; ++i) {
yDomain.insert(i);
yDomain.push_back(i);
}
}

@@ -340,4 +340,4 @@ void LatentCrfAligner::FireFeatures(int yI, int yIM1, unsigned sentId, int i,
LatentCrfModel::START_OF_SENTENCE_Y_VALUE, firstPos,
activeFeatures);
assert(GetObservableSequence(sentId).size() > 0);
}
}
2 changes: 1 addition & 1 deletion core/BasicTypes.h
@@ -6,7 +6,7 @@ enum FeatureTemplate {
PRECOMPUTED, PRECOMPUTED_PAIR,
ALIGNMENT_JUMP, LOG_ALIGNMENT_JUMP,
// pos
LABEL_BIGRAM, OTHER_POS, PHRASE,
LABEL_BIGRAM, OTHER_POS, PHRASE, SEQUENCE_METADATA,
// alignment
SRC_BIGRAM, ALIGNMENT_JUMP_IS_ZERO, SRC0_TGT0,
DIAGONAL_DEVIATION, SYNC_START, SYNC_END, OTHER_ALIGNERS, NULL_ALIGNMENT, NULL_ALIGNMENT_LENGTH_RATIO,
39 changes: 39 additions & 0 deletions core/EigenDenseBaseAddons.h
@@ -0,0 +1,39 @@
/*
* File: EigenDenseBaseAddons.h
* Author: as1986
*
* Created on September 16, 2014, 10:22 PM
*/

#ifndef EIGEN_DENSE_BASE_ADDONS_H_
#define EIGEN_DENSE_BASE_ADDONS_H_

friend class boost::serialization::access;

template<class Archive>
void save(Archive & ar, const unsigned int version) const {
derived().eval();
const Index rows = derived().rows(), cols = derived().cols();
ar & rows;
ar & cols;
for (Index j = 0; j < cols; ++j)
for (Index i = 0; i < rows; ++i)
ar & derived().coeff(i, j);
}

template<class Archive>
void load(Archive & ar, const unsigned int version) {
Index rows, cols;
ar & rows;
ar & cols;
if (rows != derived().rows() || cols != derived().cols())
derived().resize(rows, cols);
ar & boost::serialization::make_array(derived().data(), derived().size());
}

template<class Archive>
void serialize(Archive & ar, const unsigned int file_version) {
boost::serialization::split_member(ar, *this, file_version);
}

#endif // EIGEN_DENSE_BASE_ADDONS_H_
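
For reviewers unfamiliar with this pattern: a header like this only takes effect if Eigen is told to inject it into DenseBase through its plugin hook. A minimal sketch of the assumed consumer-side wiring (the include site and path are not shown in this diff):

```cpp
// Assumed wiring (not shown in this PR): the plugin macro must be defined
// before any Eigen header is included, so that the save/load/serialize members
// above are injected into Eigen::DenseBase. The boost serialization headers
// the plugin relies on also need to be visible at that point.
#include <boost/serialization/access.hpp>
#include <boost/serialization/array.hpp>
#include <boost/serialization/split_member.hpp>

#define EIGEN_DENSEBASE_PLUGIN "core/EigenDenseBaseAddons.h"
#include <Eigen/Dense>
```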
217 changes: 209 additions & 8 deletions core/HmmModel.cc
@@ -39,6 +39,20 @@ HmmModel2::HmmModel2(const string &textFilename,

// populate the observations vector with encoded sentences
vocabEncoder.Read(textFilename, observations);



if(!learningInfo.neuralRepFilename.empty()) {
neuralRep.clear();
LatentCrfModel::readNeuralRep(learningInfo.neuralRepFilename, neuralRep);
Member commented:
Please move the implementation of readNeuralRep() to the chucheng-utils directory.

for(auto sentence:neuralRep) {
Member commented:
Please remove this for loop.

cerr << "sen length:\t" << sentence.size() << endl;
for(auto emb:sentence) {
cerr << emb.mean() << " ";
}
cerr << endl;
}
}

// initialize theta and gamma parameters
InitParams();
@@ -68,6 +82,16 @@ void HmmModel2::InitParams(){
cerr << "nlogGamma params: " << endl;
nlogGamma.PrintParams();
}

if(!learningInfo->neuralRepFilename.empty()) {
assert(neuralMean.size()==0);
assert(neuralVar.size()==0);
for(auto y: yDomain) {
neuralMean[y].setRandom(Eigen::NEURAL_SIZE,1);
neuralVar[y].setIdentity();
}
cerr << "initialized neural means\n";
}
}

void HmmModel2::PersistParams(string &prefix) {
@@ -81,9 +105,19 @@ void HmmModel2::PersistParams(string &prefix) {
}

// builds the lattice of all possible label sequences
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst) {
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst, unsigned sentId) {
Member commented:
Please remove.

// arcs represent a particular choice of y_i at time step i
// arc weights are - log \theta_{x_i|y_i} - log \gamma_{y_i|y_{i-1}}

if(sentId==55665566) {
Member commented:
Please remove.

assert(learningInfo->neuralRepFilename.empty());
}

vector<Eigen::VectorNeural> neural;
if(! learningInfo->neuralRepFilename.empty()) {
neural = GetNeuralSequence(sentId);
Member commented:
As discussed in the code review, we are going to store the embeddings per word type rather than per sentence id. This will reduce memory requirements, because some words are repeated across many sentence ids. We will have the utility function return a map from word string to Eigen vector, and use that to populate a map<int64_t, EigenVector> member of the HmmModel2 class.
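
A rough sketch of that direction (all names below are illustrative and not part of this PR):

```cpp
// Illustrative sketch: load embeddings keyed by word type, then re-key them by
// the model's int64_t token ids, so repeated words share a single vector
// instead of one copy per sentence position.
#include <string>
#include <boost/unordered_map.hpp>
#include <Eigen/Dense>

// Assumed utility (a candidate for chucheng-utils): word string -> embedding.
boost::unordered_map<std::string, Eigen::VectorXd>
ReadNeuralRepByType(const std::string &filename);

// Populates a member like boost::unordered_map<int64_t, Eigen::VectorXd> on
// HmmModel2, given a word-string-to-id map such as the vocabulary encoder's.
inline void PopulateNeuralRepByType(
    const boost::unordered_map<std::string, Eigen::VectorXd> &byWord,
    const boost::unordered_map<std::string, int64_t> &wordToId,
    boost::unordered_map<int64_t, Eigen::VectorXd> &byId) {
  for (const auto &kv : byWord) {
    auto it = wordToId.find(kv.first);
    if (it != wordToId.end()) {
      byId[it->second] = kv.second;
    }
  }
}
```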

}

assert(fst.NumStates() == 0);
int startState = fst.AddState();
fst.SetStart(startState);
@@ -118,7 +152,12 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
}

// compute arc weight
double arcWeight = nlogGamma[yIM1][yI] + nlogTheta[yI][x[i]];
double arcWeight = nlogGamma[yIM1][yI];
if(!learningInfo->neuralRepFilename.empty()) {
arcWeight += getGaussianPDF(yI, neural[i]);
Member commented:
Call a utility function instead of a member function here. Think about the appropriate function signature.
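
One possible shape for such a utility, mirroring what getGaussianPDF() computes later in this diff but with the mean passed in explicitly (a sketch, not code from this PR):

```cpp
// Sketch of one possible utility: the negative log density of an
// identity-covariance Gaussian, so the caller no longer needs HmmModel2 state.
#include <cmath>
#include <Eigen/Dense>

inline double NegLogGaussianDensity(const Eigen::VectorXd &z,
                                    const Eigen::VectorXd &mean) {
  const double d = static_cast<double>(z.size());
  const double logNormalizer = -0.5 * d * std::log(2.0 * M_PI);
  const double squaredDistance = (z - mean).squaredNorm();
  return -(logNormalizer - 0.5 * squaredDistance);  // arc weights are negative log probs
}
```

The call site would then read roughly `arcWeight += NegLogGaussianDensity(neural[i], neuralMean[yI]);`.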

} else {
arcWeight += nlogTheta[yI][x[i]];
}
if(arcWeight < 0 || std::isinf(arcWeight) || std::isnan(arcWeight)) {
cerr << "FATAL ERROR: arcWeight = " << arcWeight << endl << "will terminate." << endl;
}
@@ -148,7 +187,8 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
void HmmModel2::BuildThetaGammaFst(unsigned sentId, VectorFst<FstUtils::LogArc> &fst, vector<FstUtils::LogWeight> &alphas, vector<FstUtils::LogWeight> &betas) {

// first, build the lattice
BuildThetaGammaFst(observations[sentId], fst);
BuildThetaGammaFst(observations[sentId], fst, sentId);


// then compute forward/backward state potentials
assert(alphas.size() == 0);
@@ -221,11 +261,18 @@ void HmmModel2::Train(){
// expectation
double nloglikelihood = 0;
ConditionalMultinomialParam<int64_t> thetaMle, gammaMle;
boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > meanPerLabel;
boost::unordered_map< int64_t, std::vector<LogVal<double >>> nNormalizingConstant;
for(unsigned sentId = 0; sentId < observations.size(); sentId++) {
VectorFst<FstUtils::LogArc> fst;
vector<FstUtils::LogWeight> alphas, betas;
BuildThetaGammaFst(sentId, fst, alphas, betas);
UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
if(!learningInfo->neuralRepFilename.empty()) {
// cerr << "training with word embeddings\n";
UpdateMle(sentId, fst, alphas, betas, meanPerLabel, nNormalizingConstant, gammaMle);
} else {
UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
}
double sentNlogProb = betas[0].Value();
if(sentNlogProb < -0.01) {
cerr << "FATAL ERROR: sentNlogProb = " << sentNlogProb << " in sent #" << sentId << endl << "will terminate." << endl;
@@ -240,11 +287,16 @@ void HmmModel2::Train(){
nloglikelihood += sentNlogProb;
}

// maximization
MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
if(!learningInfo->neuralRepFilename.empty()) {
NormalizeMleMeanAndUpdateMean(meanPerLabel,
nNormalizingConstant);
} else {
// maximization
MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
false, true,
learningInfo->variationalInferenceOfMultinomials);
nlogTheta = thetaMle;
nlogTheta = thetaMle;
}
MultinomialParams::NormalizeParams(gammaMle, learningInfo->multinomialSymmetricDirichletAlpha,
false, true,
learningInfo->variationalInferenceOfMultinomials);
@@ -264,7 +316,19 @@ void HmmModel2::Train(){
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
//cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
VectorFst<FstUtils::LogArc> fst;
BuildThetaGammaFst(tokens, fst);
BuildThetaGammaFst(tokens, fst, 55665566);

VectorFst<FstUtils::StdArc> fst2, shortestPath;
fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
fst::ShortestPath(fst2, &shortestPath);
std::vector<int> dummy;
FstUtils::LinearFstToVector(shortestPath, dummy, labels);
assert(labels.size() == tokens.size());
}
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels, unsigned sentId) {
//cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
VectorFst<FstUtils::LogArc> fst;
BuildThetaGammaFst(tokens, fst, sentId);

VectorFst<FstUtils::StdArc> fst2, shortestPath;
fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
@@ -273,3 +337,140 @@ void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
FstUtils::LinearFstToVector(shortestPath, dummy, labels);
assert(labels.size() == tokens.size());
}


void HmmModel2::Label(vector<vector<string> > &tokens, vector<vector<int> > &labels) {
assert(labels.size() == 0);
labels.resize(tokens.size());
if(!learningInfo->neuralRepFilename.empty()) {
cerr << "labeling with word embeddings\n";
for(unsigned i = 0 ; i <tokens.size(); i++) {
Label(observations[i], labels[i], i);
}
}
else {
for(unsigned i = 0 ; i <tokens.size(); i++) {
UnsupervisedSequenceTaggingModel::Label(tokens[i], labels[i]);
}
}
}

void HmmModel2::Label(string &inputFilename, string &outputFilename) {
std::vector<std::vector<std::string> > tokens;
StringUtils::ReadTokens(inputFilename, tokens);
vector<vector<int> > labels;
Label(tokens, labels);
StringUtils::WriteTokens(outputFilename, labels);
}

double HmmModel2::getGaussianPDF(int64_t yi, const Eigen::VectorNeural& zi) {
if(zi.isConstant(Eigen::NONE)) {
return 0;
}

const auto c = -0.5 * Eigen::NEURAL_SIZE * log(2 * M_PI);
const auto& mean = neuralMean[yi];
const auto& diff = zi - mean;

//double inner_product = diff.transpose() * var_inverse * diff;
double inner_product = diff.squaredNorm();
if(std::isinf(inner_product)) {
cerr << "inner product inf!\n";
assert(false);
return 2.0e100;
}
double log_pdf = c - 0.5 * inner_product;

return -log_pdf;
}

const vector<Eigen::VectorNeural>& HmmModel2::GetNeuralSequence(int exampleId) {
assert(exampleId < neuralRep.size());
return neuralRep[exampleId];
}

void HmmModel2::UpdateMle(const unsigned sentId,
const VectorFst<FstUtils::LogArc> &fst,
const vector<FstUtils::LogWeight> &alphas,
const vector<FstUtils::LogWeight> &betas,
boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > &meanPerLabel,
boost::unordered_map< int64_t, std::vector<LogVal<double >>> &nNormalizingConstant,
ConditionalMultinomialParam<int64_t> &gammaMle){

// schedule for visiting states such that we know the timestep for each arc
set<int> iStates, iP1States;
iStates.insert(fst.Start());
const vector<Eigen::VectorNeural>&x = GetNeuralSequence(sentId);
const auto zeros = Eigen::VectorNeural::Zero(Eigen::NEURAL_SIZE, 1);
// for each timestep
for(int i = 0; i < x.size(); i++) {
auto xI = x[i];

// from each state at timestep i
for(set<int>::const_iterator iStatesIter = iStates.begin();
iStatesIter != iStates.end();
iStatesIter++) {
int fromState = *iStatesIter;

// for each arc leaving this state
for(ArcIterator< VectorFst<FstUtils::LogArc> > aiter(fst, fromState); !aiter.Done(); aiter.Next()) {
const FstUtils::LogArc &arc = aiter.Value();
int yIM1 = arc.ilabel;
int yI = arc.olabel;
const FstUtils::LogWeight &arcWeight = arc.weight;
int toState = arc.nextstate;

// compute marginal weight of passing on this arc
const FstUtils::LogWeight nlogWeight(fst::Times(arcWeight , fst::Times(betas[toState], alphas[fromState])));
double nlogProb = fst::Divide(nlogWeight, betas[0]).Value();
if(nlogProb < -1.0 || std::isinf(nlogProb) || std::isnan(nlogProb)) {
cerr << "FATAL ERROR: nlogProb = " << nlogProb << " = alpha + arcWeight + beta - betas[0] = " << alphas[fromState].Value() << " + " << arcWeight.Value() << " + " << betas[toState].Value() << " - " << betas[0].Value() << endl << "will terminate." << endl;
assert(false);
}
// fix precision problems
if(nlogProb < 0) {
nlogProb = 0;
}
double prob = MultinomialParams::nExp(nlogProb);
meanPerLabel[yI].push_back(xI);
nNormalizingConstant[yI].push_back(LogVal<double>(-nlogProb, init_lnx()));
gammaMle[yIM1][yI] += prob;

// prepare the schedule for visiting states in the next timestep
iP1States.insert(toState);
}
}

// prepare for next timestep
iStates = iP1States;
iP1States.clear();
}
}

void HmmModel2::NormalizeMleMeanAndUpdateMean(boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> >& means,
boost::unordered_map< int64_t, std::vector<LogVal<double>>>& nNormalizingConstant) {

boost::unordered_map<int64_t, LogVal<double>> sum;
// init
for(auto y:yDomain) {
sum[y] = LogVal<double>::Zero();
}

// sum
for (const auto& t : nNormalizingConstant) {
sum[t.first] += std::accumulate(t.second.begin(), t.second.end(), LogVal<double>::Zero());
}

// clear
for(auto y:yDomain) {
neuralMean[y].setZero(Eigen::NEURAL_SIZE,1);

}

for (auto y : yDomain) {
for (auto i = 0; i < means[y].size(); i++) {
const auto weight = (nNormalizingConstant[y][i] / sum[y]).as_float();
neuralMean[y] += weight * means[y][i];
}
}
}
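
A note for readers of this diff: UpdateMle() above accumulates, per label y, each observed embedding together with the posterior probability of the arc that emitted it, and NormalizeMleMeanAndUpdateMean() then performs what is, up to the precision fixes in the code, the standard EM update for Gaussian means:

$$\mu_y = \frac{\sum_i \gamma_i(y)\, z_i}{\sum_i \gamma_i(y)}, \qquad \gamma_i(y) = p(y_i = y \mid \text{sentence}),$$

where $z_i$ is the embedding at position $i$. The covariance neuralVar is initialized to the identity in InitParams() and, as far as this diff shows, never re-estimated, which is consistent with the isotropic Gaussian assumed by getGaussianPDF().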