Multivariate #114
base: master
Changes from all commits: 19afcac, e0ba62c, 5012c4f, 366adc3, 043f7e1, 75e1e0e, c9f3691
New file: EigenDenseBaseAddons.h
@@ -0,0 +1,39 @@
/*
 * File: EigenDenseBaseAddons.h
 * Author: as1986
 *
 * Created on September 16, 2014, 10:22 PM
 */

#ifndef EIGEN_DENSE_BASE_ADDONS_H_
#define EIGEN_DENSE_BASE_ADDONS_H_

friend class boost::serialization::access;

template<class Archive>
void save(Archive & ar, const unsigned int version) const {
  derived().eval();
  const Index rows = derived().rows(), cols = derived().cols();
  ar & rows;
  ar & cols;
  for (Index j = 0; j < cols; ++j)
    for (Index i = 0; i < rows; ++i)
      ar & derived().coeff(i, j);
}

template<class Archive>
void load(Archive & ar, const unsigned int version) {
  Index rows, cols;
  ar & rows;
  ar & cols;
  if (rows != derived().rows() || cols != derived().cols())
    derived().resize(rows, cols);
  ar & boost::serialization::make_array(derived().data(), derived().size());
}

template<class Archive>
void serialize(Archive & ar, const unsigned int file_version) {
  boost::serialization::split_member(ar, *this, file_version);
}

#endif // EIGEN_DENSE_BASE_ADDONS_H_
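For context, this header is not meant to compile on its own: it is an Eigen plugin, spliced into the body of Eigen::DenseBase when the EIGEN_DENSEBASE_PLUGIN macro points at it, which is how the friend declaration and the save/load/serialize members become part of every dense Eigen matrix. Below is a minimal usage sketch, not part of the PR; it assumes the header sits on the include path under this name, and the binary archive type is an arbitrary choice.

// sketch: the plugin macro must be defined before the first Eigen include
#include <boost/serialization/split_member.hpp>
#include <boost/serialization/array.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/archive/binary_iarchive.hpp>
#define EIGEN_DENSEBASE_PLUGIN "EigenDenseBaseAddons.h"
#include <Eigen/Dense>
#include <fstream>

int main() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 4);
  {
    std::ofstream ofs("m.bin", std::ios::binary);
    boost::archive::binary_oarchive oa(ofs);
    oa & m;        // dispatches to the save() injected into DenseBase
  }
  Eigen::MatrixXd restored;
  std::ifstream ifs("m.bin", std::ios::binary);
  boost::archive::binary_iarchive ia(ifs);
  ia & restored;   // dispatches to load(), which resizes to 3x4
  return restored.isApprox(m) ? 0 : 1;
}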
@@ -39,6 +39,20 @@ HmmModel2::HmmModel2(const string &textFilename, | |
  // populate the observations vector with encoded sentences
  vocabEncoder.Read(textFilename, observations);

  if(!learningInfo.neuralRepFilename.empty()) {
    neuralRep.clear();
    LatentCrfModel::readNeuralRep(learningInfo.neuralRepFilename, neuralRep);
Reviewer comment: Please move the implementation of readNeuralRep() to the chucheng-utils directory.
    for(auto sentence:neuralRep) {
Reviewer comment: please remove this for loop.
cerr << "sen length:\t" << sentence.size() << endl; | ||
for(auto emb:sentence) { | ||
cerr << emb.mean() << " "; | ||
} | ||
cerr << endl; | ||
} | ||
} | ||
|
||
// initialize theta and gamma parameters | ||
InitParams(); | ||
|
@@ -68,6 +82,16 @@ void HmmModel2::InitParams(){
    cerr << "nlogGamma params: " << endl;
    nlogGamma.PrintParams();
  }

  if(!learningInfo->neuralRepFilename.empty()) {
    assert(neuralMean.size()==0);
    assert(neuralVar.size()==0);
    for(auto y: yDomain) {
      neuralMean[y].setRandom(Eigen::NEURAL_SIZE,1);
      neuralVar[y].setIdentity();
    }
    cerr << "initialized neural means\n";
  }
}

void HmmModel2::PersistParams(string &prefix) {
@@ -81,9 +105,19 @@ void HmmModel2::PersistParams(string &prefix) {
}

// builds the lattice of all possible label sequences
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst) {
void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogArc> &fst, unsigned sentId) {
Reviewer comment (on the added sentId parameter): please remove.
  // arcs represent a particular choice of y_i at time step i
  // arc weights are - log \theta_{x_i|y_i} - log \gamma_{y_i|y_{i-1}}

  if(sentId==55665566) {
Reviewer comment (on the sentId==55665566 check): please remove.
    assert(learningInfo->neuralRepFilename.empty());
  }

  vector<Eigen::VectorNeural> neural;
  if(! learningInfo->neuralRepFilename.empty()) {
    neural = GetNeuralSequence(sentId);
Reviewer comment: as discussed in the code review, we are going to store the embeddings per word type, rather than per sentence id. This will reduce the memory requirements because some words are repeated many times in different sentence ids. We will have the utility function return a map from word string to eigen vector, and use that to populate a map<int64_t, EigenVector> as a member of the HmmModel2 class.
  }

  assert(fst.NumStates() == 0);
  int startState = fst.AddState();
  fst.SetStart(startState);
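The sketch below illustrates, with hypothetical names that are not code from this repository, the per-word-type storage the reviewer describes above: a utility returns one embedding per word string, and the model re-keys it by the integer word ids it already uses, so lookups no longer go through sentId.

#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <Eigen/Dense>

// hypothetical utility (a candidate for chucheng-utils): one vector per word type,
// e.g. parsed from lines of the form "word v1 v2 ... vd"
std::map<std::string, Eigen::VectorXd> ReadWordEmbeddings(const std::string &filename);

// re-key the embeddings by the integer word id used throughout the model
std::map<int64_t, Eigen::VectorXd> KeyEmbeddingsByWordId(
    const std::map<std::string, Eigen::VectorXd> &byString,
    const std::function<int64_t(const std::string &)> &encodeWord) {
  std::map<int64_t, Eigen::VectorXd> byId;
  for (const auto &kv : byString) {
    byId[encodeWord(kv.first)] = kv.second;
  }
  return byId;
}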
@@ -118,7 +152,12 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
      }

      // compute arc weight
      double arcWeight = nlogGamma[yIM1][yI] + nlogTheta[yI][x[i]];
      double arcWeight = nlogGamma[yIM1][yI];
      if(!learningInfo->neuralRepFilename.empty()) {
        arcWeight += getGaussianPDF(yI, neural[i]);
Reviewer comment: call a utility function instead of a member function here. Think about the appropriate function signature.
      } else {
        arcWeight += nlogTheta[yI][x[i]];
      }
      if(arcWeight < 0 || std::isinf(arcWeight) || std::isnan(arcWeight)) {
        cerr << "FATAL ERROR: arcWeight = " << arcWeight << endl << "will terminate." << endl;
      }
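One possible shape for the utility function requested in the comment above, sketched with hypothetical names: it takes the label's mean explicitly rather than reading HmmModel2 state, and returns the same negative log-density (identity covariance) that the member getGaussianPDF further down in this diff computes.

#include <cmath>
#include <Eigen/Dense>

// negative log-density of an isotropic Gaussian with identity covariance
inline double NegLogGaussianDensity(const Eigen::VectorXd &embedding,
                                    const Eigen::VectorXd &mean) {
  const double d = static_cast<double>(embedding.size());
  const double constantTerm = 0.5 * d * std::log(2.0 * M_PI);
  return constantTerm + 0.5 * (embedding - mean).squaredNorm();
}

// the call site above would then read something like:
//   arcWeight += NegLogGaussianDensity(neural[i], neuralMean[yI]);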
@@ -148,7 +187,8 @@ void HmmModel2::BuildThetaGammaFst(vector<int64_t> &x, VectorFst<FstUtils::LogAr
void HmmModel2::BuildThetaGammaFst(unsigned sentId, VectorFst<FstUtils::LogArc> &fst, vector<FstUtils::LogWeight> &alphas, vector<FstUtils::LogWeight> &betas) {

  // first, build the lattice
  BuildThetaGammaFst(observations[sentId], fst);
  BuildThetaGammaFst(observations[sentId], fst, sentId);

  // then compute forward/backward state potentials
  assert(alphas.size() == 0);
@@ -221,11 +261,18 @@ void HmmModel2::Train(){
  // expectation
  double nloglikelihood = 0;
  ConditionalMultinomialParam<int64_t> thetaMle, gammaMle;
  boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > meanPerLabel;
  boost::unordered_map< int64_t, std::vector<LogVal<double >>> nNormalizingConstant;
  for(unsigned sentId = 0; sentId < observations.size(); sentId++) {
    VectorFst<FstUtils::LogArc> fst;
    vector<FstUtils::LogWeight> alphas, betas;
    BuildThetaGammaFst(sentId, fst, alphas, betas);
    UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
    if(!learningInfo->neuralRepFilename.empty()) {
      // cerr << "training with word embeddings\n";
      UpdateMle(sentId, fst, alphas, betas, meanPerLabel, nNormalizingConstant, gammaMle);
    } else {
      UpdateMle(sentId, fst, alphas, betas, thetaMle, gammaMle);
    }
    double sentNlogProb = betas[0].Value();
    if(sentNlogProb < -0.01) {
      cerr << "FATAL ERROR: sentNlogProb = " << sentNlogProb << " in sent #" << sentId << endl << "will terminate." << endl;
@@ -240,11 +287,16 @@ void HmmModel2::Train(){
    nloglikelihood += sentNlogProb;
  }

  // maximization
  MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
  if(!learningInfo->neuralRepFilename.empty()) {
    NormalizeMleMeanAndUpdateMean(meanPerLabel,
                                  nNormalizingConstant);
  } else {
    // maximization
    MultinomialParams::NormalizeParams(thetaMle, learningInfo->multinomialSymmetricDirichletAlpha,
                                       false, true,
                                       learningInfo->variationalInferenceOfMultinomials);
    nlogTheta = thetaMle;
    nlogTheta = thetaMle;
  }
  MultinomialParams::NormalizeParams(gammaMle, learningInfo->multinomialSymmetricDirichletAlpha,
                                     false, true,
                                     learningInfo->variationalInferenceOfMultinomials);
@@ -264,7 +316,19 @@ void HmmModel2::Train(){
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
  //cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
  VectorFst<FstUtils::LogArc> fst;
  BuildThetaGammaFst(tokens, fst);
  BuildThetaGammaFst(tokens, fst, 55665566);

  VectorFst<FstUtils::StdArc> fst2, shortestPath;
  fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
  fst::ShortestPath(fst2, &shortestPath);
  std::vector<int> dummy;
  FstUtils::LinearFstToVector(shortestPath, dummy, labels);
  assert(labels.size() == tokens.size());
}
void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels, unsigned sentId) {
  //cerr << "inside HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels)" << endl;
  VectorFst<FstUtils::LogArc> fst;
  BuildThetaGammaFst(tokens, fst, sentId);

  VectorFst<FstUtils::StdArc> fst2, shortestPath;
  fst::ArcMap(fst, &fst2, FstUtils::LogToTropicalMapper());
@@ -273,3 +337,140 @@ void HmmModel2::Label(vector<int64_t> &tokens, vector<int> &labels) {
  FstUtils::LinearFstToVector(shortestPath, dummy, labels);
  assert(labels.size() == tokens.size());
}

void HmmModel2::Label(vector<vector<string> > &tokens, vector<vector<int> > &labels) {
  assert(labels.size() == 0);
  labels.resize(tokens.size());
  if(!learningInfo->neuralRepFilename.empty()) {
    cerr << "labeling with word embeddings\n";
    for(unsigned i = 0 ; i <tokens.size(); i++) {
      Label(observations[i], labels[i], i);
    }
  }
  else {
    for(unsigned i = 0 ; i <tokens.size(); i++) {
      UnsupervisedSequenceTaggingModel::Label(tokens[i], labels[i]);
    }
  }
}

void HmmModel2::Label(string &inputFilename, string &outputFilename) {
  std::vector<std::vector<std::string> > tokens;
  StringUtils::ReadTokens(inputFilename, tokens);
  vector<vector<int> > labels;
  Label(tokens, labels);
  StringUtils::WriteTokens(outputFilename, labels);
}

double HmmModel2::getGaussianPDF(int64_t yi, const Eigen::VectorNeural& zi) {
  if(zi.isConstant(Eigen::NONE)) {
    return 0;
  }

  const auto c = -0.5 * Eigen::NEURAL_SIZE * log(2 * M_PI);
  const auto& mean = neuralMean[yi];
  const auto& diff = zi - mean;

  //double inner_product = diff.transpose() * var_inverse * diff;
  double inner_product = diff.squaredNorm();
  if(std::isinf(inner_product)) {
    cerr << "inner product inf!\n";
    assert(false);
    return 2.0e100;
  }
  double log_pdf = c - 0.5 * inner_product;

  return -log_pdf;
}
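Note that, despite its name, getGaussianPDF returns a negative log-density, which is why it can be added directly to the arc weight. With the identity covariance used here it evaluates to

-\log \mathcal{N}(z_i \mid \mu_{y_i}, I) = \frac{d}{2}\log(2\pi) + \frac{1}{2}\lVert z_i - \mu_{y_i} \rVert^2, \qquad d = \text{Eigen::NEURAL\_SIZE},

and the commented-out line shows the general quadratic form diff^T * var_inverse * diff that a non-identity covariance (neuralVar) would require.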

const vector<Eigen::VectorNeural>& HmmModel2::GetNeuralSequence(int exampleId) {
  assert(exampleId < neuralRep.size());
  return neuralRep[exampleId];
}

void HmmModel2::UpdateMle(const unsigned sentId,
                          const VectorFst<FstUtils::LogArc> &fst,
                          const vector<FstUtils::LogWeight> &alphas,
                          const vector<FstUtils::LogWeight> &betas,
                          boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> > &meanPerLabel,
                          boost::unordered_map< int64_t, std::vector<LogVal<double >>> &nNormalizingConstant,
                          ConditionalMultinomialParam<int64_t> &gammaMle){

  // schedule for visiting states such that we know the timestep for each arc
  set<int> iStates, iP1States;
  iStates.insert(fst.Start());
  const vector<Eigen::VectorNeural>&x = GetNeuralSequence(sentId);
  const auto zeros = Eigen::VectorNeural::Zero(Eigen::NEURAL_SIZE, 1);
  // for each timestep
  for(int i = 0; i < x.size(); i++) {
    auto xI = x[i];

    // from each state at timestep i
    for(set<int>::const_iterator iStatesIter = iStates.begin();
        iStatesIter != iStates.end();
        iStatesIter++) {
      int fromState = *iStatesIter;

      // for each arc leaving this state
      for(ArcIterator< VectorFst<FstUtils::LogArc> > aiter(fst, fromState); !aiter.Done(); aiter.Next()) {
        const FstUtils::LogArc &arc = aiter.Value();
        int yIM1 = arc.ilabel;
        int yI = arc.olabel;
        const FstUtils::LogWeight &arcWeight = arc.weight;
        int toState = arc.nextstate;

        // compute marginal weight of passing on this arc
        const FstUtils::LogWeight nlogWeight(fst::Times(arcWeight , fst::Times(betas[toState], alphas[fromState])));
        double nlogProb = fst::Divide(nlogWeight, betas[0]).Value();
        if(nlogProb < -1.0 || std::isinf(nlogProb) || std::isnan(nlogProb)) {
          cerr << "FATAL ERROR: nlogProb = " << nlogProb << " = alpha + arcWeight + beta - betas[0] = " << alphas[fromState].Value() << " + " << arcWeight.Value() << " + " << betas[toState].Value() << " - " << betas[0].Value() << endl << "will terminate." << endl;
          assert(false);
        }
        // fix precision problems
        if(nlogProb < 0) {
          nlogProb = 0;
        }
        double prob = MultinomialParams::nExp(nlogProb);
        meanPerLabel[yI].push_back(xI);
        nNormalizingConstant[yI].push_back(LogVal<double>(-nlogProb, init_lnx()));
        gammaMle[yIM1][yI] += prob;

        // prepare the schedule for visiting states in the next timestep
        iP1States.insert(toState);
      }
    }

    // prepare for next timestep
    iStates = iP1States;
    iP1States.clear();
  }
}
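For reference, the per-arc quantity accumulated above is the standard forward-backward arc posterior, computed in negative log space (fst::Times adds negative logs, fst::Divide subtracts them):

-\log p(\text{arc}) = \alpha(\text{from}) + w(\text{arc}) + \beta(\text{to}) - \beta(\text{start}),

so betas[0] plays the role of the sentence's partition function. nExp converts the result back to a probability for the gammaMle counts, while nNormalizingConstant keeps the log-space value for the later mean update.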

void HmmModel2::NormalizeMleMeanAndUpdateMean(boost::unordered_map< int64_t, std::vector<Eigen::VectorNeural> >& means,
                                              boost::unordered_map< int64_t, std::vector<LogVal<double>>>& nNormalizingConstant) {

  boost::unordered_map<int64_t, LogVal<double>> sum;
  // init
  for(auto y:yDomain) {
    sum[y] = LogVal<double>::Zero();
  }

  // sum
  for (const auto& t : nNormalizingConstant) {
    sum[t.first] += std::accumulate(t.second.begin(), t.second.end(), LogVal<double>::Zero());
  }

  // clear
  for(auto y:yDomain) {
    neuralMean[y].setZero(Eigen::NEURAL_SIZE,1);
  }

  for (auto y : yDomain) {
    for (auto i = 0; i < means[y].size(); i++) {
      const auto weight = (nNormalizingConstant[y][i] / sum[y]).as_float();
      neuralMean[y] += weight * means[y][i];
    }
  }
}
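In other words, the final loop implements a posterior-weighted mean of the embeddings collected for each label during the E-step:

\mu_y = \sum_i w_{y,i}\, x_{y,i}, \qquad w_{y,i} = \frac{p_{y,i}}{\sum_j p_{y,j}},

where p_{y,i} is the arc posterior stored in nNormalizingConstant[y][i] and x_{y,i} the matching embedding in means[y][i]; the normalization is done in log space via LogVal before each weight is converted to a float.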
Reviewer comment: Please remove the additional parameter. We can read the number of dimensions of the word embeddings directly in the code rather than at compile time.
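A hypothetical sketch of that suggestion (all names illustrative, not code from this PR): use a dynamically sized vector type and take the embedding dimensionality from the loaded data at runtime, so no NEURAL_SIZE compile-time constant is needed.

#include <cassert>
#include <cstdint>
#include <map>
#include <set>
#include <Eigen/Dense>

using NeuralVector = Eigen::VectorXd;  // dynamic size instead of a fixed NEURAL_SIZE

// initialize one random mean per label, with the dimension read from the data
void InitNeuralMeans(const std::map<int64_t, NeuralVector> &wordEmbeddings,
                     const std::set<int64_t> &yDomain,
                     std::map<int64_t, NeuralVector> &neuralMean) {
  assert(!wordEmbeddings.empty());
  const auto dim = wordEmbeddings.begin()->second.size();  // read at runtime
  for (int64_t y : yDomain) {
    neuralMean[y] = NeuralVector::Random(dim);
  }
}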