Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python 3 compatibility, Fixes for non-standard URL issues, Link predictions for entities, Integration to Django UI #1

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea/
Empty file added __init__.py
Empty file.
74 changes: 37 additions & 37 deletions ekl_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# To train the embeddings for a given knowledge graph and event dataset,
# set the path to the KG in *path_to_kg* (and, optionally, the sequence dataset in *path_to_sequence*),
# fiddle with the parameter settings, then run:
# fiddle with the parameter settings, then run:
# python ekl_experiment.py

# Up to now there is no flag to switch to GPU support, but this should be
Expand Down Expand Up @@ -44,28 +44,28 @@
from event_models.LinearEventModel import Skipgram, ConcatenationFull, ConcatenationCause, Average
from event_models.Autoencoder import ConvolutionalAutoEncoder, LSTMAutoencoder

from prep.batch_generators import SkipgramBatchGenerator, TripleBatchGenerator, PredictiveEventBatchGenerator, FuturePredictiveBatchGenerator, AutoEncoderBatchGenerator
from prep.batch_generators import SkipgramBatchGenerator, TripleBatchGenerator, PredictiveEventBatchGenerator, \
FuturePredictiveBatchGenerator, AutoEncoderBatchGenerator
from prep.etl import embs_to_df, prepare_sequences, message_index
from prep.preprocessing import PreProcessor
from experiments.experiment_helper import slice_ontology, get_kg_statistics, get_low_dim_embs, get_zero_shot_scenario, \
cross_parameter_eval, TranslationModels, embs_to_df, Parameters, evaluate_on_test

rnd = np.random.RandomState(42)


if __name__ == '__main__':
####### PATH PARAMETERS ########
base_path = "./clones/" # "./traffic_data/"
base_path = "./clones/" # "./traffic_data/"
path_to_store_model = base_path + "Embeddings/"
path_to_events = base_path + "Sequences/"
path_to_kg = base_path + "Ontology/clones_enhanced.rdf" # "Ontology/traffic_individuals.xml" #
path_to_kg = base_path + "Ontology/clones_enhanced.rdf" # "Ontology/traffic_individuals.xml" #
path_to_store_sequences = base_path + "Sequences/"
path_to_store_embeddings = base_path + "Embeddings/"
traffic_data = False
path_to_sequence = base_path + 'Sequences/sequence.txt'
num_sequences = None
pre_train = False
supp_event_embeddings = None # base_path + "Embeddings/supplied_embeddings.pickle"
supp_event_embeddings = None # base_path + "Embeddings/supplied_embeddings.pickle"
preprocessor = PreProcessor(path_to_kg)
tk = None
bern_probs = None
Expand Down Expand Up @@ -114,15 +114,15 @@
param_dict = {}
param_dict['embedding_size'] = [40]
param_dict['seq_data_size'] = [1.0]
param_dict['batch_size'] = [32] # [32, 64, 128]
param_dict['learning_rate'] = [0.2] # [0.5, 0.8, 1.0]
param_dict['lambd'] = [0.001] # regularizer (RESCAL)
param_dict['alpha'] = [0.5] # event embedding weighting
param_dict['batch_size'] = [32] # [32, 64, 128]
param_dict['learning_rate'] = [0.2] # [0.5, 0.8, 1.0]
param_dict['lambd'] = [0.001] # regularizer (RESCAL)
param_dict['alpha'] = [0.5] # event embedding weighting
eval_step_size = 1000
num_epochs = 100
num_negative_triples = 2
test_proportion = 0.2
validation_proportion = 0.1 # 0.1
validation_proportion = 0.1 # 0.1
bernoulli = True
fnsim = l2_similarity

Expand All @@ -138,8 +138,8 @@

# SKIP Parameters
if event_layer is not None:
param_dict['num_skips'] = [2] # range(5, 9)
param_dict['num_sampled'] = [7] # [5, 8]
param_dict['num_skips'] = [2] # range(5, 9)
param_dict['num_sampled'] = [7] # [5, 8]
shared = True

if traffic_data:
Expand Down Expand Up @@ -176,7 +176,7 @@
param_combs = cross_parameter_eval(param_dict)
for comb_num, tmp_param_dict in enumerate(param_combs):
params = Parameters(**tmp_param_dict)
num_steps = (train_size / params.batch_size) * num_epochs
num_steps = int((train_size / params.batch_size) * num_epochs)

print("Progress: {0} prct".format(int((100.0 * comb_num) / len(param_combs))))
print("Embedding size: ", params.embedding_size)
Expand All @@ -202,34 +202,34 @@
elif event_layer == ConcatenationCause:
sg = PredictiveEventBatchGenerator(sequences, num_skips, rnd)
event_model = event_layer(num_entities, vocab_size, params.embedding_size, num_skips, shared=shared,
alpha=params.alpha)
alpha=params.alpha)
else:
batch_size_sg = 0
num_sampled = 0
event_model = None
pre_train = False

# Model Selection
if model_type == TranslationModels.Trans_E:
param_list = [num_entities, num_relations, params.embedding_size, params.batch_size,
batch_size_sg, num_sampled, vocab_size, fnsim, params.learning_rate,
event_model]
batch_size_sg, num_sampled, vocab_size, fnsim, params.learning_rate,
event_model]
model = TransE(*param_list)
elif model_type == TranslationModels.Trans_H:
param_list = [num_entities, num_relations, params.embedding_size, params.batch_size,
batch_size_sg, num_sampled, vocab_size, params.learning_rate, event_model,
params.lambd]
batch_size_sg, num_sampled, vocab_size, params.learning_rate, event_model,
params.lambd]
model = TransH(*param_list)
elif model_type == TranslationModels.RESCAL:
param_list = [num_entities, num_relations, params.embedding_size, params.batch_size,
batch_size_sg, num_sampled, vocab_size, params.learning_rate, event_model,
params.lambd]
batch_size_sg, num_sampled, vocab_size, params.learning_rate, event_model,
params.lambd]
model = RESCAL(*param_list)
elif model_type == TranslationModels.TEKE:
pre_trainer = EmbeddingPreTrainer(unique_msgs, SkipgramBatchGenerator(sequences, num_skips, rnd),
pre_train_embeddings)
pre_train_embeddings)
initE = pre_trainer.get(pre_train_steps, params.embedding_size, batch_size_sg, num_sampled, vocab_size,
num_entities)
num_entities)
tk = TEKEPreparation(sequences, initE, num_entities)
param_list = [num_entities, num_relations, params.embedding_size, params.batch_size, fnsim, tk]
model = TEKE(*param_list)
Expand All @@ -252,9 +252,9 @@
if pre_train and model_type != TranslationModels.TEKE:
# TODO: adapt to selected event_model for pre-training
pre_trainer = EmbeddingPreTrainer(unique_msgs, SkipgramBatchGenerator(sequences, num_skips, rnd),
pre_train_embeddings)
pre_train_embeddings)
initE = pre_trainer.get(pre_train_steps, params.embedding_size, batch_size_sg, num_sampled, vocab_size,
num_entities)
num_entities)
session.run(model.assign_initial(initE))

if store_embeddings:
Expand All @@ -268,9 +268,9 @@
valid_batch_pos, _ = valid_tg.next(valid_size)

feed_dict = {
model.inpl: batch_pos[1, :], model.inpr: batch_pos[0, :], model.inpo: batch_pos[2, :],
model.inpln: batch_neg[1, :], model.inprn: batch_neg[0, :], model.inpon: batch_neg[2, :],
model.global_step: b
model.inpl: batch_pos[1, :], model.inpr: batch_pos[0, :], model.inpo: batch_pos[2, :],
model.inpln: batch_neg[1, :], model.inprn: batch_neg[0, :], model.inpon: batch_neg[2, :],
model.global_step: b
}

if event_model is not None and not model_type == TranslationModels.TEKE:
Expand Down Expand Up @@ -337,26 +337,26 @@
encoding='utf-8')

# TODO: store the embeddings of the best model only (not the last one)
df_embs = embs_to_df(entity_embs[len(entity_embs)-1], reverse_entity_dictionary)
df_embs = embs_to_df(entity_embs[len(entity_embs) - 1], reverse_entity_dictionary)
df_embs.to_csv(path_to_store_embeddings + "entity_embeddings" + '_last_cleaned' + ".csv", sep=',',
encoding='utf-8')
encoding='utf-8')

# Reset graph, load best model and apply to test data set
with open(base_path + 'evaluation_parameters_' + model_name +
'_best.csv', "wb") as eval_file:
'_best.csv', "w") as eval_file:
writer = csv.writer(eval_file)
results, relation_results = evaluate_on_test(model_type, best_param_list, test_tg, save_path_global, test_size,
reverse_relation_dictionary)
writer.writerow (
["relation", "embedding_size", "batch_size", "learning_rate", "num_skips", "num_sampled",
"batch_size_sg", "mean_rank", "mrr", "hits_top_10", "hits_top_3", "hits_top_1"]
reverse_relation_dictionary)
writer.writerow(
["relation", "embedding_size", "batch_size", "learning_rate", "num_skips", "num_sampled",
"batch_size_sg", "mean_rank", "mrr", "hits_top_10", "hits_top_3", "hits_top_1"]
)
writer.writerow(
['all', params.embedding_size, params.batch_size, params.learning_rate, num_skips, num_sampled,
batch_size_sg, results[0], results[1], results[2], results[3], results[4]]
)
for rel in relation_results:
writer.writerow (
writer.writerow(
[rel, params.embedding_size, params.batch_size, params.learning_rate, num_skips, num_sampled,
batch_size_sg, relation_results[rel]['MeanRank'], relation_results[rel]['MRR'],
relation_results[rel]['Hits@10'], relation_results[rel]['Hits@3'], relation_results[rel]['Hits@1']]
Expand Down
Loading