diff --git a/global.R b/global.R index 544d47e..1986f76 100644 --- a/global.R +++ b/global.R @@ -12,12 +12,37 @@ library(quanteda) library(pdftools) library(readtext) library(msgxtractr) +library(transport) #Enable bookmarking via url #enableBookmarking(store = "server") #Custom functions +#Jensen-Shannon divergence +JensenShannon <- function(query, matrix){ + p <- query[1,] + q <- t(matrix) # transpose matrix + m = (p + q)/2 + KLDp = mapply(KLD, p, m) + KLDq = mapply(KLD, p, m) + jsd = sqrt(0.5*(KLDp + KLDq)) + jsd[is.nan(jsd)] <- 0 + return (jsd) +} + +#Kullback Leibler divergence +KLD <- function(a,b){ + sum(a * log(a/b)) +} + +#Earth Movers distance +EarthMoversDistance <- function(query, matrix){ + matrix <- t(matrix) + emdismatrix = apply(matrix, 1,function(i) { wasserstein1d(query, i, p = 1, wa = NULL, wb = NULL)}) + return(emdismatrix) +} + CiteListPubmed <- function(elinkxml){ #Function to return data frame of citations for articles in entrez link xml document. #Input: diff --git a/server.R b/server.R index f0b97e3..458affe 100644 --- a/server.R +++ b/server.R @@ -2463,13 +2463,25 @@ if((file.exists(paste0(getwd(),"/ToPMine/topicalPhrases/win_run.bat")) == TRUE) topicvec <- c(temp$theta) names(topicvec) <- paste("Topic", c(1:length(topicvec))) + ##Distance Metrics #Find the cosine distance between search document and all existing documents by topic proportions + # jsdist <- JensenShannon(temp$theta, topicmodel$TopicModel$theta) cosdist <- proxy::dist(x = temp$theta, y = topicmodel$TopicModel$theta, method = "cosine") + # emdist <- EarthMoversDistance(temp$theta, t(topicmodel$TopicModel$theta)) + # browser() - #Get document order from closest match to furthest + #Get document order from closest match to furthest using Jensen Shannon + # closedocs <- topicmodel$Metadata$PMID[order(jsdist)] + # distperc <- jsdist[order(jsdist)] + + #Get document order from closest match to furthest using Cosine Distance closedocs <- topicmodel$Metadata$PMID[order(cosdist)] distperc <- cosdist[order(cosdist)] + #Get document order from closest match to furthest using Earth Movers Distance + # closedocs <- topicmodel$Metadata$PMID[order(emdist)] + # distperc <- emdist[order(emdist)] + # #Old cosine distance code that calculates distance between all documents instead of just the required distances to the new document # #Find the cosine distance between search document and all existing documents by topic proportions # cosdist <- proxy::dist(rbind(temp$theta, topicmodel$TopicModel$theta), method = "cosine")