library(tidyverse)
## παίρνει τις διαδρομές και τις κάνει character vectors
parseData <- function(data_file, skip) {
sapply(Filter(function(x) {nchar(x) > 2}, read_lines(data_file, skip = skip)), function(x) { strsplit(x, " ") })
}
topics <- c("frontpage", "news", "tech", "local", "opinion", "on-air", "misc", "weather", "msn-news",
"health", "living", "business", "msn-sports", "sports", "summary", "bbs", "travel")
## εδώ για να προσθέτουμε τα resets σε κάθε διαδρομή, αυτή θα είναι ένα συν την τελευταία κατάσταση
addReset <- function(path, k, states) {
RESET_ELEM <- "-1"
if (k > 0) {
c(as.character(array(RESET_ELEM, k)), path, as.character(RESET_ELEM))
} else {
c(path, as.character(RESET_ELEM))
}
}
updateOrInit <- function(env,target,value) {
if (is.null(env[[target]])) {
env[[target]] <- new.env(hash = TRUE)
env[[target]][[value]] <- 1
} else {
if (is.null(env[[target]][[value]])) {
env[[target]][[value]] <-1
} else {
env[[target]][[value]] <- env[[target]][[value]] + 1
}
}
}
fitPaths <- function(paths, k, states) {
FAKE_ELEM <- "0"
result <- new.env(hash = TRUE)
for (i in paths) {
path <- addReset(i, k, states)
len <- length(path)
for(j in k:(len-1)) {
value <- path[[j+1]]
if (k == 0) {
updateOrInit(result,FAKE_ELEM,value)
} else {
if (k == 1) {
target <- as.character(path[[j]])
}
else {
target <- paste(path[(j-k+1):j], collapse=",")
}
updateOrInit(result,target,value)
}
}
}
return(lapply(result, "as.list"))
}
## πιθανότητες μετάβασης
mlePaths <- function(fitted) {
lapply(fitted, function(x) {
sum <- Reduce("+", x)
lapply(x, function(x) { x / sum})
})
}
loglikelihood <- function(freqs) {
unlisted_freqs <- unlist(freqs)
probs <- unlist(mlePaths(freqs))
sum(unlisted_freqs * log(probs))
}
## βαθμοί ελευθερίας
degreesOfFreedom <- function(states, k_null, k_test) {
(states^k_test - states^k_null) * (states -1)
}
likelihoodRatio <- function(likelihood_null, likelihood_test) {
-2 * (likelihood_null - likelihood_test)
}
aic <- function(likelihood_ratio, dof) {
likelihood_ratio - 2 * dof
}
bic <- function(likelihood_ratio, dof, observation_count) {
likelihood_ratio - dof * log(observation_count)
}
simulation <- function(input, k, states, topics = topics, skip = 7) {
data <- parseData(input, skip)
freqs <- sapply(0:k, function (x) { fitPaths(data, x, states) })
zero_order <- tail(freqs[[1]][[1]], n = states) # remove reset element count for the calculation
zero_order <- zero_order[as.character(sort(as.numeric(names(zero_order))))] # sort named list by numeric character
observation_count <- sum(unlist(zero_order))
freq_percentages <- tibble(ID = 1:states,
Topic = topics,
Frequency = sapply(zero_order, function(x) { x / observation_count }))
loglikelihoods <- tibble(Order = 0:k,
Loglikelihood = sapply(0:k, function (x) { loglikelihood(freqs[[x+1]]) }))
df <- tibble(Null = integer(),
Test = integer(),
LR = numeric(),
dofs = numeric(),
p_value = numeric(),
AIC = numeric(),
BIC = numeric())
for (i in 0:(k-1)) {
for (j in (i+1):k) {
likelihood_ratio <- likelihoodRatio(loglikelihoods[[i+1, 2]],loglikelihoods[[j+1, 2]])
dof <- degreesOfFreedom(states + 1, i, j)
p <- 1 - pchisq(likelihood_ratio, dof)
AIC <- aic(likelihood_ratio, dof)
BIC <- bic(likelihood_ratio, dof, observation_count)
df <- df %>% add_row(Null = i,
Test = j,
LR = likelihood_ratio,
dofs = dof,
p_value = p,
AIC = AIC,
BIC = BIC)
}
}
return(list(frequencies = freq_percentages, loglikelihoods = loglikelihoods, results = df))
}
## plotting and data retrieval
freqCol <- function(freq_percentages) {
ggplot(data = freq_percentages) +
geom_col(mapping = aes(x = ID, y = Frequency, fill = Topic)) +
labs(title = "Συχνότητες εμφάνισης θεματικών στο δείγμα")
}
loglikelihoodGraph <- function(loglikelihoods) {
max <- loglikelihoods %>% filter(Loglikelihood == max(Loglikelihood))
print(max)
ggplot(data = loglikelihoods) +
geom_path(mapping = aes(x = Order, y = Loglikelihood)) +
geom_point(mapping = aes(x = Order, y = Loglikelihood),
shape = 15) +
geom_point(mapping = aes(x = max$Order, y = max(Loglikelihood)),
colour = "red",
size = 3) +
labs(title = "Λογαριθμο-πιθανοφάνειες για την ανάλογη τάξη Μαρκοβιανής αλυσίδας")
}
likelihoodRatioTable <- function(results) {
results[c(1,2,3)]
}
aicGraph <- function(results, test_model) {
aic <- results %>% filter(Test == test_model) %>% select(Null, AIC)
min_null <- aic %>% filter(AIC == min(AIC))
ggplot(data = aic) +
geom_path(mapping = aes(x = Null, y = AIC)) +
geom_point(mapping = aes(x = Null, y = AIC),
shape = 15) +
geom_point(mapping = aes(x = min_null$Null, y = min(AIC)),
colour = "red",
size = 3) +
labs(title = paste("AIC με εναλλακτική υπόθεση = ", test_model))
}
bicGraph <- function(results, test_model) {
bic <- results %>% filter(Test == test_model) %>% select(Null, BIC)
min_null <- bic %>% filter(BIC == min(BIC))
ggplot(data = bic) +
geom_path(mapping = aes(x = Null, y = BIC)) +
geom_point(mapping = aes(x = Null, y = BIC),
shape = 15) +
geom_point(mapping = aes(x = min_null$Null, y = min(BIC)),
colour = "red",
size = 3) +
labs(title = paste("BIC με εναλλακτική υπόθεση = ", test_model))
}