# Load packages
library(plyr)
library(dplyr)
library(tm)
library(gplots)
library(ggplot2)
library(quanteda)
library(readr)
library(seededlda)
library(slam)
library(jsonlite)
library(tm)
library(tidyr)
library(knitr)
<- readRDS("data/processed/cleaned.RDS")
light
#Set up the parameters
<- light %>%
light_interval ::mutate(span = as.factor(cut(year,
dplyrbreaks = c(seq(from = 1945, to = 2022, by = 10), 2022)))) %>%
::arrange(year)
dplyr
# I added two additional stop words that aren't captured in the generic stop words dictionary.
<- c("will", "must")
mystopwords <- c(stopwords("english"), mystopwords) custom_stopwords
Dynamic UNGDC (updated)
This version does use tf-idf for LDA analysis. For the older version, refer to UNGDC_topic_modeling.qmd
. I created a separate version for two reasons. First, some of the functions and options deprecated from the quanteda
R package. Earlier version might not be reproducible. Second, the inclusion of tf-idf to generate LDA analysis has a tradeoff. Since it gives less weights to terms that appear frequently across the documents, by definition, tf-idf lowers the correlation between terms over different time window. It is harder to notice a clear linkage between two topics represented by different terms. However, unlike the earlier version that excludes tf-idf, topics are more specific, and substantively meaningful.
- TO DO: Reduce the number of topics from 10 to 5. Tf-idf increased the between-document variation and relatively decreased the commonality across the documents. It became harder to find a common topic, and it reducing the number of topics might help. I also need to implement moving window to produce a smoother transition of topics over time.
Dynamic Topic Modeling for UNGDC
In order to generate LDA topic modeling results for the corpus of UNGD, I split the corpus into different time frames. The entire time span of 1945 until 2022 is split into 8 intervals, with a duration of 10 years.
Term Frequency-Inverse Matrix and Descriptive Data Visualization
- To inspect the data and frequent words across time intervals, below code generates top-20 terms based on the tf-idf scores.
- Input dataset: “data/processed/cleaned.RDS”.
# Function for generating tf_idf and plots.
sapply(levels(light_interval$span), function(i) {
<- light_interval %>% dplyr::filter(span %in% i)
subset_i <- Corpus(VectorSource(subset_i$text))
corpus_subset <- TermDocumentMatrix(corpus_subset,
tdm control = list(weighting = weightTfIdf,
removePunctuation = TRUE,
stemming = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removewords = mystopwords))
<- slam::row_sums(as.matrix(tdm))
top_terms
# Create a data frame with terms and tfidf values
<- data.frame(term = names(top_terms), tfidf = top_terms)
top_terms_df
# Order the terms by tfidf value
<- top_terms_df[order(top_terms_df$tfidf, decreasing = TRUE), ]
top_terms_df
# Select the top 20 terms
<- head(top_terms_df, 20)
top_terms_df
<- ggplot(top_terms_df, aes(x = reorder(term, tfidf), y = tfidf)) +
figure_i geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(title = "Top 20 Terms by TF-IDF",
x = "Terms",
y = "TF-IDF Score") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
<- file.path("figs/", paste0("plot_", i, ".png"))
output_file ggsave(output_file, figure_i, width = 8, height = 5, units = "in")
})
- dfm function helps remove stop words and perform other preprocessing steps to create a more refined document-feature matrix. Additionally, the subsequent dfm_tfidf function is used to compute TF-IDF (Term Frequency-Inverse Document Frequency) scores, which down-weights terms that appear frequently across documents.
LDA Topic Modeling
In this section, I perform topic modeling using LDA, an unsupervised method to estimate vectors that represent topics within a speech.It assumes that each speech document is a mixture of topics, and each word is attributable to one of the document’s topics.
Current parameters in the below code sets the number of topics as 10, and uses the TF-IDF scores.
In order not to replicate the entire process of LDA modeling, download this folder that contains lda outputs here
[`output/lda/decade_0120_replicate`
](https://drive.google.com/drive/folders/1woSqpayZGCozaFe1NoZuEYzdx7zv32wr?usp=share_link).
LDA Generator Updated Function
## New version since dfm() options deprecated within the quanteda package.
<- function(corpus, span_levels, num_topics = 10) {
lda_generator <- "output/lda/decade_0120_replicate"
output_dir if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
for (i in span_levels) {
<- corpus[corpus$span == i, ]
subset
# Use tokens() instead of dfm() for tokenization
<- tokens(subset$text,
subset_tokens remove_numbers = TRUE,
remove_punct = TRUE,
split_hyphens = TRUE) %>%
tokens_wordstem() %>%
tokens_tolower() %>%
tokens_remove(pattern = custom_stopwords) %>%
tokens_compound(pattern = phrase(c("human rights*", "united nations")))
# Apply TF-IDF transformation directly on tokens
<- dfm(subset_tokens) %>%
subset_tfidf dfm_tfidf(scheme_tf = "count", scheme_df = "inverse")
<- system.time({
timing <- file.path(output_dir, paste0("dfm_", i, ".RDS"))
dfm_output_file saveRDS(convert(subset_tfidf, to="data.frame"), dfm_output_file)
<- textmodel_lda(subset_tfidf, k = num_topics)
tmod_lda <- file.path(output_dir, paste0("lda_model_", i, ".RDS"))
lda_output_file saveRDS(tmod_lda, lda_output_file)
})
cat(sprintf("Time taken for %s: %s seconds\n", i, timing[3]))
}
}
# Loop through each decade and save separate LDA results.
<- levels(light_interval$span)
span_levels #lda_generator(light_interval, span_levels)
Reading in LDA results
After running the LDA model, I read in each LDA results as a separate element in a list form. Below code prins out top 10 terms associated with each topic in the LDA models for different span levels. Each row represents one semantic topic.
<- function(span_levels, output_dir = "output/lda/decade_0120_replicate") {
read_lda_models <- list()
lda_models
for (i in span_levels) {
<- file.path(output_dir, paste0("lda_model_", i, ".RDS"))
lda_output_file
if (file.exists(lda_output_file)) {
<- readRDS(lda_output_file)
lda_model <- lda_model
lda_models[[i]] cat(sprintf("LDA model for %s successfully loaded.\n", i))
else {
} cat(sprintf("LDA model file for %s not found.\n", i))
}
}
return(lda_models)
}
<- read_lda_models(span_levels) lda_models
LDA model for (1945,1955] successfully loaded.
LDA model for (1955,1965] successfully loaded.
LDA model for (1965,1975] successfully loaded.
LDA model for (1975,1985] successfully loaded.
LDA model for (1985,1995] successfully loaded.
LDA model for (1995,2005] successfully loaded.
LDA model for (2005,2015] successfully loaded.
LDA model for (2015,2022] successfully loaded.
<- function(lda_models, span_levels) {
topic_tables <- list()
topic_tables
for (i in span_levels) {
if (i %in% names(lda_models)) {
<- lda_models[[i]]
lda_model <- terms(lda_model, 10)
terms <- data.frame(Terms = terms)
topic_table <- topic_table
topic_tables[[i]] else {
} cat(sprintf("LDA model for %s not found.\n", i))
}
}
<- do.call(rbind, topic_tables)
all_topics return(all_topics)
}
<- topic_tables(lda_models, span_levels)
topic_tables print(knitr::kable(topic_tables))
| |Terms.topic1 |Terms.topic2 |Terms.topic3 |Terms.topic4 |Terms.topic5 |Terms.topic6 |Terms.topic7 |Terms.topic8 |Terms.topic9 |Terms.topic10 |
|:--------------|:-------------|:------------|:------------|:------------|:------------|:------------|:------------|:------------|:-------------|:-------------|
|(1945,1955].1 |ussr |arab |german |argentin |bolivia |hyderabad |netherland |resumpt |communist |india |
|(1945,1955].2 |soviet |israel |czechoslovak |latin |cuba |egypt |bandung |korea |soviet |australian |
|(1945,1955].3 |yugoslav |palestin |polish |trade |greek |india |african |collect |chines |commiss |
|(1945,1955].4 |atom |jerusalem |soviet |chile |greec |sudan |south |recommend |communism |council |
|(1945,1955].5 |yugoslavia |morocco |germani |uruguay |dominican |egyptian |geneva |independ |china |think |
|(1945,1955].6 |armament |tunisia |people’ |veto |guatemala |pakistan |africa |leader |korea |say |
|(1945,1955].7 |union |jew |poland |american |colombia |el |indonesia |europ |union |veto |
|(1945,1955].8 |prohibit |refuge |weapon |panama |bolivian |salvador |thailand |veto |imprison |soviet |
|(1945,1955].9 |weapon |jewish |hydrogen |venezuela |guatemalan |sudanes |zealand |revis |costa |china |
|(1945,1955].10 |american |franc |european |per |cuban |british |asia |collabor |mainland |arbitr |
|(1955,1965].1 |particip |bantu |malaysia |cambodia |netherland |pakistan |american |african |german |arab |
|(1955,1965].2 |moscow |iceland |zealand |lao |congo |cyprus |cuba |africa |socialist |israel |
|(1955,1965].3 |industri |south |australia |spain |indonesia |turkish |cuban |portug |soviet |palestin |
|(1955,1965].4 |coexist |indian |philippin |communist |irian |kashmir |panama |mali |czechoslovak |egypt |
|(1955,1965].5 |fund |canada |malaya |viet |nigeria |india |guatemala |portugues |albania |algerian |
|(1955,1965].6 |cent |canadian |australian |spanish |berlin |turkey |dominican |somali |romanian |libya |
|(1955,1965].7 |trade |danish |feder |chines |belgian |greek |venezuela |congo |germani |franc |
|(1955,1965].8 |scienc |goa |india |cambodian |indonesian |greec |latin |austrian |byelorussian |libyan |
|(1955,1965].9 |concept |africa |indonesia |pathet |west |nepal |america |cameroon |nato |canal |
|(1955,1965].10 |invest |chairman |manila |royal |want |jammu |paraguay |malagasi |albanian |jordan |
|(1965,1975].1 |like |pakistan |african |imperialist |like |israel |austria |cuba |turkey |haiti |
|(1965,1975].2 |programm |india |ghana |khmer |socialist |arab |spain |iceland |cyprus |oil |
|(1965,1975].3 |nuclear |zealand |rwanda |aggress |soviet |isra |salvador |venezuela |turkish |australia |
|(1965,1975].4 |big |ireland |portug |imperi |german |palestin |italian |cuban |argentina |haitian |
|(1965,1975].5 |neighbour |netherland |uganda |revisionist |mongolian |zionist |el |panama |greec |volta |
|(1965,1975].6 |space |japan |burundi |albania |byelorussian |palestinian |itali |bolivia |argentin |philippin |
|(1965,1975].7 |strategi |burma |africa |albanian |ssr |yemen |gibraltar |zair |peru |price |
|(1965,1975].8 |franc |kashmir |portugues |viet |czechoslovak |israel’ |rica |chile |greek |upper |
|(1965,1975].9 |youth |fiji |kenya |cambodia |ukrainian |aggress |hondura |latin |brazil |food |
|(1965,1975].10 |madam |pacif |oau |chines |europ |iraq |uruguay |dominican |dahomey |australian |
|(1975,1985].1 |malta |soviet |ireland |imperialist |turkey |benin |guinea |pleasur |guatemala |¬ |
|(1975,1985].2 |zionist |japan |panama |vietnames |yemen |burundi |papua |dialog |uganda |barbado |
|(1975,1985].3 |bahama |europ |ecuador |kampuchea |egypt |mali |zealand |cooper |timor |paraguay |
|(1975,1985].4 |itali |german |latin |chines |morocco |rwanda |pacif |program |nicaragua |ghana |
|(1975,1985].5 |iranian |union |bolivia |lao |turkish |seneg |chad |per |guinea |tion |
|(1975,1985].6 |iraqi |socialist |dominican |thailand |pakistan |zair |bangladesh |refuge |hondura |guyana |
|(1975,1985].7 |mediterranean |mongolian |rica |nam |sudan |oau |australia |india |salvador |ment |
|(1975,1985].8 |tobago |poland |costa |viet |arab |kenya |surinam |fortieth |revolutionari |con |
|(1975,1985].9 |libyan |austria |spain |ethiopia |islam |chad |equatori |sea |guatemalan |venezuela |
|(1975,1985].10 |islam |detent |american |romania |isra |mauritania |canada |indian |angola |caribbean |
|(1985,1995].1 |wish |wish |wish |panama |wish |wish |wish |wish |wish |wish |
|(1985,1995].2 |paraguay |co |islam |burundi |guinea |canada |european |saint |cooper |malawi |
|(1985,1995].3 |american |organis |arab |myanmar |pacif |netherland |europ |bahama |boutro |african |
|(1985,1995].4 |latin |program |sri |rwanda |viet |philippin |ukrain |nepal |eighth |chad |
|(1985,1995].5 |bolivia |dialog |iranian |romania |nam |want |albania |caribbean |bosnia |niger |
|(1985,1995].6 |dominican |twelv |lebanon |zair |japan |revolut |belarus |pakistan |herzegovina |swaziland |
|(1985,1995].7 |ecuador |align |lanka |panamanian |equatori |canadian |poland |barbado |somalia |uganda |
|(1985,1995].8 |hondura |namibia |iraqi |belgium |solomon |let |csce |haiti |l993 |kenya |
|(1985,1995].9 |chile |disarma |ireland |canal |zealand |enemi |austria |surinam |fiftieth |benin |
|(1985,1995].10 |costa |drug |tunisia |rwandes |papua |children |croatia |india |npt |angola |
|(1995,2005].1 |outset |outset |outset |outset |outset |outset |outset |outset |outset |outset |
|(1995,2005].2 |african |azerbaijan |island |korea |croatia |marino |trinidad |afghanistan |arab |sri |
|(1995,2005].3 |africa |cyprus |caribbean |korean |european |san |tobago |taliban |iraq |ethiopia |
|(1995,2005].4 |guinea |tajikistan |pacif |nepal |herzegovina |women |belarus |swaziland |israel |lanka |
|(1995,2005].5 |congo |armenia |saint |pakistan |mongolia |sixtieth |slovakia |ecuador |palestinian |eritrea |
|(1995,2005].6 |malawi |turkmenistan |papua |ireland |kosovo |iraq |panama |bolivia |isra |andorra |
|(1995,2005].7 |chad |turkey |bahama |thailand |€ |outcom |haiti |myanmar |lebanon |eritrean |
|(1995,2005].8 |burundi |kazakhstan |barbado |asean |latvia |weapon |mexico |estonia |malta |cuba |
|(1995,2005].9 |liberia |georgia |solomon |monaco |bosnia |document |guatemala |chile |kuwait |truth |
|(1995,2005].10 |uganda |turkish |small |india |bulgaria |uruguay |dominican |paraguay |iraqi |muslim |
|(2005,2015].1 |everi |everi |everi |everi |everi |everi |everi |everi |everi |everi |
|(2005,2015].2 |nepal |pakistan |serbia |japan |mdgs |guinea |island |arab |ecuador |azerbaijan |
|(2005,2015].3 |iceland |iran |fiji |timor |treati |african |sid |yemen |panama |georgia |
|(2005,2015].4 |trinidad |muslim |european |mongolia |g |korea |pacif |kuwait |paraguay |asean |
|(2005,2015].5 |burundi |islam |kosovo |ireland |nuclear |mali |solomon |syrian |marino |kazakhstan |
|(2005,2015].6 |sri |god |bosnia |lest |mediat |somalia |saint |iraq |peru |ukrain |
|(2005,2015].7 |tobago |war |herzegovina |bangladesh |disput |korean |bahama |lebanon |america |afghanistan |
|(2005,2015].8 |canada |nuclear |cyprus |latvia |disarma |sudan |caribbean |palestinian |american |moldova |
|(2005,2015].9 |malawi |want |malta |cambodia |migrat |philippin |grenada |egypt |latin |thailand |
|(2005,2015].10 |zambia |israel |croatia |australia |multilater |bissau |small |libya |bolivia |turkmenistan |
|(2015,2022].1 |distinct |african |israel |distinct |ukrain |india |pacif |bosnia |pandem |korea |
|(2015,2022].2 |peacekeep |mali |syrian |azerbaijan |european |pakistan |island |herzegovina |covid |malaysia |
|(2015,2022].3 |andorra |sudan |iran |armenia |russian |sri |ocean |saint |un |asean |
|(2015,2022].4 |trade |sahel |brazil |trinidad |russia |lanka |solomon |caribbean |vaccin |mongolia |
|(2015,2022].5 |weapon |congo |colombia |tobago |eu |costa |tonga |beliz |75th |thailand |
|(2015,2022].6 |migrant |chad |spain |burundi |ireland |bangladesh |papua |mauritius |marino |kazakhstan |
|(2015,2022].7 |educ |guinea |venezuela |kingdom |georgia |canada |australia |nepal |bhutan |turkmenistan |
|(2015,2022].8 |energi |africa |iraq |morocco |serbia |rica |tanzania |guatemala |kenya |japan |
|(2015,2022].9 |refuge |madagascar |palestinian |arab |europ |kashmir |micronesia |moldova |botswana |kyrgyzstan |
|(2015,2022].10 |sdgs |burkina |lebanon |yemeni |montenegro |muslim |tuvalu |bahama |somalia |tajikistan |
Each column in the dataset corresponds to a vector of terms representing a specific topic. However, extracting substantively meaningful topics poses challenges due to several issues. One notable challenge is the variability in the set of terms used to represent the same topic across different time periods. For instance, the topic of international security may be discussed in relation to the Soviet Union and North Korea in earlier time periods, whereas in more recent times, it may be associated with Russia and Ukraine.
Another important problem is identifying related topics over time. There is a difficulty of establishing connections between topics and understanding their evolution across different temporal contexts. Some topics and terms disappear abruptly, while new topics emerge. Identifying the connection between vectors poses a challenge.
Dynamic Topic Modeling
- To address the above mentioned challenges, we refered to existing papers.
"BERTopic Dynamic Topic Modeling"
(https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html)- Greene and cross, 2017 (https://doi.org/10.1017/pan.2016.7)
This generates output for a single pair of time frames
<-lda_models[[1]]
model1<-lda_models[[2]]
model2
# phi value is a topic probability of every word
<- model1$phi
phi1
#phi1$topic <- sequence(nrow(phi1))
<- model2$phi
phi2 #phi2$topic <- sequence(nrow(phi2))
# Convert matrices to data frames
<- as.data.frame(phi1)
phi1_df <- as.data.frame(phi2)
phi2_df
<- order(colMeans(phi1_df), decreasing = TRUE)
order_phi1 <- order(colMeans(phi2_df), decreasing = TRUE)
order_phi2
# Reorder columns based on the mean
<- phi1_df[, order_phi1]
phi1_df <- phi2_df[, order_phi2]
phi2_df
# Identify columns to drop based on colMeans
## Try without dropping
<- colMeans(phi1_df) < 0.00001
columns_to_drop_phi1 <- colMeans(phi2_df) < 0.00001
columns_to_drop_phi2
# Drop identified columns
<- phi1_df[, !columns_to_drop_phi1, drop = FALSE]
phi1_df <- phi2_df[, !columns_to_drop_phi2, drop = FALSE]
phi2_df
# Get the union of column names
<- union(colnames(phi1_df), colnames(phi2_df))
all_terms
#fill missing values with zeros
<- bind_cols(phi1_df, setNames(data.frame(matrix(0, nrow = nrow(phi1_df), ncol = length(setdiff(all_terms, colnames(phi1_df))))), setdiff(all_terms, colnames(phi1_df))))
phi1_union <- bind_cols(phi2_df, setNames(data.frame(matrix(0, nrow = nrow(phi2_df), ncol = length(setdiff(all_terms, colnames(phi2_df))))), setdiff(all_terms, colnames(phi2_df))))
phi2_union
# Reorder columns alphabetically
<- phi1_union[, order(colnames(phi1_union))]
phi1_union <- phi2_union[, order(colnames(phi2_union))]
phi2_union
dim(phi1_union)
dim(phi2_union)
<-cor(t(phi1_union), t(phi2_union))
cor
heatmap.2(cor,
Rowv = FALSE, Colv = FALSE,
col = heat.colors(256),
trace = "none", # no row/column names
key = TRUE, keysize = 1.5,
density.info = "none", margins = c(5, 5),
cexCol = 1, cexRow = 1, # adjust text size
notecol = "black", notecex = 0.7,
main = "Correlation Matrix",
xlab = "Period 2", ylab = "Period 1",
symkey = FALSE)
<- order(colMeans(phi1_union), decreasing = TRUE)
order_phi1_union <- phi1_union[, order_phi1_union]
phi1_result
<- order(colMeans(phi2_union), decreasing = TRUE)
order_phi2_union <- phi2_union[, order_phi2_union]
phi2_result
<- orderBasedOnRow(phi1_union, 1)
phi1_result_row <-phi1_result_row%>%
phi1_result_long::pivot_longer(everything(), names_to="term_1", values_to="probability_1")
tidyr
<- orderBasedOnRow(phi2_union, 6)
phi2_result_row <-phi2_result_row%>%
phi2_result_long::pivot_longer(everything(), names_to="term_2", values_to="probability_2")
tidyr
<-bind_cols(phi1_result_long, phi2_result_long) pair
#Function to print out the words
<- function(df, I) {
orderBasedOnRow # Order columns based on the Ith row values
<- order(apply(df, 2, function(x) x[I]), decreasing = TRUE)
ordered_cols
# Reorder the data frame columns
<- df[, ordered_cols]
ordered_df
<- ordered_df[I, 1:10]
ordered_row
return(ordered_row)
}
Below function generates heatmaps for a pair of models.
<- function(model1, model2, correlation_threshold = 0.9) {
generate_heatmap <- model1$phi
phi1 <- model2$phi
phi2
<- as.data.frame(phi1)
phi1_df <- as.data.frame(phi2)
phi2_df
<- union(colnames(phi1_df), colnames(phi2_df))
all_terms
<- bind_cols(phi1_df, setNames(data.frame(matrix(0, nrow = nrow(phi1_df), ncol = length(setdiff(all_terms, colnames(phi1_df))))), setdiff(all_terms, colnames(phi1_df))))
phi1_union <- bind_cols(phi2_df, setNames(data.frame(matrix(0, nrow = nrow(phi2_df), ncol = length(setdiff(all_terms, colnames(phi2_df))))), setdiff(all_terms, colnames(phi2_df))))
phi2_union
<- phi1_union[, order(colnames(phi1_union))]
phi1_union <- phi2_union[, order(colnames(phi2_union))]
phi2_union
dim(phi1_union)
dim(phi2_union)
<- cor(t(phi1_union), t(phi2_union))
cor_matrix
# Heatmap for correlation matrix
heatmap.2(cor_matrix,
Rowv = FALSE, Colv = FALSE,
col = heat.colors(16),
trace = "none", # no row/column names
key = TRUE, keysize = 1.5,
density.info = "none", margins = c(5, 5),
cexCol = 1, cexRow = 1, # adjust text size
notecol = "black", notecex = 0.7,
xlab = "Time 2",
ylab = "Time 1",
symkey = FALSE)
return(list(phi1_union = phi1_union, phi2_union = phi2_union, cor_matrix = cor_matrix))
}
Rows with high correlation
# Function to print the ordered rows for each topic with high correlation
<- function(phi1_union, phi2_union, cor_matrix, high_corr_indices, correlation_threshold = 0.9) {
print_ordered_rows # Find indices where correlation is higher than the threshold
<- which(cor_matrix > correlation_threshold & !is.na(cor_matrix), arr.ind = TRUE)
high_corr_indices
# Create an empty list to store results
<- list()
result_list
# Print the ordered rows for each topic with high correlation
for (i in seq_len(nrow(high_corr_indices))) {
<- high_corr_indices[i, 1]
model1_topic <- high_corr_indices[i, 2]
model2_topic
# Print the ordered rows for each model's topic
cat(paste("Model 1 - Topic", model1_topic), "\n")
<- orderBasedOnRow(phi1_union, model1_topic)
phi1_result_row
cat(paste("Model 2 - Topic", model2_topic), "\n")
<- orderBasedOnRow(phi2_union, model2_topic)
phi2_result_row
# Convert result rows to long format
<- phi1_result_row %>%
phi1_result_long ::pivot_longer(everything(), names_to = "term_1", values_to = "probability_1")
tidyr
<- phi2_result_row %>%
phi2_result_long ::pivot_longer(everything(), names_to = "term_2", values_to = "probability_2")
tidyr
# Combine phi1 and phi2 results
<- bind_cols(phi1_result_long, phi2_result_long)
pair
# Append the result to the list
<- pair
result_list[[i]]
}
# Combine all results into a single dataframe
<- do.call(bind_rows, result_list)
final_result
return(final_result)
}
Execute functions over pairs
# Loop through pairs of models to generate heatmaps and print results
for (i in 1:(length(lda_models) - 1)) {
<- lda_models[[i]]
model1 <- lda_models[[i + 1]]
model2
<- generate_heatmap(model1, model2, correlation_threshold = 0.5)
result
<- result$phi1_union
phi1_union <- result$phi2_union
phi2_union <- result$cor_matrix
cor_matrix
# Print ordered rows only if there are high correlations
if (any(cor_matrix > 0.5, na.rm = TRUE)) {
<- phi1_union[, order(colMeans(phi1_union), decreasing = TRUE)]
phi1_result <- phi2_union[, order(colMeans(phi2_union), decreasing = TRUE)]
phi2_result
# Call the modified function and pass high_corr_indices as an argument
<- print_ordered_rows(phi1_result, phi2_result, cor_matrix, high_corr_indices, correlation_threshold = 0.5)
final_result print(final_result)
} }
Model 1 - Topic 3
Model 2 - Topic 9
Model 1 - Topic 2
Model 2 - Topic 10
# A tibble: 20 × 4
term_1 probability_1 term_2 probability_2
<chr> <dbl> <chr> <dbl>
1 german 0.0150 german 0.0138
2 czechoslovak 0.0119 socialist 0.0102
3 polish 0.0113 soviet 0.00986
4 soviet 0.0108 czechoslovak 0.00816
5 germani 0.0101 albania 0.00793
6 people’ 0.00766 romanian 0.00732
7 poland 0.00693 germani 0.00728
8 weapon 0.00632 byelorussian 0.00726
9 hydrogen 0.00619 nato 0.00629
10 geneva 0.00595 albanian 0.00548
11 arab 0.0271 arab 0.0249
12 israel 0.0228 israel 0.0231
13 palestin 0.0129 palestin 0.0155
14 jerusalem 0.0104 egypt 0.0105
15 morocco 0.00949 algerian 0.0101
16 tunisia 0.00899 libya 0.00916
17 jew 0.00821 franc 0.00826
18 refuge 0.00728 libyan 0.00782
19 jewish 0.00635 canal 0.00776
20 franc 0.00500 jordan 0.00729
Model 1 - Topic 8
Model 2 - Topic 3
Model 1 - Topic 10
Model 2 - Topic 6
Model 1 - Topic 7
Model 2 - Topic 8
# A tibble: 30 × 4
term_1 probability_1 term_2 probability_2
<chr> <dbl> <chr> <dbl>
1 african 0.0150 african 0.00885
2 africa 0.00815 ghana 0.00692
3 portug 0.00735 rwanda 0.00661
4 mali 0.00729 portug 0.00620
5 portugues 0.00670 uganda 0.00618
6 somali 0.00661 burundi 0.00593
7 congo 0.00627 africa 0.00530
8 austrian 0.00601 portugues 0.00462
9 cameroon 0.00564 kenya 0.00460
10 malagasi 0.00553 oau 0.00452
# ℹ 20 more rows
Model 1 - Topic 4
Model 2 - Topic 4
# A tibble: 10 × 4
term_1 probability_1 term_2 probability_2
<chr> <dbl> <chr> <dbl>
1 imperialist 0.0162 imperialist 0.0108
2 khmer 0.00847 vietnames 0.0106
3 aggress 0.00749 kampuchea 0.00971
4 imperi 0.00718 chines 0.00956
5 revisionist 0.00666 lao 0.00954
6 albania 0.00659 thailand 0.00947
7 albanian 0.00594 nam 0.00870
8 viet 0.00576 viet 0.00863
9 cambodia 0.00553 ethiopia 0.00833
10 chines 0.00545 romania 0.00801