Help with LDA (Gensim Script)

Hi,

I’m struggling to debug my code. At the moment, no error message appears and the model is never generated (I have left it running overnight and still nothing).

Would someone be able to read through and check for bugs? I have combed over it, run it through AI, and still found nothing.

Thanks so much


## This code creates the LDA topic models for the unique documents in the TikTok dataset, for topic numbers n = 2 to 30.
## It calculates the coherence score for each model and creates an intertopic-distance visualisation (pyLDAvis) for the model with the highest coherence score.
## It then assigns the dominant topic to all Text documents used.

## Make sure these are installed:
#pip install pymongo
#pip install scipy
#pip install numpy
#pip install nltk
#pip install pyLDAvis
#pip install gensim

#Import relevant libraries:
import pymongo
import time
import gensim
import os
import csv
import re
import operator
import warnings
import numpy as np
import json
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import strip_punctuation
from pprint import pprint
from gensim.corpora.mmcorpus import MmCorpus
from gensim.models import ldamulticore
from gensim import models
from nltk.corpus import stopwords
import pyLDAvis.gensim
import gc
import logging
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.test.utils import get_tmpfile 
from collections import OrderedDict

### Set up workspace

# Set working directory
os.chdir('D:/Users/Alexander Hiscock/Documents/Archaeology Research/Archaeology/PhD/Thesis (Sections, Work, Data)/CODE/Analysis/LDA - Text/LDA/Tiktok')

# Define the paths to input:
file_path = 'TikTok_Texts.json'

# Define the paths to outputs
path2corpus='outputs/topic_models/corpus'
path2dictionary='outputs/topic_models/dictionary'
path2model= 'outputs/topic_models/models_'
path2coherence = 'outputs/03_01_01_coherenceScores.csv' # Path to model coherence scores
path2html = 'outputs/03_01_02_topic_model.html' # Path to the best model visualisation in html

# Define language to use for the model, and the threshold for the number of topics
language='english'
max_topics=31

# Load the JSON file containing all texts

with open(file_path, 'r', encoding='utf8') as file:
    data = json.load(file)

# Extracting texts and ignoring the codes and dates
texts = [item['Text'] for item in data if 'Text' in item]

# Removing links and usernames if they appear in the text. Might edit later to keep links, to establish linking behaviour within topics:
i=0
j=len(texts)
while(i<j):
    texts[i] = re.sub('http/S+', '', texts[i])
    texts[i] = re.sub('@/S+', '', texts[i])

# Import and define stopwords:
nltk.download('stopwords')
stops = set(stopwords.words('english'))
#Also add new stopwords. The search terms are added as most primary content will contain them.
new_stops = set(["Antonine","Wall"])

## Get rid of English stopwords and user-defined stopwords:
texts = [[word for word in text if word not in stops] for text in texts]
texts = [[word for word in text if word not in new_stops] for text in texts]

#Lemmatize all the words in the document:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
texts= [[lemmatizer.lemmatize(token) for token in text] for text in texts]

# Create bigrams:

# Add bigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(texts, min_count=20)
for idx in range(len(texts)):
    for token in bigram[texts[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            texts[idx].append(token)

# Make dictionary and the corpus
train_texts = texts 
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]
    
### Save corpus and dictionary:  
MmCorpus.serialize(path2corpus, corpus)
mm = MmCorpus(path2corpus)
dictionary.save_as_text(path2dictionary)
dictionary = Dictionary.load_from_text(path2dictionary)

# Set up the list to hold coherence values for each topic:
c_v = []
# Loop to create models with 2 to 30 topics, and calculate a coherence score for each:
for num_topics in range(2, max_topics):
    print(num_topics)
    lm = models.LdaMulticore(corpus=mm, num_topics=num_topics, id2word=dictionary, chunksize=9000, passes=100, eval_every=1, iterations=500, workers=4) # Create a model with num_topics topics
    print("Calculating coherence score...")
    cm = CoherenceModel(model=lm, texts=train_texts, dictionary=dictionary, coherence='c_v') # Calculate the coherence score for the topics
    print("Saving model...")
    lm.save(path2model+str(num_topics)) # Save the model
    lm.clear() # Clear the data from the model
    del lm # Delete the model
    gc.collect() # Clears data from the workspace to free up memory
    c_v.append(cm.get_coherence()) # Append the coherence score to the list


# Save the coherence scores to the file:    
with open(path2coherence, 'a') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["n_topics","coherence_score"])
    i=2
    for score in c_v:
        print(i)
        writer.writerow([i,score])
        i=i+1

#Get the best topic model and construct the visualisation

n=c_v.index(max(c_v))+2 # Get the number of topics with the highest coherence score
lm = LdaModel.load(path2model+str(n)) # Load the model with the highest coherence score into the workspace
tm = pyLDAvis.gensim.prepare(lm, mm, dictionary) # Prepare the visualisation
pyLDAvis.save_html(tm, path2html+str(n)+'.html') # Save the visualisation


### Assign topics to texts, using their Codes, along with Probability score:
# Reorder topics:

# Change the topic order to be consistent with the pyLDAvis topic model (ordered from the most
# frequent topic to the least frequent) and assign the dominant topic to each document:

# Get the topic order
to=tm.topic_order

# set up writing to a file
# List to hold the processed data for each document
documents_data = []

# Loop over all the documents in the corpus, and assign topic and probabilities to each:
for i in range(len(corpus)):
    topics = lm.get_document_topics(corpus[i])  # Get topic probabilities for the document
    topics = list(topics)    
    topics = [list(topic) for topic in topics]  # Reformat topics probabilities for the analysis

    # Reorder topics according to pyLDAvis numbering
    topics = [["Topic " + str(to.index(topic[0]) + 1), topic[1]] for topic in topics] 
    topics = sorted(topics)
    topics_dict = dict(topics)

    # Get dominant topic and value for the documents
    topics_dict['dominant_topic'] = max(topics_dict, key=topics_dict.get)
    topics_dict['dominant_value'] = topics_dict[topics_dict['dominant_topic']]
    topics_dict["Code"] = Code[i]
    topics_dict["text"] = texts[i]

    # Add the document's data to the list
    documents_data.append(topics_dict)

# Write the data to a JSON file
with open('TextTopics.json', 'w', encoding='utf-8') as f:
    json.dump(documents_data, f, ensure_ascii=False, indent=4)
#

#Now retrieving Mongo assemblages, and merging these JSONs into them.

# MongoDB connection string
connection_string = "~"
database_name = "Antonine_Wall_Value_Assemblages"

# Connect to MongoDB
client = pymongo.MongoClient(connection_string)
db = client[database_name]

# Function to fetch data from a given collection
def fetch_data_from_collection(collection_name):
    collection = db[collection_name]
    return list(collection.find({}))

# Fetching data from collection
tiktok_data = fetch_data_from_collection("2TikTok")

#Merging TikTok

# Load TikTok_Topics JSON file
with open('TikTok_Topics.json', 'r') as file:
    tiktok_topics = json.load(file)

# Categorize TikTok topics into A and B forms
tiktok_topics_a = {topic['Code']: topic for topic in tiktok_topics if topic['Code'].startswith('TT.') and topic['Code'].count('.') == 1}
tiktok_topics_b = {topic['Code']: topic for topic in tiktok_topics if topic['Code'].startswith('TT.') and topic['Code'].count('.') == 2}

# Merge function
def merge_tiktok_data(tiktok_data, topics_a, topics_b):
    for entry in tiktok_data:
        # Merge with form A topics
        code_a = entry.get('Code', '')
        if code_a in topics_a:
            entry.update(topics_a[code_a])

        # Merge with form B topics in comments
        if 'Comments' in entry:
            for comment in entry['Comments']:
                code_b = comment.get('Comment', '')  # Assuming 'Comment' holds the code
                if code_b in topics_b:
                    comment.update(topics_b[code_b])

# tiktok_data is already loaded from MongoDB
merge_tiktok_data(tiktok_data, tiktok_topics_a, tiktok_topics_b)

# Save the merged data to a JSON file
with open('Merged_TikTok_Data.json', 'w') as file:
     json.dump(tiktok_data, file, indent=4)


# Load the merged TikTok data (if not already loaded)
with open('Merged_TikTok_Data.json', 'r') as file:
     merged_tiktok_data = json.load(file)

# Name of the new MongoDB collection
new_collection_name = "2.TikTok"

# Create a new collection (if it doesn't exist)
new_collection = db[new_collection_name]

# Insert the merged data
new_collection.insert_many(merged_tiktok_data)

Break it down to smaller parts and debug each part.
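
For example, turning logging on will show whether gensim ever reaches the training loop. You already import logging but never configure it; a minimal sketch using only the standard library:

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

With INFO-level logging enabled, gensim reports per-pass training progress, so complete silence means the script is stuck before it ever builds a model.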

You have this loop:

i=0
j=len(texts)
while(i<j):
    texts[i] = re.sub('http/S+', '', texts[i])
    texts[i] = re.sub('@/S+', '', texts[i])

The value of i never changes.
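
A minimal fix is to advance the counter inside the loop so it can terminate. Note as well that the patterns almost certainly want the regex escape \S (any non-whitespace character) rather than the literal characters /S:

i = 0
j = len(texts)
while i < j:
    texts[i] = re.sub(r'http\S+', '', texts[i])  # strip URLs
    texts[i] = re.sub(r'@\S+', '', texts[i])     # strip @usernames
    i += 1  # advance the counter so the loop terminates

# Or drop the manual counter entirely:
# texts = [re.sub(r'@\S+', '', re.sub(r'http\S+', '', t)) for t in texts]

Because the original loop never exits, the script hangs at this point, which would explain why it runs overnight without producing a model or an error message.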