#%%
# Importing Libraries
import re
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud
#%%
# Load the training data (forward slashes keep the path portable across OSes)
train_data = pd.read_csv("data/train.csv")
#%%
# Analyzing the DataFrame : Viewing the Data
train_data.head()
#%%
# Info about the data
train_data.info()
# Drop rows where the abstract is missing
train_data.dropna(subset=['abstract'], inplace=True)
#%%
# Cleaning Data: counting remaining missing values
train_data.isnull().values.sum()
#%%
# Checking for duplicate rows
train_data.duplicated().sum()
#%%
# Column Data Type Assessment
train_data.dtypes.value_counts()
# %%
# Function to extract non-alphanumeric characters from a string
def extract_non_alphanumeric(text):
    non_alphanumeric = re.findall(r'[^a-zA-Z0-9\s]', text)
    return non_alphanumeric

# List to store non-alphanumeric characters
non_alphanumeric_list = []
# Iterate over each title in the "title" column of train_data
for title in train_data['title']:
    non_alphanumeric_list.extend(extract_non_alphanumeric(title))
# Remove duplicates
non_alphanumeric_list = list(set(non_alphanumeric_list))
print("List of non-alphanumeric characters:", non_alphanumeric_list)
print("Number of non-alphanumeric characters:", len(non_alphanumeric_list))
# %%
# Reuse extract_non_alphanumeric on the "abstract" column
non_alphanumeric_list = []
for abstract in train_data['abstract']:
    if isinstance(abstract, str):  # Check if the abstract is a string
        non_alphanumeric_list.extend(extract_non_alphanumeric(abstract))
# Remove duplicates
non_alphanumeric_list = list(set(non_alphanumeric_list))
print("List of non-alphanumeric characters in the abstract column:", non_alphanumeric_list)
print("Count of non-alphanumeric characters in the abstract column:", len(non_alphanumeric_list))
#%%
# Define the clean_text function
def clean_text(text):
    '''Removes HTML tags, URLs, punctuation, and any remaining
    non-alphanumeric characters while preserving whitespace.
    '''
    unwanted_chars_patterns = [
        r'<[^>]+>',          # HTML tags (strip these before punctuation breaks them up)
        r'http[s]?://\S+',   # URLs (match before ':' and '.' are removed as punctuation)
        r'[!?,;:—".]',       # Punctuation
        r'[^\w\s]',          # Remaining non-alphanumerics (keep whitespace so words stay separated)
    ]
    for pattern in unwanted_chars_patterns:
        text = re.sub(pattern, '', text)
    return text
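
# Quick sanity check on an assumed sample string (illustrative only):
# tags and the URL disappear, punctuation is stripped, spacing survives
print(clean_text('A <b>sample</b> sentence, with a URL: https://example.com!'))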
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
def preprocess_text(text):
    '''Tokenizes text, lowercases it, removes stopwords and short tokens,
    and stems the remaining words with the Snowball stemmer.
    '''
    if isinstance(text, str):  # Check if text is a string
        # Lowercase before the stopword check: NLTK stopwords are lowercase,
        # so capitalized stopwords (e.g. "The") would otherwise slip through
        tokens = nltk.word_tokenize(text.lower())
        tokens = [word for word in tokens if word not in stop_words and word.isalpha() and len(word) >= 3]
        # Applying Snowball stemming
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)
    else:
        return ''  # Return an empty string for non-string inputs
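
# Illustrative check on an assumed sample sentence: stopwords drop out and
# related word forms collapse to a shared stem
print(preprocess_text("Classification models are classifying scientific abstracts"))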
# Define the sentence_to_words function
def sentence_to_words(data_frame, column_name):
    '''Converts each sentence in a column into a list of lowercase
    alphanumeric words.
    '''
    list_of_words_in_sentence = []
    for sent in data_frame[column_name].values:
        sent = clean_text(sent)
        # Split the sentence into words and keep only alphanumeric words
        words = [word.lower() for word in sent.split() if word.isalnum()]
        list_of_words_in_sentence.append(words)
    return list_of_words_in_sentence
# Apply text processing to the "abstract" column of train_data
train_data_cleaned = train_data.copy() # Create a copy of the original DataFrame
train_data_cleaned['abstract'] = train_data_cleaned['abstract'].apply(preprocess_text)
# train_data_cleaned['abstract'] = train_data_cleaned['abstract'].apply(sentence_to_words)
# Save the cleaned dataset as train_data_cleaned
# train_data_cleaned.to_csv('train_data_cleaned.csv', index=False)
# %%
print(train_data_cleaned["abstract"].head())
# %%
# Tokenize the cleaned abstracts into words
abstract_words = [word for abstract in train_data_cleaned['abstract'] for word in abstract.split()]
# Count the frequency of each word
word_freq = Counter(abstract_words)
# Sort the words based on their frequency
sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# Display the top N most used words
top_n = 30 # Change this value to display more or fewer top words
print(f"Top {top_n} most used words in the 'abstract' column:")
for word, freq in sorted_word_freq[:top_n]:
    print(f"{word}: {freq}")
# %%
# Get the top 30 most used words and their frequencies
top_words = [word[0] for word in sorted_word_freq[:30]]
word_frequencies = [word[1] for word in sorted_word_freq[:30]]
# Total count across the top 30 words (percentages below are relative to this subset)
total_words = sum(word_frequencies)
# Calculate the percentages rounded to one decimal precision
word_percentages = [(freq / total_words) * 100 for freq in word_frequencies]
word_percentages_rounded = [round(percentage, 1) for percentage in word_percentages]
# Plotting the histogram
plt.figure(figsize=(12, 6))
plt.bar(range(len(top_words)), word_percentages, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Percentage')
plt.title('Top 30 Most Used Words in Abstracts')
plt.xticks(range(len(top_words)), top_words, rotation=90)
# Show percentage on top of bars
for i, percentage in enumerate(word_percentages_rounded):
    plt.text(i, percentage + 0.5, f'{percentage}%', ha='center', va='bottom')
plt.tight_layout()
plt.show()
# %%[markdown]
# Number of Unique Labels
#%%
# Extract labels from strings and flatten them
all_labels = [
    int(label.strip('[]'))
    for sublist in train_data['numerical_classification_labels'].str.split()
    for label in sublist
    if label.strip('[]')
]
# Find the unique labels
unique_labels = np.unique(all_labels)
# Get the total number of unique labels
num_unique_labels = len(unique_labels)
print("Total number of unique labels:", num_unique_labels)
print("Unique labels:")
print(unique_labels)
# %%
# Count the frequency of each label
label_counts = Counter(all_labels)
# Get the top 10 most frequent labels
top_labels = label_counts.most_common(10)
# Extract label values and their frequencies
top_label_values = [label[0] for label in top_labels]
top_label_frequencies = [label[1] for label in top_labels]
# Plotting the bar plot
plt.figure(figsize=(10, 6))
plt.bar(range(len(top_label_values)), top_label_frequencies, color='skyblue')
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.title('Top 10 Most Frequently Used Labels')
plt.xticks(range(len(top_label_values)), top_label_values)
plt.tight_layout()
plt.show()
# %%
# Initialize an empty dictionary for label mapping
label_mapping = {}
# Iterate over the rows of the DataFrame
for index, row in train_data.iterrows():
    # Extract labels and their numerical encodings
    actual_labels = row['classification_labels']
    encoded_labels = row['numerical_classification_labels']
    # Split the labels and encoded values
    actual_labels = actual_labels.strip("[]").split("' '")
    encoded_labels = [int(label) for label in encoded_labels.strip("[]").split()]
    # Map each label to its encoded value
    for actual_label, encoded_label in zip(actual_labels, encoded_labels):
        label_mapping[actual_label.strip("'")] = encoded_label
# Print the label mapping dictionary
print(label_mapping)
#%%
# Flatten the lists in the 'classification_labels' column and split them
all_labels = [
    label.strip("'")
    for sublist in train_data['classification_labels']
    for label in sublist.strip('[]').split("' '")
]
# Count the frequency of each label
label_counts = Counter(all_labels)
# Get the top 10 most frequent labels
top_labels = label_counts.most_common(10)
# Extract label values and their frequencies
top_label_values = [label[0] for label in top_labels]
top_label_frequencies = [label[1] for label in top_labels]
# Total count across the top 10 labels (percentages below are relative to this subset)
total_labels = sum(top_label_frequencies)
# Calculate percentages
label_percentages = [(freq / total_labels) * 100 for freq in top_label_frequencies]
# Plotting the bar plot with y-axis in percentage
plt.figure(figsize=(10, 6))
plt.bar(range(len(top_label_values)), label_percentages, color='skyblue')
plt.xlabel('Labels')
plt.ylabel('Percentage')
plt.title('Top 10 Most Frequently Used Labels (From classification_labels)')
plt.xticks(range(len(top_label_values)), top_label_values, rotation=45, ha='right')
plt.tight_layout()
plt.show()
#%%
# Convert abstracts to strings, handling any float values
# all_abstracts = ' '.join(str(abstract) for abstract in train_data['abstract'])
# # Generate word cloud
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_abstracts)
# # Display the word cloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud of Abstracts')
# plt.show()
#%%
# Filter out float values and calculate length of each abstract
abstract_lengths = train_data['abstract'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
# Plot histogram of abstract lengths
plt.figure(figsize=(8, 6))
plt.hist(abstract_lengths, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Abstract Length')
plt.ylabel('Frequency')
plt.title('Histogram of Abstract Lengths')
plt.xlim(0, 750) # Set x-axis range from 0 to 750
plt.show()
# %%
# Initialize an empty dictionary to store the label mappings
label_mapping = {}
# Iterate over each row in the dataset
for index, row in train_data.iterrows():
    classification_labels = row['classification_labels']
    numerical_classification_labels = row['numerical_classification_labels']
    # Extract individual labels using regular expressions
    labels = re.findall(r"'(.*?)'", classification_labels)
    numerical_labels = re.findall(r'\d+', numerical_classification_labels)
    # Iterate over each label and numerical value pair
    for label, numerical_label in zip(labels, numerical_labels):
        # Add the mapping to the dictionary
        label_mapping[label] = numerical_label
print(label_mapping)
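#%%
# Inverse mapping (illustrative): handy later for decoding numeric label
# columns back into human-readable names
inverse_label_mapping = {v: k for k, v in label_mapping.items()}
print(dict(list(inverse_label_mapping.items())[:5]))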
#%%
# Encoding the multi labels into new columns
# Create a copy of the original DataFrame
encoded_data = train_data.copy()
# Create one binary column per label
for label in label_mapping:
    encoded_data[label] = 0
# Fill the binary columns row by row
for index, row in encoded_data.iterrows():
    # Parse the row's numerical labels once; a plain substring test would
    # wrongly match e.g. '1' inside '12'
    row_labels = set(re.findall(r'\d+', row['numerical_classification_labels']))
    for label, numerical_label in label_mapping.items():
        if numerical_label in row_labels:
            encoded_data.at[index, label] = 1
# Remove the specified columns
columns_to_remove = ['id', 'title', 'classification_labels', 'numerical_classification_labels']
encoded_data.drop(columns=columns_to_remove, inplace=True)
# Save the new DataFrame as "Encoded_data.csv"
encoded_data.to_csv("data/Encoded_data.csv", index=False)
# Display the first few rows of the encoded DataFrame
print(encoded_data.head())
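#%%
# Quick distribution check (illustrative): how many labels fire per paper
print(encoded_data.drop(columns=['abstract']).sum(axis=1).describe())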
#%%
## Reducing the number of rows for faster computation
import os
# Define the file path
file_path = 'data/Encoded_data.csv'
# Check if the file exists
if os.path.exists(file_path):
    # Read the CSV file into a pandas DataFrame
    encoded_data = pd.read_csv(file_path)
    # Display the first few rows of the DataFrame for verification
    print(encoded_data.head())
else:
    print(f"The file '{file_path}' does not exist.")
encoded_data = encoded_data.head(100000)  # Change accordingly
#%%
# Label matrix: every column after 'abstract' is a binary label indicator
multilabel_yx = encoded_data.iloc[:, 1:].values
# Train and test split (80/20, by position)
total_size = encoded_data.shape[0]
train_size = int(0.80 * total_size)
x_train = encoded_data.head(train_size)
y_train = multilabel_yx[:train_size, :]
x_test = encoded_data.tail(total_size - train_size)
y_test = multilabel_yx[train_size:, :]
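
# Sanity check (illustrative): features and labels must stay row-aligned
assert x_train.shape[0] == y_train.shape[0]
assert x_test.shape[0] == y_test.shape[0]
print("Train rows:", x_train.shape[0], "| Test rows:", x_test.shape[0])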
# %%
# Featurization using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the vectorizer
vectorizer = TfidfVectorizer(min_df=0.001, smooth_idf=True, norm="l2",
tokenizer=lambda x: x.split(), sublinear_tf=False, ngram_range=(1, 3))
#%%
# Fit and transform on the training data
x_train_multilabel = vectorizer.fit_transform(x_train['abstract'])
# Transform the test data
x_test_multilabel = vectorizer.transform(x_test['abstract'])
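# Shape check (illustrative): rows = documents, columns = TF-IDF n-gram features
print("TF-IDF train:", x_train_multilabel.shape, "| test:", x_test_multilabel.shape)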
#%%
# Save the tokenized data as "Tokenized_data.csv"
# Note: .toarray() densifies the sparse matrix, which can be memory-heavy for large splits
tokenized_data = pd.DataFrame(x_train_multilabel.toarray(), columns=vectorizer.get_feature_names_out())
# tokenized_data.to_csv("data/Tokenized_data.csv", index=False)
# Display the first few rows of the encoded DataFrame
print(tokenized_data.head())
#%%
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report
# Define a multi-output classifier: one SGDClassifier (logistic loss, L1 penalty) per label
classifier = MultiOutputClassifier(SGDClassifier(loss='log_loss', alpha=0.00001, penalty='l1'), n_jobs=-1)
# Fit the classifier on the training data
classifier.fit(x_train_multilabel, y_train)
# Make predictions on the test data
predictions = classifier.predict(x_test_multilabel)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, predictions)
macro_f1_score = f1_score(y_test, predictions, average='macro')
micro_f1_score = f1_score(y_test, predictions, average='micro')
hamming_loss_value = hamming_loss(y_test, predictions)
precision_recall_report = classification_report(y_test, predictions)
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Macro F1 Score:", macro_f1_score)
print("Micro F1 Score:", micro_f1_score)
print("Hamming Loss:", hamming_loss_value)
print("Precision-Recall Report:\n", precision_recall_report)
# %%
import pickle
with open('multioutput_classifier.pkl', 'wb') as file:
    pickle.dump(classifier, file)
# %%
# Make predictions on the training data
train_predictions = classifier.predict(x_train_multilabel)
# Calculate evaluation metrics for training data
train_accuracy = accuracy_score(y_train, train_predictions)
train_macro_f1_score = f1_score(y_train, train_predictions, average='macro')
train_micro_f1_score = f1_score(y_train, train_predictions, average='micro')
train_hamming_loss_value = hamming_loss(y_train, train_predictions)
train_precision_recall_report = classification_report(y_train, train_predictions)
# Print the evaluation metrics for training data
print("Training Data Evaluation:")
print("Accuracy:", train_accuracy)
print("Macro F1 Score:", train_macro_f1_score)
print("Micro F1 Score:", train_micro_f1_score)
print("Hamming Loss:", train_hamming_loss_value)
print("Precision-Recall Report:\n", train_precision_recall_report)
#%%
#### Word2Vec - Average #############
# Importing necessary libraries
import pandas as pd
from gensim.models import Word2Vec
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report
import pickle
# Assuming train_data is your DataFrame with 'abstract' column
# Convert 'abstract' column to list of lists of words
sentences = [abstract.split() for abstract in train_data['abstract']]
# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
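
# Quick embedding sanity check (illustrative): nearest neighbours of the most
# frequent token in the corpus
sample_word = model.wv.index_to_key[0]
print(sample_word, "->", model.wv.most_similar(sample_word, topn=5))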
#%%
# Function to calculate average word vectors
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

# Create average word vector features
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
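
# Quick shape check (illustrative): each document becomes one dense
# 100-dimensional vector regardless of its length
sample_vecs = averaged_word_vectorizer(sentences[:5], model, 100)
print(sample_vecs.shape)  # expected: (5, 100)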
#%%
# Convert abstracts to averaged word vectors
x_train_word2vec = averaged_word_vectorizer(x_train['abstract'].apply(lambda x: x.split()), model, 100)
x_test_word2vec = averaged_word_vectorizer(x_test['abstract'].apply(lambda x: x.split()), model, 100)
# Create MultiOutputClassifier with SGDClassifier
classifier = MultiOutputClassifier(SGDClassifier(loss='log_loss', alpha=0.000001, penalty='l1'), n_jobs=-1)
# Fit classifier
classifier.fit(x_train_word2vec, y_train)
# Predictions
predictions = classifier.predict(x_test_word2vec)
# Evaluation metrics
accuracy = accuracy_score(y_test, predictions)
macro_f1_score = f1_score(y_test, predictions, average='macro')
micro_f1_score = f1_score(y_test, predictions, average='micro')
hamming_loss_value = hamming_loss(y_test, predictions)
precision_recall_report = classification_report(y_test, predictions)
# Print evaluation metrics
print("Accuracy:", accuracy)
print("Macro F1 Score:", macro_f1_score)
print("Micro F1 Score:", micro_f1_score)
print("Hamming Loss:", hamming_loss_value)
print("Precision-Recall Report:\n", precision_recall_report)
# Save the model
with open('multioutput_classifier_word2vec.pkl', 'wb') as file:
    pickle.dump(classifier, file)
# %%
################################ tfidf-word2vec #########################################
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, classification_report
import numpy as np
import pickle
# Assuming train_data is your DataFrame with 'abstract' column
# Convert 'abstract' column to list
sentences = train_data['abstract'].apply(lambda x: x.split()).tolist()
# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=0.001, tokenizer=lambda x: x.split())
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['abstract'])
# Map feature names to column index
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
def tfidf_weighted_word_vectors(words, tfidf_vector, tfidf_feature_names, model):
    word_vector_sum = np.zeros((model.vector_size,), dtype="float64")
    weight_sum = 0.0
    for word in words:
        if word in model.wv:
            try:
                word_vector = model.wv[word]
                tfidf_weight = tfidf_vector[0, tfidf_feature_names.index(word)]
                word_vector_sum += (word_vector * tfidf_weight)
                weight_sum += tfidf_weight
            except ValueError:
                # Word is in the Word2Vec vocabulary but not among the TF-IDF features
                continue
    if weight_sum != 0:
        word_vector_sum /= weight_sum
    return word_vector_sum

def tfidf_weighted_word2vec(corpus, tfidf_matrix, tfidf_feature_names, model):
    tfidf_feature_names_list = list(tfidf_feature_names)  # Convert to list so .index() works
    features = [tfidf_weighted_word_vectors(tokenized_sentence, tfidf_vector, tfidf_feature_names_list, model)
                for tokenized_sentence, tfidf_vector in zip(corpus.apply(lambda x: x.split()), tfidf_matrix)]
    return np.array(features)
# Convert abstracts to TF-IDF weighted Word2Vec vectors
# (positional slicing assumes tfidf_matrix rows line up with the x_train/x_test split order)
x_train_tfidf_word2vec = tfidf_weighted_word2vec(x_train['abstract'], tfidf_matrix[:len(x_train)], tfidf_feature_names, model)
x_test_tfidf_word2vec = tfidf_weighted_word2vec(x_test['abstract'], tfidf_matrix[len(x_train):], tfidf_feature_names, model)
# MultiOutputClassifier with SGDClassifier
classifier = MultiOutputClassifier(SGDClassifier(loss='log_loss', alpha=0.00001, penalty='l1'), n_jobs=-1)
# Fit classifier
classifier.fit(x_train_tfidf_word2vec, y_train)
# Predictions
predictions = classifier.predict(x_test_tfidf_word2vec)
# Evaluation metrics
accuracy = accuracy_score(y_test, predictions)
macro_f1_score = f1_score(y_test, predictions, average='macro')
micro_f1_score = f1_score(y_test, predictions, average='micro')
hamming_loss_value = hamming_loss(y_test, predictions)
precision_recall_report = classification_report(y_test, predictions)
# Print evaluation metrics
print("Accuracy:", accuracy)
print("Macro F1 Score:", macro_f1_score)
print("Micro F1 Score:", micro_f1_score)
print("Hamming Loss:", hamming_loss_value)
print("Precision-Recall Report:\n", precision_recall_report)
# Save the model
with open('multioutput_classifier_tfidf_word2vec.pkl', 'wb') as file:
    pickle.dump(classifier, file)
#%%