Social Media Analytics (SMA)
Exp 2: YouTube comment sentiment analysis with the YouTube Data API and TextBlob
import requests

# Use your own YouTube Data API v3 key; never hard-code a real key in shared code.
api_key = "YOUR_API_KEY"
video = "7ARBJQn6QkM"  # video ID only; the "&t=940s" timestamp fragment does not belong here
video_info_url = f'https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video}&key={api_key}'
video_info_response = requests.get(video_info_url)
# Optional in Colab: user authentication (not required for API-key-only access)
from google.colab import auth
auth.authenticate_user()
video_info_data = video_info_response.json()
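Before fetching comments, it is worth confirming the lookup succeeded; a minimal check, assuming the standard videos.list response shape (items[0].snippet holds the metadata):

if video_info_data.get('items'):
    print("Title:", video_info_data['items'][0]['snippet']['title'])
else:
    print("No video found; check the ID and API key.")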
video_info_data  # inspect the full response

comments_url = f'https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video}&key={api_key}'
comments_response = requests.get(comments_url)
comments_data = comments_response.json()
comments_data  # inspect the raw comment threads

comments = [item['snippet']['topLevelComment']['snippet']['textOriginal'] for item in comments_data['items']]
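The commentThreads call above returns a single page (20 threads by default, at most 100 with maxResults). A minimal pagination sketch that follows nextPageToken; all_comments is a new name introduced here, not part of the original lab:

all_comments = []
page_url = f"{comments_url}&maxResults=100"
while page_url:
    data = requests.get(page_url).json()
    all_comments += [it['snippet']['topLevelComment']['snippet']['textOriginal']
                     for it in data.get('items', [])]
    token = data.get('nextPageToken')
    page_url = f"{comments_url}&maxResults=100&pageToken={token}" if token else None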
print(comments)

from textblob import TextBlob
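TextBlob scores polarity on a [-1, 1] scale; a quick sanity check with illustrative strings:

print(TextBlob("great video").sentiment.polarity)     # positive, > 0
print(TextBlob("terrible audio").sentiment.polarity)  # negative, < 0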
def get_comments_sentiment(comment):
    analysis = TextBlob(comment)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

comment_list = []
sentiment_list = []
for comment in comments:
    sentiment = get_comments_sentiment(comment)
    comment_list.append(comment)
    sentiment_list.append(sentiment)
    print(f"{comment} : {sentiment}")

import pandas as pd
sentiment_df = pd.DataFrame({'Comment': comment_list, 'sentiment': sentiment_list})
display(sentiment_df.head())

Exp 3: word cloud over the comment text

import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

sentiment_df['sentiment'].value_counts(normalize=True)  # class balance of the labels

reviews = " ".join(sentiment_df['Comment'])
word_cloud = WordCloud(background_color='white', stopwords=ENGLISH_STOP_WORDS, width=900, height=300)
word_cloud.generate(reviews)
plt.rcParams["figure.figsize"] = (10, 10)
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis('off')
plt.show()
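To keep the rendered cloud for the lab record, WordCloud can also write the image straight to disk; the filename here is illustrative:

word_cloud.to_file('comment_wordcloud.png')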
Exp 4: word, hashtag, and user frequency on a tweet dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
df = pd.read_csv('/content/omicron.csv')
df.info()
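# Assumption: omicron.csv provides 'text' and 'user_name' columns; the helpers below use both.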
def word_frequency(posts):
    words = []
    for post in posts:
        if isinstance(post, str):  # skip NaN / non-string rows
            words.extend(post.split())
    return Counter(words)
def find_common_hashtags(posts):
    hashtags = []
    for post in posts:
        if isinstance(post, str):  # skip NaN / non-string rows
            for word in post.split():
                if word.startswith('#'):
                    hashtags.append(word)
    return Counter(hashtags)
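Counting raw tokens treats '#Omicron' and '#omicron,' as distinct tags. A hedged variant that lowercases and extracts tags by regex (the pattern and normalization are assumptions about the data, not part of the original lab):

import re

def normalized_hashtags(posts):
    tags = []
    for post in posts:
        if isinstance(post, str):
            tags.extend(t.lower() for t in re.findall(r'#\w+', post))
    return Counter(tags)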
def top_users(df, n=5):
    user_counts = df['user_name'].value_counts()
    return user_counts.head(n)
word_freq = word_frequency(df['text'])
common_hashtags = find_common_hashtags(df['text'])
top_users_result = top_users(df)
print("Word Frequency:\n", word_freq.most_common(10))
print("\nCommon Hashtags:\n", common_hashtags.most_common(10))
print("\nTop Users by Post Count:\n", top_users_result)
top_hashtags = common_hashtags.most_common(5)
hashtags, counts = zip(*top_hashtags)
plt.bar(hashtags, counts)
plt.xlabel('Hashtags')
plt.ylabel('Frequency')
plt.title('Top 5 Hashtags')
plt.xticks(rotation=45)
plt.show()
# Visualize top users
plt.figure(figsize=(6, 4))
top_users_result.plot(kind='bar')
plt.title('Top 5 Users by Post Count')
plt.xlabel('User')
plt.ylabel('Post Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Word frequency from the joined comment text ('reviews', built in Exp 3), excluding stop words
words = reviews.split()
filtered_words = [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]
word_freq_reviews = Counter(filtered_words)
# Get top 5 words
top_words = word_freq_reviews.most_common(5)
words, counts = zip(*top_words)
# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 5 Word Counts from Comments (Excluding Stop Words)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
Exp 6: sentiment classification with TF-IDF features and a random forest
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
pattern = r"[a-zA-Z]+"  # note: [a-zA-z] would also match the punctuation characters between 'Z' and 'a'
vect = TfidfVectorizer(stop_words='english', token_pattern=pattern, ngram_range=(1, 2), max_features=50)
vect.fit(sentiment_df['Comment'])

tokenized_features = vect.transform(sentiment_df['Comment'])
features = pd.DataFrame(tokenized_features.toarray(), columns=vect.get_feature_names_out())
features.head()

sentiment_df["char_count"] = sentiment_df["Comment"].str.count(r"\S")
sentiment_df["word_count"] = sentiment_df["Comment"].str.count(pattern)
sentiment_df["avg_word_length"] = sentiment_df["char_count"]/sentiment_df["word_count"]
sentiment_df["avg_word_length"] = sentiment_df["avg_word_length"].replace(np.inf, 0)
sentiment_df.head()

x = pd.concat([features, sentiment_df.loc[:, "char_count":]], axis=1)
y = sentiment_df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_test, y_pred))

ConfusionMatrixDisplay.from_estimator(rf, x_test, y_test, normalize="all")
plt.title("Confusion Matrix")
plt.show()
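Random forests expose per-feature importances, which show whether the TF-IDF terms or the handcrafted length features drive the predictions; a minimal sketch over the fitted rf and the x columns from above:

importances = pd.Series(rf.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False).head(10))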
Exp 7: topic modeling with LDA (gensim) and pyLDAvis
!pip install gensim pyLDAvis
import random
import nltk
import pandas as pd
from gensim import corpora, models
import pyLDAvis
# minimal pyLDAvis import that works with gensim 4+
try:
    import pyLDAvis.gensim_models as gensimvis
except Exception:
    import pyLDAvis.gensim as gensimvis
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download('punkt_tab', quiet=True) # Added to resolve LookupError for punkt_tab
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
# --- synthetic dataset (small) ---
templates = [
    "I love the new phone battery life!",
    "What a game last night, unbelievable comeback!",
    "Tried a new ramen place and it was fantastic.",
    "Just booked a trip to Bali — any tips?",
    "That movie made me cry, such a masterpiece.",
]
comments = [random.choice(templates) + random.choice(["", " :)", " !!!", " 🤯"]) for _ in range(200)]
# --- preprocess ---
def tok(s):
    t = nltk.word_tokenize(str(s).lower())
    return [w for w in t if w.isalpha() and w not in stop and len(w) > 2]
docs = [tok(c) for c in comments]
docs = [d for d in docs if d] # drop empty
# --- dict, corpus, LDA ---
dictionary = corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=2, no_above=0.8)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10, alpha="auto")
# --- output ---
for i, t in lda.print_topics(-1):
    print(f"Topic {i}: {t}")
vis = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis, "lda_core_short.html")
print("Saved: lda_core_short.html")