Social Media Analytics (SMA)
Exp 2: YouTube comment sentiment analysis with the YouTube Data API and TextBlob
import requests

# Use your own YouTube Data API v3 key; never hard-code a real key in shared code.
api_key = "YOUR_API_KEY"
video = "7ARBJQn6QkM"  # video ID only; the "&t=940s" timestamp fragment does not belong here
video_info_url = f'https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video}&key={api_key}'
video_info_response = requests.get(video_info_url)
# Optional in Colab: user authentication (not required for API-key-only access)
from google.colab import auth
auth.authenticate_user()
video_info_data = video_info_response.json()
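Before fetching comments, it is worth confirming the lookup succeeded; a minimal check, assuming the standard videos.list response shape (items[0].snippet holds the metadata):

if video_info_data.get('items'):
    print("Title:", video_info_data['items'][0]['snippet']['title'])
else:
    print("No video found; check the ID and API key.")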
video_info_data  # inspect the full response

comments_url = f'https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video}&key={api_key}'
comments_response = requests.get(comments_url)
comments_data = comments_response.json()
comments_data  # inspect the raw comment threads

comments = [item['snippet']['topLevelComment']['snippet']['textOriginal'] for item in comments_data['items']]
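The commentThreads call above returns a single page (20 threads by default, at most 100 with maxResults). A minimal pagination sketch that follows nextPageToken; all_comments is a new name introduced here, not part of the original lab:

all_comments = []
page_url = f"{comments_url}&maxResults=100"
while page_url:
    data = requests.get(page_url).json()
    all_comments += [it['snippet']['topLevelComment']['snippet']['textOriginal']
                     for it in data.get('items', [])]
    token = data.get('nextPageToken')
    page_url = f"{comments_url}&maxResults=100&pageToken={token}" if token else None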
print(comments)

from textblob import TextBlob
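TextBlob scores polarity on a [-1, 1] scale; a quick sanity check with illustrative strings:

print(TextBlob("great video").sentiment.polarity)     # positive, > 0
print(TextBlob("terrible audio").sentiment.polarity)  # negative, < 0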
def get_comments_sentiment(comment):
    analysis = TextBlob(comment)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity == 0:
        return 'Neutral'
    else:
        return 'Negative'

comment_list = []
sentiment_list = []
for comment in comments:
    sentiment = get_comments_sentiment(comment)
    comment_list.append(comment)
    sentiment_list.append(sentiment)
    print(f"{comment} : {sentiment}")

import pandas as pd
sentiment_df = pd.DataFrame({'Comment': comment_list, 'sentiment': sentiment_list})
display(sentiment_df.head())

Exp 3: word cloud over the comment text

import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

sentiment_df['sentiment'].value_counts(normalize=True)  # class balance of the labels

reviews = " ".join(sentiment_df['Comment'])
word_cloud = WordCloud(background_color='white', stopwords=ENGLISH_STOP_WORDS, width=900, height=300)
word_cloud.generate(reviews)
plt.rcParams["figure.figsize"] = (10, 10)
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis('off')
plt.show()
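To keep the rendered cloud for the lab record, WordCloud can also write the image straight to disk; the filename here is illustrative:

word_cloud.to_file('comment_wordcloud.png')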
Exp 4: word, hashtag, and user frequency on a tweet dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
df = pd.read_csv('/content/omicron.csv')
df.info()
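# Assumption: omicron.csv provides 'text' and 'user_name' columns; the helpers below use both.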
def word_frequency(posts):
    words = []
    for post in posts:
        if isinstance(post, str):  # skip NaN / non-string rows
            words.extend(post.split())
    return Counter(words)
def find_common_hashtags(posts):
    hashtags = []
    for post in posts:
        if isinstance(post, str):  # skip NaN / non-string rows
            for word in post.split():
                if word.startswith('#'):
                    hashtags.append(word)
    return Counter(hashtags)
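Counting raw tokens treats '#Omicron' and '#omicron,' as distinct tags. A hedged variant that lowercases and extracts tags by regex (the pattern and normalization are assumptions about the data, not part of the original lab):

import re

def normalized_hashtags(posts):
    tags = []
    for post in posts:
        if isinstance(post, str):
            tags.extend(t.lower() for t in re.findall(r'#\w+', post))
    return Counter(tags)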
def top_users(df, n=5):
    user_counts = df['user_name'].value_counts()
    return user_counts.head(n)
word_freq = word_frequency(df['text'])
common_hashtags = find_common_hashtags(df['text'])
top_users_result = top_users(df)
print("Word Frequency:\n", word_freq.most_common(10))
print("\nCommon Hashtags:\n", common_hashtags.most_common(10))
print("\nTop Users by Post Count:\n", top_users_result)
top_hashtags = common_hashtags.most_common(5)
hashtags, counts = zip(*top_hashtags)
plt.bar(hashtags, counts)
plt.xlabel('Hashtags')
plt.ylabel('Frequency')
plt.title('Top 5 Hashtags')
plt.xticks(rotation=45)
plt.show()
# Visualize top users
plt.figure(figsize=(6, 4))
top_users_result.plot(kind='bar')
plt.title('Top 5 Users by Post Count')
plt.xlabel('User')
plt.ylabel('Post Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Word frequency from the joined comment text ('reviews', built in Exp 3), excluding stop words
words = reviews.split()
filtered_words = [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]
word_freq_reviews = Counter(filtered_words)
# Get top 5 words
top_words = word_freq_reviews.most_common(5)
words, counts = zip(*top_words)
# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 5 Word Counts from Comments (Excluding Stop Words)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
Exp 6: sentiment classification with TF-IDF features and a random forest
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
pattern = r"[a-zA-Z]+"  # note: [a-zA-z] would also match the punctuation characters between 'Z' and 'a'
vect = TfidfVectorizer(stop_words='english', token_pattern=pattern, ngram_range=(1, 2), max_features=50)
vect.fit(sentiment_df['Comment'])

tokenized_features = vect.transform(sentiment_df['Comment'])
features = pd.DataFrame(tokenized_features.toarray(), columns=vect.get_feature_names_out())
features.head()

sentiment_df["char_count"] = sentiment_df["Comment"].str.count(r"\S")
sentiment_df["word_count"] = sentiment_df["Comment"].str.count(pattern)
sentiment_df["avg_word_length"] = sentiment_df["char_count"]/sentiment_df["word_count"]
sentiment_df["avg_word_length"] = sentiment_df["avg_word_length"].replace(np.inf, 0)
sentiment_df.head()

x = pd.concat([features, sentiment_df.loc[:, "char_count":]], axis=1)
y = sentiment_df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(classification_report(y_test, y_pred))

ConfusionMatrixDisplay.from_estimator(rf, x_test, y_test, normalize="all")
plt.title("Confusion Matrix")
plt.show()
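Random forests expose per-feature importances, which show whether the TF-IDF terms or the handcrafted length features drive the predictions; a minimal sketch over the fitted rf and the x columns from above:

importances = pd.Series(rf.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False).head(10))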
Exp 7: topic modeling with LDA (gensim) and pyLDAvis
!pip install gensim pyLDAvis
import random
import nltk
import pandas as pd
from gensim import corpora, models
import pyLDAvis
# minimal pyLDAvis import that works with gensim 4+
try:
    import pyLDAvis.gensim_models as gensimvis
except Exception:
    import pyLDAvis.gensim as gensimvis
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download('punkt_tab', quiet=True) # Added to resolve LookupError for punkt_tab
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
# --- synthetic dataset (small) ---
templates = [
    "I love the new phone battery life!",
    "What a game last night, unbelievable comeback!",
    "Tried a new ramen place and it was fantastic.",
    "Just booked a trip to Bali — any tips?",
    "That movie made me cry, such a masterpiece.",
]
comments = [random.choice(templates) + random.choice(["", " :)", " !!!", " 🤯"]) for _ in range(200)]
# --- preprocess ---
def tok(s):
    t = nltk.word_tokenize(str(s).lower())
    return [w for w in t if w.isalpha() and w not in stop and len(w) > 2]
docs = [tok(c) for c in comments]
docs = [d for d in docs if d] # drop empty
# --- dict, corpus, LDA ---
dictionary = corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=2, no_above=0.8)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10, alpha="auto")
# --- output ---
for i, t in lda.print_topics(-1):
    print(f"Topic {i}: {t}")
vis = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis, "lda_core_short.html")
print("Saved: lda_core_short.html")