Reddit Sentiment Analysis Y4 v3.0.0
app.py
# Mantas Macionis
# C00242178
# Retrieval, preprocessing of comments and sentiment prediction.

from flask import Flask, render_template, request, flash, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
import matplotlib
matplotlib.use('Agg')  # Use the non-GUI 'Agg' backend
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime, timedelta
import openai
import praw
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, GPT2Tokenizer
import torch
import re, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import os
from uuid import uuid4
from wordcloud import WordCloud, STOPWORDS
import urllib.parse
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import func
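
# NOTE (assumption): word_tokenize, stopwords and WordNetLemmatizer rely on the
# NLTK data packages 'punkt', 'stopwords' and 'wordnet'. If they are missing,
# a one-time setup along these lines should suffice:
#   import nltk
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('wordnet')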


app = Flask(__name__)
app.secret_key = os.getenv('FLASK_SECRET_KEY', 'AAA')  # 'AAA' fallback is for local development only

# Configure SQLite database
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///search_history.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)

# Returns the current UTC time plus one hour (used for record timestamps)
def get_utc_plus_one():
    return datetime.utcnow() + timedelta(hours=1)

class SearchHistory(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    search_term = db.Column(db.String(150), nullable=False)
    subreddit = db.Column(db.String(150), nullable=True)
    sort_order = db.Column(db.String(50), nullable=True)
    time_filter = db.Column(db.String(50), nullable=True)
    comment_sort_order = db.Column(db.String(50), nullable=True)
    overall_sentiment = db.Column(db.String(50), nullable=False)
    positive_percentage = db.Column(db.Float, nullable=False)
    total_comments = db.Column(db.Integer, nullable=False)
    created_at = db.Column(db.DateTime, default=get_utc_plus_one)
    quick_sentiment_summary = db.Column(db.String(150), nullable=True)
    analysis_description = db.Column(db.String(512), nullable=True)
    sentiment_pie_chart_filename = db.Column(db.String(255), nullable=True)  # Sentiment pie chart image path
    word_cloud_filename = db.Column(db.String(255), nullable=True)  # Word cloud image path
    gpt_prompt_type = db.Column(db.String(100), nullable=True)  # GPT prompt type used for the analysis


# OpenAI API key (left blank here; set your own key before running)
openai.api_key = ''

# Reddit API credentials (left blank here; fill in before running)
reddit = praw.Reddit(client_id='',
                     client_secret='',
                     user_agent='')
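
# The PRAW credentials above are intentionally blank. A "script"-type app created
# at https://www.reddit.com/prefs/apps provides the client id and secret; the
# user agent is any descriptive string (read-only access is assumed sufficient
# for the search and comment retrieval below).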

# Load the fine-tuned DistilBERT model and its tokenizer
model = DistilBertForSequenceClassification.from_pretrained('C:\\Users\\35387\\Desktop\\app1\\distilbert_sentiment_analysis')
tokenizer = DistilBertTokenizerFast.from_pretrained('C:\\Users\\35387\\Desktop\\app1\\distilbert_sentiment_analysis_tokenizer')
model.eval()

def fetch_comments_from_posts(search_term, subreddit='all', sort_order='default', time_filter='all', comment_sort_order='top', max_comments=1000, max_comments_per_post=75):
    comments = []
    comment_count = 0
    error_message = None  # Stays None unless the search fails

    search_query = {"time_filter": time_filter}
    if sort_order != 'default':
        search_query["sort"] = sort_order

    try:
        search_results = reddit.subreddit(subreddit).search(search_term, **search_query)

        for submission in search_results:
            if comment_count >= max_comments:
                break
            submission.comment_sort = comment_sort_order
            submission.comments.replace_more(limit=1)
            post_comment_count = 0  # Per-post counter enforces the per-post cap
            for comment in submission.comments.list():
                if comment_count >= max_comments or post_comment_count >= max_comments_per_post:
                    break
                comments.append(comment.body)
                comment_count += 1
                post_comment_count += 1
    except Exception:
        error_message = "The subreddit you entered could not be found. Please check the name and try again."

    return comments, error_message
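
# Illustrative call (requires network access and valid Reddit credentials):
#   comments, err = fetch_comments_from_posts("electric cars", subreddit="technology")
#   -> up to 1000 comment bodies, or ([], error message) on failure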


def preprocess_text(text):
    if pd.isna(text):
        text = ''
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
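
# Illustrative behaviour: URLs are stripped first, then case, punctuation,
# stop words and inflection, e.g.
#   preprocess_text("I LOVED these phones! http://a.co") -> "loved phone"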

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    sentiment = torch.argmax(logits, dim=1).item()
    return 'Positive' if sentiment == 1 else 'Negative'
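
# Note: label 1 = Positive and 0 = Negative is assumed to match the fine-tuned
# checkpoint's training setup; adjust the mapping if your checkpoint differs.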

def interpret_sentiment(positive_percentage):
    if positive_percentage >= 90:
        return 'Highly Positive'
    elif positive_percentage >= 76:
        return 'Very Positive'
    elif positive_percentage >= 56:
        return 'Mostly Positive'
    elif positive_percentage >= 45:
        return 'Neutral'
    elif positive_percentage >= 26:
        return 'Mostly Negative'
    elif positive_percentage >= 11:
        return 'Very Negative'
    else:
        return 'Highly Negative'
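
# Example: interpret_sentiment(60.0) -> 'Mostly Positive';
#          interpret_sentiment(50.0) -> 'Neutral'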

# Convert bullet points and bold markers in GPT output to an HTML list
def convert_to_html_list(text):
    # Split the text into lines
    lines = text.split('\n')

    # Initialize an empty string to build the HTML
    html_content = ""

    for line in lines:
        # A line starting with double asterisks indicates a heading or bold text
        if line.strip().startswith('**'):
            # Find the position of the closing double asterisks, if any
            end_bold = line.find('**', 2)
            if end_bold != -1:
                # Bold only the text between the asterisk pairs and
                # append the rest of the line as regular text
                bold_text = line[2:end_bold].strip()
                regular_text = line[end_bold+2:].strip()
                html_content += f"<strong>{bold_text}</strong> {regular_text}<br>"
            else:
                # If no closing asterisks, bold the entire line (removing the initial asterisks)
                cleaned_line = line.strip().strip('*').strip()
                html_content += f"<strong>{cleaned_line}</strong><br>"
        elif line.strip().startswith('-'):
            # Remove the dash and convert to an HTML list item
            cleaned_line = line.strip()[1:].strip()
            html_content += f"<li>{cleaned_line}</li>"
        else:
            # Treat lines without bullet points as regular text, not as list items
            html_content += f"{line}<br>"

    # Wrap the output in <ul> tags if there are any list items
    if "<li>" in html_content:
        html_content = f"<ul>{html_content}</ul>"
    else:
        # Otherwise drop the trailing <br> tag; str.rstrip strips characters,
        # not substrings, so slice it off instead
        if html_content.endswith("<br>"):
            html_content = html_content[:-len("<br>")]

    return html_content
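
# Illustrative conversion (GPT-style markdown to HTML):
#   convert_to_html_list("**Summary** mixed views\n- price praised")
#   -> "<ul><strong>Summary</strong> mixed views<br><li>price praised</li></ul>"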

def prepare_comments_for_gpt(comments, max_tokens=16385, prompt_type='default'):
    # Map prompt type to filename
    prompt_files = {
        'default': 'data/gpt_prompt.txt',
        'contextual': 'data/gpt_contextual.txt',
        'emotional': 'data/gpt_emotional.txt',
        'comparative': 'data/gpt_comparative.txt',
        'impact': 'data/gpt_impact.txt',
        'ai_generated': 'data/gpt_ai_generated.txt'  # Path for AI-generated prompts
    }
    prompt_file = prompt_files.get(prompt_type, 'data/gpt_prompt.txt')

    # Initialize the GPT-2 tokenizer (named to avoid shadowing the module-level DistilBERT tokenizer)
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Load the prompt text from a file
    prompt_path = os.path.join(os.path.dirname(__file__), prompt_file)
    with open(prompt_path, 'r') as file:
        prompt_text = file.read()

    safe_margin = 512  # Space for the model's response

    # Tokenize the prompt to calculate its token count
    prompt_tokens = gpt2_tokenizer.encode(prompt_text, add_special_tokens=True)

    # The maximum number of tokens that can be dedicated to comments
    max_comment_tokens = max_tokens - len(prompt_tokens) - safe_margin

    prepared_comments = []
    total_tokens = 0

    for comment in comments:
        comment_tokens = gpt2_tokenizer.encode(comment, add_special_tokens=True)
        if total_tokens + len(comment_tokens) > max_comment_tokens:
            break
        prepared_comments.append(comment)
        total_tokens += len(comment_tokens)

    # After collecting comments, check that the combined text does not exceed the token limit
    combined_text = f"{prompt_text}\n\n" + '\n'.join([f"{idx + 1}: {comment}" for idx, comment in enumerate(prepared_comments)])
    combined_tokens = gpt2_tokenizer.encode(combined_text, add_special_tokens=True)

    # If the combined text exceeds the budget, trim comments from the end
    while len(combined_tokens) > max_tokens - safe_margin:
        if not prepared_comments:
            break  # Avoids an infinite loop if no comments can fit
        prepared_comments.pop()  # Remove the last comment
        combined_text = f"{prompt_text}\n\n" + '\n'.join([f"{idx + 1}: {comment}" for idx, comment in enumerate(prepared_comments)])
        combined_tokens = gpt2_tokenizer.encode(combined_text, add_special_tokens=True)

    return combined_text, len(prepared_comments)
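
# Note: the GPT-2 tokenizer only approximates gpt-3.5-turbo's actual tokenizer,
# so the counts above are estimates; the 512-token safe margin is assumed to
# absorb the difference (tiktoken would give exact counts for the chat models).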


def chatgpt_sentiment_analysis(comments, prompt_type):
    combined_text, used_comments = prepare_comments_for_gpt(comments, prompt_type=prompt_type)

    if used_comments == 0:
        return "Error", "No comments were analyzed due to token limit restrictions."

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_text}
            ],
            temperature=0.7,
            max_tokens=512,
        )

        full_response = response["choices"][0]["message"]["content"].strip()

        # Check if the prompt type is 'ai_generated'
        if prompt_type == 'ai_generated':
            # Use a hardcoded quick sentiment summary and extract the description
            quick_sentiment_summary = "No label due to AI-generated prompt"
            analysis_description = convert_to_html_list(full_response)
        else:
            # Attempt to find the sentiment label in the response as usual
            label_match = re.search(r"Overall Sentiment:\s*(Highly Negative|Very Negative|Mostly Negative|Neutral|Mostly Positive|Very Positive|Highly Positive)", full_response, re.IGNORECASE)
            if label_match:
                quick_sentiment_summary = label_match.group(1).strip()  # Extract the sentiment label
                analysis_description_raw = full_response.replace(label_match.group(0), '').strip()
                analysis_description = convert_to_html_list(analysis_description_raw)
            else:
                quick_sentiment_summary = "Label not found"
                analysis_description = "Analysis description not found due to an unexpected response format."

    except Exception as e:
        print(f"An error occurred: {e}")
        quick_sentiment_summary = "Error"
        analysis_description = "Could not analyze sentiment due to an error."

    return quick_sentiment_summary, analysis_description


# Visualization Functions
def save_sentiment_pie_chart(positive_percentage, image_path):
    labels = ['Positive', 'Negative']
    sizes = [positive_percentage, 100 - positive_percentage]
    colors = ['#ff9999', '#66b3ff']

    plt.figure(figsize=(5, 5))
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')  # Equal aspect ratio ensures that the pie is drawn as a circle
    plt.savefig(image_path)
    plt.close()

def save_word_cloud(comments, image_path):
    text = ' '.join(comments)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', max_words=100, width=800, height=400).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(image_path)
    plt.close()

def fetch_comments(search_term, subreddit='all', limit=5):
    comments = []
    try:
        # Fetch submissions from the subreddit based on the search term
        for submission in reddit.subreddit(subreddit).search(search_term, limit=10):
            submission.comment_sort = 'top'  # Sort comments by top
            submission.comments.replace_more(limit=0)  # Expand MoreComments placeholders
            for comment in submission.comments.list():
                if len(comments) < limit:
                    comments.append(comment.body)  # Collect comment text
                else:
                    break
            if len(comments) >= limit:
                break
    except Exception as e:
        print(f"An error occurred while fetching comments: {str(e)}")
        return []

    return comments

@app.route('/analysis', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        search_term = request.form.get('search_term')
        prompt_type = request.form.get('prompt_type') or 'default'
        subreddit = request.form.get('subreddit') or 'all'
        sort_order = request.form.get('sort_order') or 'default'
        time_filter = request.form.get('time_filter') or 'all'
        comment_sort_order = request.form.get('comment_sort_order') or 'top'

        comments, error_message = fetch_comments_from_posts(search_term, subreddit, sort_order, time_filter, comment_sort_order)

        # Initialize filenames to None or a default value
        sentiment_pie_chart_filename = None
        word_cloud_filename = None

        if error_message:
            return render_template('index.html', error_message=error_message)

        if not comments:
            error_message = "Sorry, we couldn't find any comments related to your search. Please try a different search term or subreddit."
            return render_template('index.html', error_message=error_message)

        # Continue processing only if comments are present
        else:
            preprocessed_comments = [preprocess_text(comment) for comment in comments]
            sentiments = [predict_sentiment(comment) for comment in preprocessed_comments]

            positive_count = sentiments.count('Positive')
            total_comments = len(sentiments)
            positive_percentage = (positive_count / total_comments * 100) if total_comments else 0
            overall_sentiment = interpret_sentiment(positive_percentage)

            quick_sentiment_summary, analysis_description = chatgpt_sentiment_analysis(comments, prompt_type)

            # File names for the visualizations
            sentiment_pie_chart_filename = f"images/sentiment_pie_{uuid4()}.png"
            word_cloud_filename = f"images/word_cloud_{uuid4()}.png"

            # Generate and save the visualizations
            sentiment_pie_chart_path = os.path.join(app.static_folder, sentiment_pie_chart_filename)
            word_cloud_path = os.path.join(app.static_folder, word_cloud_filename)

            save_sentiment_pie_chart(positive_percentage, sentiment_pie_chart_path)
            save_word_cloud(comments, word_cloud_path)

            new_search = SearchHistory(
                search_term=search_term,
                subreddit=subreddit,
                sort_order=sort_order,
                time_filter=time_filter,
                comment_sort_order=comment_sort_order,
                overall_sentiment=overall_sentiment,
                positive_percentage=positive_percentage,
                total_comments=total_comments,
                quick_sentiment_summary=quick_sentiment_summary,
                analysis_description=analysis_description,
                sentiment_pie_chart_filename=sentiment_pie_chart_filename,
                word_cloud_filename=word_cloud_filename,
                gpt_prompt_type=prompt_type
            )
            db.session.add(new_search)
            db.session.commit()

            return render_template('index.html', search_term=search_term, subreddit=subreddit, sort_order=sort_order, time_filter=time_filter, comment_sort_order=comment_sort_order, overall_sentiment=overall_sentiment, positive_percentage=positive_percentage, total_comments=total_comments, quick_sentiment_summary=quick_sentiment_summary, analysis_description=analysis_description, sentiment_pie_chart_filename=sentiment_pie_chart_filename, word_cloud_filename=word_cloud_filename, prompt_type=prompt_type)

    return render_template('index.html')

@app.route('/generate_prompt', methods=['POST'])
def generate_prompt():
    search_term = request.form.get('search_term', '').strip()
    emphasis_terms = request.form.get('emphasis_terms', '').strip()

    # Less restrictive prompt structure
    base_prompt = "(do not reply conversationally, simply do the task), create a prompt which will be given to an ai such as gpt 3.5, to make it analyze comments related to {} in the context of sentiment analysis, make the prompt nuanced and well thought out in terms of retrieving information which relates to aspects of:".format(search_term)

    if emphasis_terms:
        base_prompt += " Focus particularly on aspects such as {}.".format(emphasis_terms)

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": base_prompt}
            ],
            temperature=0.5
        )
        prompt = response['choices'][0]['message']['content'].strip()

        # Save the AI-generated prompt to a file
        prompt_path = os.path.join(os.path.dirname(__file__), 'data', 'gpt_ai_generated.txt')
        with open(prompt_path, 'w') as file:
            file.write(prompt)

        return {'prompt': prompt, 'success': True}
    except Exception as e:
        return {'error': str(e), 'success': False}

@app.route('/validate_search_term', methods=['POST'])
def validate_search_term():
    search_term = request.form.get('search_term')
    comments = fetch_comments(search_term)

    if not comments:
        return {'valid': False, 'message': "Invalid search term entered. Please try a different search term before generating a prompt."}
    else:
        return {'valid': True, 'message': "Comments found. Proceeding with prompt generation."}

@app.route('/')
def landing():
    return render_template('landing.html')

@app.route('/history')
def history():
    sort_order = request.args.get('sort', 'newest')
    search_term = request.args.get('search')

    query = SearchHistory.query

    if search_term:
        # Lowercase both the search term and the stored field for case-insensitive matching
        search_term = search_term.lower()
        query = query.filter(func.lower(SearchHistory.search_term) == search_term)

    if sort_order == 'newest':
        query = query.order_by(SearchHistory.created_at.desc())
    elif sort_order == 'oldest':
        query = query.order_by(SearchHistory.created_at)
    elif sort_order == 'sentiment_asc':
        query = query.order_by(SearchHistory.overall_sentiment)
    elif sort_order == 'sentiment_desc':
        query = query.order_by(SearchHistory.overall_sentiment.desc())

    searches = query.all()

    if not searches and search_term:
        flash('No results found for your search term. Please try a different one.', 'info')

    return render_template('history.html', searches=searches)

if __name__ == '__main__':
    with app.app_context():
        db.create_all()
    app.run(debug=True)
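
# To run locally (a sketch, assuming the model paths above are adjusted and the
# API credentials are filled in; the dependency list is inferred from the imports):
#   pip install flask flask_sqlalchemy praw openai transformers torch nltk wordcloud matplotlib seaborn pandas
#   python app.py        # Flask's development server listens on http://127.0.0.1:5000/ by default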