Reddit Sentiment Analysis Y4 v3.0.0
app.py
# Mantas Macionis
# C00242178
# Retrieval, preprocessing of comments and sentiment prediction.

from flask import Flask, render_template, request, flash, redirect, url_for
from flask_sqlalchemy import SQLAlchemy
import matplotlib
matplotlib.use('Agg')  # Use the non-GUI 'Agg' backend
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime, timedelta
import openai
import praw
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, GPT2Tokenizer
import torch
import re, string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import os
from uuid import uuid4
from wordcloud import WordCloud, STOPWORDS
import urllib.parse
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy import func
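
# NOTE (assumption): word_tokenize, stopwords and WordNetLemmatizer rely on the
# NLTK data packages 'punkt', 'stopwords' and 'wordnet'. If they are missing,
# a one-time setup along these lines should suffice:
#   import nltk
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('wordnet')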


app = Flask(__name__)
app.secret_key = os.getenv('FLASK_SECRET_KEY', 'AAA')  # 'AAA' fallback is for local development only

# Configure SQLite database
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///search_history.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)

# Returns the current UTC time plus one hour (used for record timestamps)
def get_utc_plus_one():
    return datetime.utcnow() + timedelta(hours=1)

class SearchHistory(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    search_term = db.Column(db.String(150), nullable=False)
    subreddit = db.Column(db.String(150), nullable=True)
    sort_order = db.Column(db.String(50), nullable=True)
    time_filter = db.Column(db.String(50), nullable=True)
    comment_sort_order = db.Column(db.String(50), nullable=True)
    overall_sentiment = db.Column(db.String(50), nullable=False)
    positive_percentage = db.Column(db.Float, nullable=False)
    total_comments = db.Column(db.Integer, nullable=False)
    created_at = db.Column(db.DateTime, default=get_utc_plus_one)
    quick_sentiment_summary = db.Column(db.String(150), nullable=True)
    analysis_description = db.Column(db.String(512), nullable=True)
    sentiment_pie_chart_filename = db.Column(db.String(255), nullable=True)  # Sentiment pie chart image path
    word_cloud_filename = db.Column(db.String(255), nullable=True)  # Word cloud image path
    gpt_prompt_type = db.Column(db.String(100), nullable=True)  # GPT prompt type used for the analysis


# OpenAI API key (left blank here; set your own key before running)
openai.api_key = ''

# Reddit API credentials (left blank here; fill in before running)
reddit = praw.Reddit(client_id='',
                     client_secret='',
                     user_agent='')
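
# The PRAW credentials above are intentionally blank. A "script"-type app created
# at https://www.reddit.com/prefs/apps provides the client id and secret; the
# user agent is any descriptive string (read-only access is assumed sufficient
# for the search and comment retrieval below).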

# Load the fine-tuned DistilBERT model and its tokenizer
model = DistilBertForSequenceClassification.from_pretrained('C:\\Users\\35387\\Desktop\\app1\\distilbert_sentiment_analysis')
tokenizer = DistilBertTokenizerFast.from_pretrained('C:\\Users\\35387\\Desktop\\app1\\distilbert_sentiment_analysis_tokenizer')
model.eval()

def fetch_comments_from_posts(search_term, subreddit='all', sort_order='default', time_filter='all', comment_sort_order='top', max_comments=1000, max_comments_per_post=75):
    comments = []
    comment_count = 0
    error_message = None  # Stays None unless the search fails

    search_query = {"time_filter": time_filter}
    if sort_order != 'default':
        search_query["sort"] = sort_order

    try:
        search_results = reddit.subreddit(subreddit).search(search_term, **search_query)

        for submission in search_results:
            if comment_count >= max_comments:
                break
            submission.comment_sort = comment_sort_order
            submission.comments.replace_more(limit=1)
            post_comment_count = 0  # Per-post counter enforces the per-post cap
            for comment in submission.comments.list():
                if comment_count >= max_comments or post_comment_count >= max_comments_per_post:
                    break
                comments.append(comment.body)
                comment_count += 1
                post_comment_count += 1
    except Exception:
        error_message = "The subreddit you entered could not be found. Please check the name and try again."

    return comments, error_message
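
# Illustrative call (requires network access and valid Reddit credentials):
#   comments, err = fetch_comments_from_posts("electric cars", subreddit="technology")
#   -> up to 1000 comment bodies, or ([], error message) on failure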


def preprocess_text(text):
    if pd.isna(text):
        text = ''
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
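
# Illustrative behaviour: URLs are stripped first, then case, punctuation,
# stop words and inflection, e.g.
#   preprocess_text("I LOVED these phones! http://a.co") -> "loved phone"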

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    sentiment = torch.argmax(logits, dim=1).item()
    return 'Positive' if sentiment == 1 else 'Negative'
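
# Note: label 1 = Positive and 0 = Negative is assumed to match the fine-tuned
# checkpoint's training setup; adjust the mapping if your checkpoint differs.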

def interpret_sentiment(positive_percentage):
    if positive_percentage >= 90:
        return 'Highly Positive'
    elif positive_percentage >= 76:
        return 'Very Positive'
    elif positive_percentage >= 56:
        return 'Mostly Positive'
    elif positive_percentage >= 45:
        return 'Neutral'
    elif positive_percentage >= 26:
        return 'Mostly Negative'
    elif positive_percentage >= 11:
        return 'Very Negative'
    else:
        return 'Highly Negative'
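
# Example: interpret_sentiment(60.0) -> 'Mostly Positive';
#          interpret_sentiment(50.0) -> 'Neutral'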

# Convert bullet points and bold markers in GPT output to an HTML list
def convert_to_html_list(text):
    # Split the text into lines
    lines = text.split('\n')

    # Initialize an empty string to build the HTML
    html_content = ""

    for line in lines:
        # A line starting with double asterisks indicates a heading or bold text
        if line.strip().startswith('**'):
            # Find the position of the closing double asterisks, if any
            end_bold = line.find('**', 2)
            if end_bold != -1:
                # Bold only the text between the asterisk pairs and
                # append the rest of the line as regular text
                bold_text = line[2:end_bold].strip()
                regular_text = line[end_bold+2:].strip()
                html_content += f"<strong>{bold_text}</strong> {regular_text}<br>"
            else:
                # If no closing asterisks, bold the entire line (removing the initial asterisks)
                cleaned_line = line.strip().strip('*').strip()
                html_content += f"<strong>{cleaned_line}</strong><br>"
        elif line.strip().startswith('-'):
            # Remove the dash and convert to an HTML list item
            cleaned_line = line.strip()[1:].strip()
            html_content += f"<li>{cleaned_line}</li>"
        else:
            # Treat lines without bullet points as regular text, not as list items
            html_content += f"{line}<br>"

    # Wrap the output in <ul> tags if there are any list items
    if "<li>" in html_content:
        html_content = f"<ul>{html_content}</ul>"
    else:
        # Otherwise drop the trailing <br> tag; str.rstrip strips characters,
        # not substrings, so slice it off instead
        if html_content.endswith("<br>"):
            html_content = html_content[:-len("<br>")]

    return html_content
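
# Illustrative conversion (GPT-style markdown to HTML):
#   convert_to_html_list("**Summary** mixed views\n- price praised")
#   -> "<ul><strong>Summary</strong> mixed views<br><li>price praised</li></ul>"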

def prepare_comments_for_gpt(comments, max_tokens=16385, prompt_type='default'):
    # Map prompt type to filename
    prompt_files = {
        'default': 'data/gpt_prompt.txt',
        'contextual': 'data/gpt_contextual.txt',
        'emotional': 'data/gpt_emotional.txt',
        'comparative': 'data/gpt_comparative.txt',
        'impact': 'data/gpt_impact.txt',
        'ai_generated': 'data/gpt_ai_generated.txt'  # Path for AI-generated prompts
    }
    prompt_file = prompt_files.get(prompt_type, 'data/gpt_prompt.txt')

    # Initialize the GPT-2 tokenizer (named to avoid shadowing the module-level DistilBERT tokenizer)
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Load the prompt text from a file
    prompt_path = os.path.join(os.path.dirname(__file__), prompt_file)
    with open(prompt_path, 'r') as file:
        prompt_text = file.read()

    safe_margin = 512  # Space for the model's response

    # Tokenize the prompt to calculate its token count
    prompt_tokens = gpt2_tokenizer.encode(prompt_text, add_special_tokens=True)

    # The maximum number of tokens that can be dedicated to comments
    max_comment_tokens = max_tokens - len(prompt_tokens) - safe_margin

    prepared_comments = []
    total_tokens = 0

    for comment in comments:
        comment_tokens = gpt2_tokenizer.encode(comment, add_special_tokens=True)
        if total_tokens + len(comment_tokens) > max_comment_tokens:
            break
        prepared_comments.append(comment)
        total_tokens += len(comment_tokens)

    # After collecting comments, check that the combined text does not exceed the token limit
    combined_text = f"{prompt_text}\n\n" + '\n'.join([f"{idx + 1}: {comment}" for idx, comment in enumerate(prepared_comments)])
    combined_tokens = gpt2_tokenizer.encode(combined_text, add_special_tokens=True)

    # If the combined text exceeds the budget, trim comments from the end
    while len(combined_tokens) > max_tokens - safe_margin:
        if not prepared_comments:
            break  # Avoids an infinite loop if no comments can fit
        prepared_comments.pop()  # Remove the last comment
        combined_text = f"{prompt_text}\n\n" + '\n'.join([f"{idx + 1}: {comment}" for idx, comment in enumerate(prepared_comments)])
        combined_tokens = gpt2_tokenizer.encode(combined_text, add_special_tokens=True)

    return combined_text, len(prepared_comments)
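
# Note: the GPT-2 tokenizer only approximates gpt-3.5-turbo's actual tokenizer,
# so the counts above are estimates; the 512-token safe margin is assumed to
# absorb the difference (tiktoken would give exact counts for the chat models).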


def chatgpt_sentiment_analysis(comments, prompt_type):
    combined_text, used_comments = prepare_comments_for_gpt(comments, prompt_type=prompt_type)

    if used_comments == 0:
        return "Error", "No comments were analyzed due to token limit restrictions."

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": combined_text}
            ],
            temperature=0.7,
            max_tokens=512,
        )

        full_response = response["choices"][0]["message"]["content"].strip()

        # Check if the prompt type is 'ai_generated'
        if prompt_type == 'ai_generated':
            # Use a hardcoded quick sentiment summary and extract the description
            quick_sentiment_summary = "No label due to AI-generated prompt"
            analysis_description = convert_to_html_list(full_response)
        else:
            # Attempt to find the sentiment label in the response as usual
            label_match = re.search(r"Overall Sentiment:\s*(Highly Negative|Very Negative|Mostly Negative|Neutral|Mostly Positive|Very Positive|Highly Positive)", full_response, re.IGNORECASE)
            if label_match:
                quick_sentiment_summary = label_match.group(1).strip()  # Extract the sentiment label
                analysis_description_raw = full_response.replace(label_match.group(0), '').strip()
                analysis_description = convert_to_html_list(analysis_description_raw)
            else:
                quick_sentiment_summary = "Label not found"
                analysis_description = "Analysis description not found due to an unexpected response format."

    except Exception as e:
        print(f"An error occurred: {e}")
        quick_sentiment_summary = "Error"
        analysis_description = "Could not analyze sentiment due to an error."

    return quick_sentiment_summary, analysis_description


# Visualization Functions
def save_sentiment_pie_chart(positive_percentage, image_path):
    labels = ['Positive', 'Negative']
    sizes = [positive_percentage, 100 - positive_percentage]
    colors = ['#ff9999', '#66b3ff']

    plt.figure(figsize=(5, 5))
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')  # Equal aspect ratio ensures that the pie is drawn as a circle
    plt.savefig(image_path)
    plt.close()

def save_word_cloud(comments, image_path):
    text = ' '.join(comments)
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', max_words=100, width=800, height=400).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(image_path)
    plt.close()

def fetch_comments(search_term, subreddit='all', limit=5):
    comments = []
    try:
        # Fetch submissions from the subreddit based on the search term
        for submission in reddit.subreddit(subreddit).search(search_term, limit=10):
            submission.comment_sort = 'top'  # Sort comments by top
            submission.comments.replace_more(limit=0)  # Expand MoreComments placeholders
            for comment in submission.comments.list():
                if len(comments) < limit:
                    comments.append(comment.body)  # Collect comment text
                else:
                    break
            if len(comments) >= limit:
                break
    except Exception as e:
        print(f"An error occurred while fetching comments: {str(e)}")
        return []

    return comments

@app.route('/analysis', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        search_term = request.form.get('search_term')
        prompt_type = request.form.get('prompt_type') or 'default'
        subreddit = request.form.get('subreddit') or 'all'
        sort_order = request.form.get('sort_order') or 'default'
        time_filter = request.form.get('time_filter') or 'all'
        comment_sort_order = request.form.get('comment_sort_order') or 'top'

        comments, error_message = fetch_comments_from_posts(search_term, subreddit, sort_order, time_filter, comment_sort_order)

        # Initialize filenames to None or a default value
        sentiment_pie_chart_filename = None
        word_cloud_filename = None

        if error_message:
            return render_template('index.html', error_message=error_message)

        if not comments:
            error_message = "Sorry, we couldn't find any comments related to your search. Please try a different search term or subreddit."
            return render_template('index.html', error_message=error_message)

        # Continue processing only if comments are present
        else:
            preprocessed_comments = [preprocess_text(comment) for comment in comments]
            sentiments = [predict_sentiment(comment) for comment in preprocessed_comments]

            positive_count = sentiments.count('Positive')
            total_comments = len(sentiments)
            positive_percentage = (positive_count / total_comments * 100) if total_comments else 0
            overall_sentiment = interpret_sentiment(positive_percentage)

            quick_sentiment_summary, analysis_description = chatgpt_sentiment_analysis(comments, prompt_type)

            # File names for the visualizations
            sentiment_pie_chart_filename = f"images/sentiment_pie_{uuid4()}.png"
            word_cloud_filename = f"images/word_cloud_{uuid4()}.png"

            # Generate and save the visualizations
            sentiment_pie_chart_path = os.path.join(app.static_folder, sentiment_pie_chart_filename)
            word_cloud_path = os.path.join(app.static_folder, word_cloud_filename)

            save_sentiment_pie_chart(positive_percentage, sentiment_pie_chart_path)
            save_word_cloud(comments, word_cloud_path)

            new_search = SearchHistory(
                search_term=search_term,
                subreddit=subreddit,
                sort_order=sort_order,
                time_filter=time_filter,
                comment_sort_order=comment_sort_order,
                overall_sentiment=overall_sentiment,
                positive_percentage=positive_percentage,
                total_comments=total_comments,
                quick_sentiment_summary=quick_sentiment_summary,
                analysis_description=analysis_description,
                sentiment_pie_chart_filename=sentiment_pie_chart_filename,
                word_cloud_filename=word_cloud_filename,
                gpt_prompt_type=prompt_type
            )
            db.session.add(new_search)
            db.session.commit()

            return render_template('index.html', search_term=search_term, subreddit=subreddit, sort_order=sort_order, time_filter=time_filter, comment_sort_order=comment_sort_order, overall_sentiment=overall_sentiment, positive_percentage=positive_percentage, total_comments=total_comments, quick_sentiment_summary=quick_sentiment_summary, analysis_description=analysis_description, sentiment_pie_chart_filename=sentiment_pie_chart_filename, word_cloud_filename=word_cloud_filename, prompt_type=prompt_type)

    return render_template('index.html')

@app.route('/generate_prompt', methods=['POST'])
def generate_prompt():
    search_term = request.form.get('search_term', '').strip()
    emphasis_terms = request.form.get('emphasis_terms', '').strip()

    # Less restrictive prompt structure
    base_prompt = "(do not reply conversationally, simply do the task), create a prompt which will be given to an ai such as gpt 3.5, to make it analyze comments related to {} in the context of sentiment analysis, make the prompt nuanced and well thought out in terms of retrieving information which relates to aspects of:".format(search_term)

    if emphasis_terms:
        base_prompt += " Focus particularly on aspects such as {}.".format(emphasis_terms)

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": base_prompt}
            ],
            temperature=0.5
        )
        prompt = response['choices'][0]['message']['content'].strip()

        # Save the AI-generated prompt to a file
        prompt_path = os.path.join(os.path.dirname(__file__), 'data', 'gpt_ai_generated.txt')
        with open(prompt_path, 'w') as file:
            file.write(prompt)

        return {'prompt': prompt, 'success': True}
    except Exception as e:
        return {'error': str(e), 'success': False}

@app.route('/validate_search_term', methods=['POST'])
def validate_search_term():
    search_term = request.form.get('search_term')
    comments = fetch_comments(search_term)

    if not comments:
        return {'valid': False, 'message': "Invalid search term entered. Please try a different search term before generating a prompt."}
    else:
        return {'valid': True, 'message': "Comments found. Proceeding with prompt generation."}

@app.route('/')
def landing():
    return render_template('landing.html')

@app.route('/history')
def history():
    sort_order = request.args.get('sort', 'newest')
    search_term = request.args.get('search')

    query = SearchHistory.query

    if search_term:
        # Lowercase both the search term and the stored field for case-insensitive matching
        search_term = search_term.lower()
        query = query.filter(func.lower(SearchHistory.search_term) == search_term)

    if sort_order == 'newest':
        query = query.order_by(SearchHistory.created_at.desc())
    elif sort_order == 'oldest':
        query = query.order_by(SearchHistory.created_at)
    elif sort_order == 'sentiment_asc':
        query = query.order_by(SearchHistory.overall_sentiment)
    elif sort_order == 'sentiment_desc':
        query = query.order_by(SearchHistory.overall_sentiment.desc())

    searches = query.all()

    if not searches and search_term:
        flash('No results found for your search term. Please try a different one.', 'info')

    return render_template('history.html', searches=searches)

if __name__ == '__main__':
    with app.app_context():
        db.create_all()
    app.run(debug=True)
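
# To run locally (a sketch, assuming the model paths above are adjusted and the
# API credentials are filled in; the dependency list is inferred from the imports):
#   pip install flask flask_sqlalchemy praw openai transformers torch nltk wordcloud matplotlib seaborn pandas
#   python app.py        # Flask's development server listens on http://127.0.0.1:5000/ by default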