def is_english(text) -> bool:
    """Return True when the module-level Lingua ``detector`` reports English.

    Args:
        text: Candidate review text. Anything that is not a non-blank ``str``
            (``None``, numbers, empty or whitespace-only strings) is rejected
            up front, so missing review bodies never reach the detector and
            never produce an error log entry.

    Returns:
        True only if ``detector.detect_language_of(text)`` equals
        ``Language.ENGLISH``; False for every other result, for unusable
        input, and when detection itself raises.
    """
    # Guard clause: scraped reviews frequently have no text at all. Treat
    # those as "not English" quietly instead of relying on the except branch.
    if not isinstance(text, str) or not text.strip():
        return False

    try:
        detected_language = detector.detect_language_of(text)
    except Exception as e:
        # Best-effort filter: a detector failure must not abort the request,
        # so log it and exclude the review.
        logger.error(f"Error detecting language: {e}")
        return False

    return detected_language == Language.ENGLISH
+nlp = spacy.load("en_core_web_sm") +analyzer = SentimentIntensityAnalyzer() + +def clean_text(text): + doc = nlp(text) + cleaned_tokens = [] + + for token in doc: + # Only keep alphabetic tokens, remove stopwords and punctuation + if token.is_alpha and not token.is_stop: + cleaned_tokens.append(token.text.lower()) + + return ' '.join(cleaned_tokens) + +def analyze_sentiments(reviews): + sentiments = [] + + for review in reviews: + review_text = review['review_text'] + + # Clean the review text using spaCy + cleaned_text = clean_text(review_text) + + # Perform sentiment analysis with VADER on the cleaned text + sentiment_score = analyzer.polarity_scores(cleaned_text)['compound'] + + # Rescale VADER score from [-1, 1] to [0, 10] + scaled_score = (sentiment_score + 1) * 5 # Transforms the score to [0, 10] + sentiments.append(scaled_score) + + # Calculate the average sentiment + if sentiments: + average_sentiment = sum(sentiments) / len(sentiments) + else: + average_sentiment = 0.0 + + return average_sentiment diff --git a/backend/requirements.txt b/backend/requirements.txt index 9fe5e39..8758df4 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -5,16 +5,22 @@ annotated-types==0.7.0 anyio==4.4.0 attrs==24.2.0 Authlib==1.3.2 +blis==1.0.1 +catalogue==2.0.10 certifi==2024.7.4 cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 +cloudpathlib==0.19.0 +confection==0.1.5 cryptography==43.0.0 +cymem==2.0.8 deprecation==2.1.0 dnspython==2.6.1 dparse==0.6.4b0 ecdsa==0.19.0 email_validator==2.2.0 +en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 fastapi==0.112.0 filelock==3.12.4 frozenlist==1.4.1 @@ -28,17 +34,24 @@ hyperframe==6.0.1 idna==3.7 Jinja2==3.1.4 joblib==1.4.2 +langcodes==3.4.1 +language_data==1.2.0 +lingua-language-detector==2.0.2 +marisa-trie==1.2.0 markdown-it-py==3.0.0 MarkupSafe==2.1.5 
marshmallow==3.22.0 mdurl==0.1.2 multidict==6.0.5 +murmurhash==1.0.10 nltk==3.9.1 +numpy==2.0.2 outscraper==5.2.1 packaging==24.1 pip-autoremove==0.10.0 pipdeptree==2.23.1 postgrest==0.16.11 +preshed==3.0.9 psutil==6.0.0 pyasn1==0.6.0 pycparser==2.22 @@ -61,18 +74,28 @@ safety-schemas==0.0.5 setuptools==73.0.1 shellingham==1.5.4 six==1.16.0 +smart-open==7.0.5 sniffio==1.3.1 +spacy==3.8.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 starlette==0.37.2 storage3==0.7.7 StrEnum==0.4.15 supabase==2.7.3 supafunc==0.5.1 textblob==0.18.0.post0 +thinc==8.3.2 tqdm==4.66.5 typer==0.12.5 typing_extensions==4.12.2 urllib3==2.2.2 uvicorn==0.30.5 +vaderSentiment==3.3.2 +wasabi==1.1.3 +weasel==0.4.1 websockets==12.0 +wrapt==1.16.0 yarl==1.9.4 zipp==3.20.1 diff --git a/frontend/src/app/restaurant/[id]/page.tsx b/frontend/src/app/restaurant/[id]/page.tsx index e5746e4..ad5c268 100644 --- a/frontend/src/app/restaurant/[id]/page.tsx +++ b/frontend/src/app/restaurant/[id]/page.tsx @@ -56,11 +56,11 @@ const RestaurantPage: React.FC = () => { } if (data.average_sentiment !== undefined) { - const scaledSentiment = ((data.average_sentiment + 1) / 2) * 10; - setAverageSentiment(scaledSentiment); - } else { - setAverageSentiment(null); - } + setAverageSentiment(data.average_sentiment); // No additional scaling needed + } else { + setAverageSentiment(null); + } + } catch (error) { console.error('Failed to fetch reviews:', error); } finally {