Merge pull request #36 from cornatul/enhancement/code-cleanup

Enhancement/code cleanup
PythonCompany · Nov 23, 2023 · 357008e · 357008e
2 parents 93b7cd9 + 3191dbb
commit 357008e
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 170 deletions.
diff --git a/api/endpoints/nlp.py b/api/endpoints/nlp.py
@@ -0,0 +1,80 @@
+
+import spacy
+import socials
+
+
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from markdownify import markdownify as md
+
+from fastapi import APIRouter, Path, Query, Depends
+from pydantic import BaseModel
+from cachetools import TTLCache
+from newspaper import Config, Article
+from spacy import displacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from spacy.lang.en import English
+
+router = APIRouter()
+
+
+nlp = spacy.load("en_core_web_md")
+
+
+class ArticleAction(BaseModel):
+    link: str
+
+class SummarizeAction(BaseModel):
+    text: str
+
+@router.post("/nlp/article")
+async def root(article: ArticleAction):
+    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
+                 'Chrome/50.0.2661.102 Safari/537.36 '
+    config = Config()
+    config.browser_user_agent = user_agent
+    config.request_timeout = 10
+    config.fetch_images = True
+    config.memoize_articles = True
+    config.follow_meta_refresh = True
+    crawler = Article(article.link, config=config, keep_article_html=True)
+    crawler.download()
+    crawler.parse()
+    # Basic NLP using NLTK
+    crawler.nlp()
+    # New NLP
+    doc = nlp(crawler.text)
+
+    sentiment = SentimentIntensityAnalyzer()
+
+    remove_entities = ["TIME", "DATE", "CARDINAL", "LANGUAGE", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]
+
+    entities = [(e.label_, e.text, e.start_char, e.end_char) for e in doc.ents]
+
+    filtered_entities = [ent for ent in entities if ent[0] not in remove_entities]
+
+    unique_values = set()
+
+    filtered_entities_unique = [ent for ent in filtered_entities if
+                                ent[1] not in unique_values and not unique_values.add(ent[1])]
+
+    social = socials.extract(article.link).get_matches_per_platform()
+
+    return {
+        "data": {
+            "title": crawler.title,
+            "date": crawler.publish_date,
+            "text": crawler.text,
+            "markdown": md(crawler.article_html, newline_style="BACKSLASH", strip=['a'], heading_style="ATX"),
+            "html": crawler.article_html,
+            "summary": crawler.summary,
+            "keywords": crawler.keywords,
+            "authors": crawler.authors,
+            "banner": crawler.top_image,
+            "images": crawler.images,
+            "entities": filtered_entities_unique,
+            "videos": crawler.movies,
+            "social": social,
+            "spacy": displacy.render(doc, style="ent"),
+            "sentiment": sentiment.polarity_scores(crawler.text),
+        },
+    }
diff --git a/api/endpoints/videos.py b/api/endpoints/videos.py
@@ -13,6 +13,13 @@ class TikTokAction(BaseModel):
     token: str
 
 
+async def get_hashtag_videos(token):
+    async with TikTokApi() as api:
+        await api.create_sessions(ms_tokens=token, num_sessions=1, sleep_after=3)
+        tag = api.hashtag(name="funny")
+        return tag.videos(count=30)
+
+
 @router.post("/videos")
 async def root(post: VideosAction):
     from youtube_search import YoutubeSearch

diff --git a/main.py b/main.py
@@ -1,54 +1,15 @@
-import newspaper
-import feedparser
-import json
-import spacy
-import socials
-import tweepy
-import datetime as dt
 import asyncio
-import os
-import subprocess
 
-from pytrends.request import TrendReq
-from datetime import datetime
-from spacy_html_tokenizer import create_html_tokenizer
-from cachetools import TTLCache
-from markdownify import markdownify as md
-from selenium import webdriver
-from feedfinder2 import find_feeds
 from fastapi import FastAPI
 from fastapi.responses import PlainTextResponse
-
 from fastapi.middleware.cors import CORSMiddleware
-from newspaper import Config
-from pydantic import BaseModel
-
-from newspaper import Article
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-from nltk.stem import *
-from nltk.corpus import wordnet
 
-from seoanalyzer import analyze
-from lighthouse import LighthouseRunner
-from classes.Bard import Chatbot
-from spacy import displacy
-from threading import Lock
-from classes.TextSummarizer import *
-from decouple import config
-from TikTokApi import TikTokApi
 
 from api.endpoints import feeds
 from api.endpoints import scrapper
 from api.endpoints import google
 from api.endpoints import seo
-
-stemmer = PorterStemmer()
-wordnet_lemmatizer = WordNetLemmatizer()
-nlp = spacy.load("en_core_web_md")
-now = dt.date.today()
-now = now.strftime('%m-%d-%Y')
-yesterday = dt.date.today() - dt.timedelta(days=1)
-yesterday = yesterday.strftime('%m-%d-%Y')
+from api.endpoints import nlp as nlp_endpoint
 
 app = FastAPI(
     title="Today Intel",
@@ -79,143 +40,14 @@
 app.include_router(scrapper.router)
 app.include_router(google.router)
 app.include_router(seo.router)
-
-
-# Related Functions
-def are_words_related(word1, word2):
-    # Get synsets (sets of synonyms) for each word
-    synsets1 = wordnet.synsets(word1)
-    synsets2 = wordnet.synsets(word2)
-
-    # Check if there is any common synset between the two words
-    common_synsets = set(synsets1).intersection(synsets2)
-
-    return len(common_synsets) > 0
-
-
-async def get_hashtag_videos(token):
-    async with TikTokApi() as api:
-        await api.create_sessions(ms_tokens=token, num_sessions=1, sleep_after=3)
-        tag = api.hashtag(name="funny")
-        return tag.videos(count=30)
-
-
-# Cache Settings
-cache = TTLCache(maxsize=500, ttl=6 * 60 * 60)
-
-
-class ArticleAction(BaseModel):
-    link: str
-
-
-class PostAction(BaseModel):
-    query: str
-
-
-class SummarizeAction(BaseModel):
-    text: str
+app.include_router(nlp_endpoint.router)
 
 
 @app.get("/")
 async def root():
     return {"data": "Welcome to the NLP API - for documentation please visit /docs for "}
 
 
-@app.post("/article")
-async def root(article: ArticleAction):
-    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) ' \
-                 'Chrome/50.0.2661.102 Safari/537.36 '
-    config = Config()
-    config.browser_user_agent = user_agent
-    config.request_timeout = 10
-    config.fetch_images = True
-    config.memoize_articles = True
-    config.follow_meta_refresh = True
-    crawler = Article(article.link, config=config, keep_article_html=True)
-    crawler.download()
-    crawler.parse()
-    # Basic NLP using NTLK
-    crawler.nlp()
-    # New NLP
-    doc = nlp(crawler.text)
-
-    sentiment = SentimentIntensityAnalyzer()
-
-    remove_entities = ["TIME", "DATE", "CARDINAL", "LANGUAGE", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]
-
-    entities = [(e.label_, e.text, e.start_char, e.end_char) for e in doc.ents]
-
-    filtered_entities = [ent for ent in entities if ent[0] not in remove_entities]
-
-    unique_values = set()
-
-    filtered_entities_unique = [ent for ent in filtered_entities if
-                                ent[1] not in unique_values and not unique_values.add(ent[1])]
-
-    social = socials.extract(article.link).get_matches_per_platform()
-
-    return {
-        "data": {
-            "title": crawler.title,
-            "date": crawler.publish_date,
-            "text": crawler.text,
-            "markdown": md(crawler.article_html, newline_style="BACKSLASH", strip=['a'], heading_style="ATX"),
-            "html": crawler.article_html,
-            "summary": crawler.summary,
-            "keywords": crawler.keywords,
-            "authors": crawler.authors,
-            "banner": crawler.top_image,
-            "images": crawler.images,
-            "entities": filtered_entities_unique,
-            "videos": crawler.movies,
-            "social": social,
-            "spacy": displacy.render(doc, style="ent"),
-            "sentiment": sentiment.polarity_scores(crawler.text),
-        },
-    }
-
-
-@app.post("/summarize")
-async def root(summarize: SummarizeAction):
-    # Counting number of words in original article
-    original_words = summarize.text.split()
-    original_words = [w for w in original_words if w.isalnum()]
-    num_words_in_original_text = len(original_words)
-
-    # Converting received text into sapcy Doc object
-    text = nlp(summarize.text)
-
-    # Extracting all sentences from the text in a list
-    sentences = list(text.sents)
-    total_sentences = len(sentences)
-
-    # Generating Frequency Matrix
-    freq_matrix = frequency_matrix(sentences)
-
-    # Generating Term Frequency Matrix
-    tf_matrixx = tf_matrix(freq_matrix)
-
-    # Getting number of sentences containing a particular word
-    num_sent_per_words = sentences_per_words(freq_matrix)
-
-    # Generating ID Frequency Matrix
-    idf_matrixx = idf_matrix(freq_matrix, num_sent_per_words, total_sentences)
-
-    # Generating Tf-Idf Matrix
-    tf_idf_matrixx = tf_idf_matrix(tf_matrixx, idf_matrixx)
-
-    # Generating Sentence score for each sentence
-    sentence_scores = score_sentences(tf_idf_matrixx)
-
-    # Setting threshold to average value (You are free to play with ther values)
-    threshold = average_score(sentence_scores)
-
-    # Getting summary
-    summary = create_summary(sentences, sentence_scores, 1.3 * threshold)
-
-    return {"data": summary}
-
-
 if __name__ == "__main__":
     # Create and run the event loop
     loop = asyncio.get_event_loop()