Added AI-generated summaries and images

2024-06-06 17:40:03 +01:00 · 2024-06-06 17:40:03 +01:00 · 105d14eebc
parent 0c86bd74ef
commit 105d14eebc
7 changed files with 350 additions and 153 deletions
--- a/Docker/config_template.json
+++ b/Docker/config_template.json
@ -16,5 +16,8 @@
        "country" : "$NEWSAPI_COUNTRY",
        "article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
        "article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
    },
    "pixabay" : {
        "api_key" : "$PIXABAY_API_KEY"
    }
 }
--- a/Docker/docker-compose.yml
+++ b/Docker/docker-compose.yml
@ -14,6 +14,7 @@ services:
      - NEWSAPI_COUNTRY=gb
      - ARTICLE_LIFETIME=6 # in hours
      - ARTICLE_INTERVAL=1 # in hours
      - PIXABAY_API_KEY=
  mongodb:
    container_name: notoric-snc-mongo
    image: mongo
--- a/Docker/generate-config.py
+++ b/Docker/generate-config.py
@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
    .replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
    .replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
    .replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
-    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL'])
+    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
    .replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY']) \
 # Write the config to a file
 with open('config.json', 'w') as config_file:
--- a/Docker/requirements.txt
+++ b/Docker/requirements.txt
--- a/Docker/snc.py
+++ b/Docker/snc.py
@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 groq_key = config['groq']['api_key']
 pixabayApiKey = config['pixabay']['api_key']
 # Connect to MongoDB
 print("Connecting to MongoDB...")
@ -125,7 +127,14 @@ def get_newsfeed(category='general'):
        article_data['author'] = article['author']
        article_data['category'] = category
        article_data['timestamp'] = datetime.now()
-        if (article['url'].contains("reuters.com") == False):
+
        if (article['url'].find("news.google") != -1):
            response = requests.get(article['url'])
            soup = BeautifulSoup(response.text, 'html.parser')
            htmlarticle = soup.find('article')
            if htmlarticle != None:
                if len(htmlarticle.text.strip()) > 250:
                    article_data['content'] = htmlarticle.text.strip()
                    articles.append(article_data)
    print("Newsfeed data retrieved!")
@ -134,8 +143,14 @@ def get_newsfeed(category='general'):
 # Get most interesting news articles with AI
 def get_interesting_news(articles):
    selected_articles = []
    if len(articles) <= 3:
        print("Not enough articles to select from! Using all articles...")
        selected_articles = articles
    else:
        print("Getting interesting news...")
    interesting_articles = []
        try:
            client = Groq(api_key=groq_key)
@ -187,8 +202,6 @@ def get_interesting_news(articles):
                    }
                }
    selected_articles = []
        article_index = [0, 1, 2]
        try:
            article_index[0] = response['most_interesting']['index']
@ -200,7 +213,6 @@ def get_interesting_news(articles):
            article_index = [0, 1, 2]
            print("Using default article selection...")
        for i in article_index:
            article = articles[i]
            selected_article = {}
@ -209,10 +221,93 @@ def get_interesting_news(articles):
            selected_article['url'] = article['url']
            selected_article['category'] = article['category']
            selected_article['timestamp'] = datetime.now()
            selected_article['content'] = article['content']
            selected_articles.append(selected_article)
        print("Interesting news retrieved!")
    # Get image & summary for all selected articles
    print("Getting images and summaries for selected articles...")
    for article in selected_articles:
        img_keywords = ""
        try:
            client = Groq(api_key=groq_key)
            completion = client.chat.completions.create(
                model="gemma-7b-it",
                messages=[
                    {
                        "role": "system",
                        "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
                    },
                    {
                        "role": "user",
                        "content": article['title']
                    }
                ],
                temperature=0.5,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            img_keywords = str(completion.choices[0].message.content)
            img_keywords = img_keywords[:99]
        except Exception as e:
            print("Could not get image keywords, using defaults...")
            img_keywords = article['category'] + " News article"
        try:
            image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
            image_data = image_response.json()
            article['image'] = image_data['hits'][0]['largeImageURL']
            print("Image found!")
        except Exception as e:
            try:
                img_keywords = img_keywords.split(" ")[0]
                image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
                image_data = image_response.json()
                article['image'] = image_data['hits'][0]['largeImageURL']
                print("Image found with shortened prompt!")
            except Exception as e:
                try:
                    image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
                    image_data = image_response.json()
                    article['image'] = image_data['hits'][0]['largeImageURL']
                    print("Image found using category!")
                except Exception as e:
                    article['image'] = "https://picsum.photos/800/600"
        summary = ""
        try:
            client = Groq(api_key=groq_key)
            completion = client.chat.completions.create(
                model="gemma-7b-it",
                messages=[
                    {
                        "role": "system",
                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
                    },
                    {
                        "role": "user",
                        "content": article['content']
                    }
                ],
                temperature=1.4,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            summary = str(completion.choices[0].message.content)
        except Exception as e:
            print(e)
            summary = "Read more about this article on the source website."
        article['summary'] = summary
    return selected_articles
 # Write newsfeed data to MongoDB
@ -238,9 +333,7 @@ def get_all_news():
 def delete_old_news():
    print("Deleting old news articles...")
-    hrs = int(config['news']['article_lifetime'])
+    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }})
    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
    print("Old news articles deleted!")
@ -249,7 +342,7 @@ def delete_old_news():
 create_collections()
 schedule.every(5).minutes.do(write_weather)
-schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)
+schedule.every(config['news']['article_interval']).hours.do(get_all_news)
 schedule.every(1).hours.do(delete_old_news)
 write_weather()
--- a/requirements.txt
+++ b/requirements.txt
--- a/snc.py
+++ b/snc.py
@ -4,6 +4,9 @@ import json
 import pymongo
 import requests
 import schedule
 import re
 import requests
 from bs4 import BeautifulSoup
 from groq import Groq
 from datetime import datetime, timedelta
@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
            "country" : "gb",
            "article_lifetime": 6,
            "article_interval": 1
        },
        "pixabay" : {
            "api_key" : ""
        }
    }
@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 groq_key = config['groq']['api_key']
 pixabayApiKey = config['pixabay']['api_key']
 # Connect to MongoDB
 print("Connecting to MongoDB...")
@ -149,19 +157,30 @@ def get_newsfeed(category='general'):
        article_data['author'] = article['author']
        article_data['category'] = category
        article_data['timestamp'] = datetime.now()
-        if (article['url'].contains("reuters.com") == False):
+
        if (article['url'].find("news.google") != -1):
            response = requests.get(article['url'])
            soup = BeautifulSoup(response.text, 'html.parser')
            htmlarticle = soup.find('article')
            if htmlarticle != None:
                if len(htmlarticle.text.strip()) > 250:
                    article_data['content'] = htmlarticle.text.strip()
                    articles.append(article_data)
    print("Newsfeed data retrieved!")
    return articles
 # Get most interesting news articles with AI
 def get_interesting_news(articles):
    selected_articles = []
    if len(articles) <= 3:
        print("Not enough articles to select from! Using all articles...")
        selected_articles = articles
    else:
        print("Getting interesting news...")
    interesting_articles = []
        try:
            client = Groq(api_key=groq_key)
@ -213,8 +232,6 @@ def get_interesting_news(articles):
                    }
                }
    selected_articles = []
        article_index = [0, 1, 2]
        try:
            article_index[0] = response['most_interesting']['index']
@ -226,7 +243,6 @@ def get_interesting_news(articles):
            article_index = [0, 1, 2]
            print("Using default article selection...")
        for i in article_index:
            article = articles[i]
            selected_article = {}
@ -235,10 +251,93 @@ def get_interesting_news(articles):
            selected_article['url'] = article['url']
            selected_article['category'] = article['category']
            selected_article['timestamp'] = datetime.now()
            selected_article['content'] = article['content']
            selected_articles.append(selected_article)
        print("Interesting news retrieved!")
    # Get image & summary for all selected articles
    print("Getting images and summaries for selected articles...")
    for article in selected_articles:
        img_keywords = ""
        try:
            client = Groq(api_key=groq_key)
            completion = client.chat.completions.create(
                model="gemma-7b-it",
                messages=[
                    {
                        "role": "system",
                        "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
                    },
                    {
                        "role": "user",
                        "content": article['title']
                    }
                ],
                temperature=0.5,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            img_keywords = str(completion.choices[0].message.content)
            img_keywords = img_keywords[:99]
        except Exception as e:
            print("Could not get image keywords, using defaults...")
            img_keywords = article['category'] + " News article"
        try:
            image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
            image_data = image_response.json()
            article['image'] = image_data['hits'][0]['largeImageURL']
            print("Image found!")
        except Exception as e:
            try:
                img_keywords = img_keywords.split(" ")[0]
                image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
                image_data = image_response.json()
                article['image'] = image_data['hits'][0]['largeImageURL']
                print("Image found with shortened prompt!")
            except Exception as e:
                try:
                    image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
                    image_data = image_response.json()
                    article['image'] = image_data['hits'][0]['largeImageURL']
                    print("Image found using category!")
                except Exception as e:
                    article['image'] = "https://picsum.photos/800/600"
        summary = ""
        try:
            client = Groq(api_key=groq_key)
            completion = client.chat.completions.create(
                model="gemma-7b-it",
                messages=[
                    {
                        "role": "system",
                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
                    },
                    {
                        "role": "user",
                        "content": article['content']
                    }
                ],
                temperature=1.4,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            summary = str(completion.choices[0].message.content)
        except Exception as e:
            print(e)
            summary = "Read more about this article on the source website."
        article['summary'] = summary
    return selected_articles
 # Write newsfeed data to MongoDB