From 105d14eebcec4e288250c4f360f1c22231efdde8 Mon Sep 17 00:00:00 2001
From: Notoric <tom@acornes.com>
Date: Thu, 6 Jun 2024 17:40:03 +0100
Subject: [PATCH] Added AI-generated summaries and images

---
 Docker/config_template.json |   3 +
 Docker/docker-compose.yml   |   1 +
 Docker/generate-config.py   |   3 +-
 Docker/requirements.txt     | Bin 720 -> 800 bytes
 Docker/snc.py               | 247 ++++++++++++++++++++++++-----------
 requirements.txt            | Bin 720 -> 800 bytes
 snc.py                      | 249 +++++++++++++++++++++++++-----------
 7 files changed, 350 insertions(+), 153 deletions(-)

diff --git a/Docker/config_template.json b/Docker/config_template.json
index c805c1a..b973bce 100644
--- a/Docker/config_template.json
+++ b/Docker/config_template.json
@@ -16,5 +16,8 @@
         "country" : "$NEWSAPI_COUNTRY",
         "article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
         "article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
+    },
+    "pixabay" : {
+        "api_key" : "$PIXABAY_API_KEY"
     }
 }
\ No newline at end of file
diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml
index 031fa35..7e5ba9b 100644
--- a/Docker/docker-compose.yml
+++ b/Docker/docker-compose.yml
@@ -14,6 +14,7 @@ services:
       - NEWSAPI_COUNTRY=gb
       - ARTICLE_LIFETIME=6 # in hours
       - ARTICLE_INTERVAL=1 # in hours
+      - PIXABAY_API_KEY=
   mongodb:
     container_name: notoric-snc-mongo
     image: mongo
diff --git a/Docker/generate-config.py b/Docker/generate-config.py
index 36cec46..25445a7 100644
--- a/Docker/generate-config.py
+++ b/Docker/generate-config.py
@@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
     .replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
     .replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
     .replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
-    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL'])
+    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
+    .replace('$PIXABAY_API_KEY', os.environ.get('PIXABAY_API_KEY', ''))  # optional key: empty when unset; no dangling line continuation
 
 # Write the config to a file
 with open('config.json', 'w') as config_file:
diff --git a/Docker/requirements.txt b/Docker/requirements.txt
index 2e0139172d70b875e22b9247d2ebfdd093cdd974..014b2ccc948234a7b729649a21011826e7ee6dab 100644
GIT binary patch
delta 86
zcmcb>x`1thpLP;MDnlYeDMJZECPNyK&S5BK$OqB|3?>Y=Kxo3C$6&}{1SE|&dKxlG
X!(@wr>QfoYfEc6-ByT#olqnhjD^n4G

delta 14
VcmZ3$c7b(*-^Mx}rpc$6!T>6P1#AES

diff --git a/Docker/snc.py b/Docker/snc.py
index e70b15a..d827b7b 100644
--- a/Docker/snc.py
+++ b/Docker/snc.py
@@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 
 groq_key = config['groq']['api_key']
 
+pixabayApiKey = config['pixabay']['api_key']
+
 # Connect to MongoDB
 print("Connecting to MongoDB...")
 
@@ -125,8 +127,15 @@ def get_newsfeed(category='general'):
         article_data['author'] = article['author']
         article_data['category'] = category
         article_data['timestamp'] = datetime.now()
-        if (article['url'].contains("reuters.com") == False):
-            articles.append(article_data)
+
+        if "news.google" in article['url']:  # only Google News links are fetched for their body text
+            response = requests.get(article['url'])
+            soup = BeautifulSoup(response.text, 'html.parser')
+            htmlarticle = soup.find('article')
+            article_text = htmlarticle.text.strip() if htmlarticle is not None else ""
+            if len(article_text) > 250:  # skip pages with no <article> element or too little text
+                article_data['content'] = article_text
+                articles.append(article_data)
 
     print("Newsfeed data retrieved!")
     return articles
@@ -134,84 +143,170 @@ def get_newsfeed(category='general'):
 # Get most interesting news articles with AI
 
 def get_interesting_news(articles):
-    print("Getting interesting news...")
-    interesting_articles = []
-
-    try:
-        client = Groq(api_key=groq_key)
-        completion = client.chat.completions.create(
-            model="gemma-7b-it",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
-                },
-                {
-                    "role": "user",
-                    "content": str(articles)
-                }
-            ],
-            temperature=1.3,
-            max_tokens=1024,
-            top_p=1,
-            stream=False,
-            response_format={"type": "json_object"},
-            stop=None,
-        )
-
-        response = str(completion.choices[0].message.content)
-        response = response.replace("\n", " ")
-        response = json.loads(response)
-    except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
-        try:
-            response = e
-            response = response[18:]
-            response = json.loads(response)
-            response = response['error']['failed_generation']
-            response = response.replace("\n", " ")
-            response = json.loads(response)
-        except:
-            print("Error selecting articles! Using random selection...")
-            response = {
-                "most_interesting": {
-                    "index": 0,
-                    "title": "Interesting"
-                },
-                "second_most_interesting": {
-                    "index": 1,
-                    "title": "Interesting"
-                },
-                "third_most_interesting": {
-                    "index": 2,
-                    "title": "Interesting"
-                }
-            }
 
     selected_articles = []
+    
+    if len(articles) <= 3:
+        print("Not enough articles to select from! Using all articles...")
+        selected_articles = articles
+    else:
+        print("Getting interesting news...")
+
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
+                    },
+                    {
+                        "role": "user",
+                        "content": str(articles)
+                    }
+                ],
+                temperature=1.3,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                response_format={"type": "json_object"},
+                stop=None,
+            )
+
+            response = str(completion.choices[0].message.content)
+            response = response.replace("\n", " ")
+            response = json.loads(response)
+        except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
+            try:
+                response = str(e)  # exceptions are not subscriptable; slice the message text instead
+                response = response[18:]
+                response = json.loads(response)
+                response = response['error']['failed_generation']
+                response = response.replace("\n", " ")
+                response = json.loads(response)
+            except:
+                print("Error selecting articles! Using random selection...")
+                response = {
+                    "most_interesting": {
+                        "index": 0,
+                        "title": "Interesting"
+                    },
+                    "second_most_interesting": {
+                        "index": 1,
+                        "title": "Interesting"
+                    },
+                    "third_most_interesting": {
+                        "index": 2,
+                        "title": "Interesting"
+                    }
+                }
 
-    article_index = [0, 1, 2]
-    try:
-        article_index[0] = response['most_interesting']['index']
-        article_index[1] = response['second_most_interesting']['index']
-        article_index[2] = response['third_most_interesting']['index']
-        print("Selected articles:" + str(article_index))
-    except Exception as e:
-        print(e)
         article_index = [0, 1, 2]
-        print("Using default article selection...")
+        try:
+            article_index = [int(response[key]['index']) for key in ('most_interesting', 'second_most_interesting', 'third_most_interesting')]
+            if not all(0 <= idx < len(articles) for idx in article_index):  # the model may return indexes that do not exist
+                raise IndexError(f"AI returned out-of-range article index: {article_index}")  # handled below: falls back to defaults
+            print("Selected articles:" + str(article_index))
+        except Exception as e:
+            print(e)
+            article_index = [0, 1, 2]
+            print("Using default article selection...")
 
+        for i in article_index:
+            article = articles[i]
+            selected_article = {}
+            selected_article['title'] = article['title']
+            selected_article['author'] = article['author']
+            selected_article['url'] = article['url']
+            selected_article['category'] = article['category']
+            selected_article['timestamp'] = datetime.now()
+            selected_article['content'] = article['content']
+            selected_articles.append(selected_article)
 
-    for i in article_index:
-        article = articles[i]
-        selected_article = {}
-        selected_article['title'] = article['title']
-        selected_article['author'] = article['author']
-        selected_article['url'] = article['url']
-        selected_article['category'] = article['category']
-        selected_article['timestamp'] = datetime.now()
-        selected_articles.append(selected_article)
+        print("Interesting news retrieved!")
 
-    print("Interesting news retrieved!")
+    # Get image & summary for all selected articles
+
+    print("Getting images and summaries for selected articles...")
+
+    for article in selected_articles:
+        img_keywords = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['title']
+                    }
+                ],
+                temperature=0.5,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+
+            img_keywords = str(completion.choices[0].message.content)
+            img_keywords = img_keywords[:99]
+        except Exception as e:
+            print("Could not get image keywords, using defaults...")
+            img_keywords = article['category'] + " News article"
+        
+        try:  # params= URL-encodes the AI-generated keywords; timeout keeps a stalled request from blocking the loop
+            image_response = requests.get("https://pixabay.com/api/", params={"q": img_keywords.strip(), "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+            image_data = image_response.json()
+            article['image'] = image_data['hits'][0]['largeImageURL']
+            print("Image found!")
+        except Exception as e:
+            try:
+                img_keywords = img_keywords.split()[0]  # first real keyword; raises IndexError (-> category fallback) if there is none
+                image_response = requests.get("https://pixabay.com/api/", params={"q": img_keywords, "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+                image_data = image_response.json()
+                article['image'] = image_data['hits'][0]['largeImageURL']
+                print("Image found with shortened prompt!")
+            except Exception as e:
+                try:
+                    image_response = requests.get("https://pixabay.com/api/", params={"q": f"{article['category']} news", "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+                    image_data = image_response.json()
+                    article['image'] = image_data['hits'][0]['largeImageURL']
+                    print("Image found using category!")
+                except Exception as e:
+                    article['image'] = "https://picsum.photos/800/600"
+
+        summary = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['content']
+                    }
+                ],
+                temperature=1.4,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+
+            summary = str(completion.choices[0].message.content)
+        except Exception as e:
+            print(e)
+            summary = "Read more about this article on the source website."
+        article['summary'] = summary
 
     return selected_articles
 
@@ -238,9 +333,7 @@ def get_all_news():
 def delete_old_news():
     print("Deleting old news articles...")
 
-    hrs = int(config['news']['article_lifetime'])
-
-    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
+    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=int(config['news']['article_lifetime'])) }})  # int(): the Docker config template stores this value as a quoted string
 
     print("Old news articles deleted!")
 
@@ -249,7 +342,7 @@ def delete_old_news():
 create_collections()
 
 schedule.every(5).minutes.do(write_weather)
-schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)
+schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)  # int(): the Docker config template stores this value as a quoted string
 schedule.every(1).hours.do(delete_old_news)
 
 write_weather()
diff --git a/requirements.txt b/requirements.txt
index 2e0139172d70b875e22b9247d2ebfdd093cdd974..014b2ccc948234a7b729649a21011826e7ee6dab 100644
GIT binary patch
delta 86
zcmcb>x`1thpLP;MDnlYeDMJZECPNyK&S5BK$OqB|3?>Y=Kxo3C$6&}{1SE|&dKxlG
X!(@wr>QfoYfEc6-ByT#olqnhjD^n4G

delta 14
VcmZ3$c7b(*-^Mx}rpc$6!T>6P1#AES

diff --git a/snc.py b/snc.py
index 87311ea..2dfbf12 100644
--- a/snc.py
+++ b/snc.py
@@ -4,6 +4,9 @@ import json
 import pymongo
 import requests
 import schedule
+import re
+import requests
+from bs4 import BeautifulSoup
 from groq import Groq
 from datetime import datetime, timedelta
 
@@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
             "country" : "gb",
             "article_lifetime": 6,
             "article_interval": 1
+        },
+        "pixabay" : {
+            "api_key" : ""
         }
     }
 
@@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 
 groq_key = config['groq']['api_key']
 
+pixabayApiKey = config['pixabay']['api_key']
+
 # Connect to MongoDB
 print("Connecting to MongoDB...")
 
@@ -149,10 +157,15 @@ def get_newsfeed(category='general'):
         article_data['author'] = article['author']
         article_data['category'] = category
         article_data['timestamp'] = datetime.now()
-        if (article['url'].contains("reuters.com") == False):
-            articles.append(article_data)
-        
-        
+
+        if "news.google" in article['url']:  # only Google News links are fetched for their body text
+            response = requests.get(article['url'])
+            soup = BeautifulSoup(response.text, 'html.parser')
+            htmlarticle = soup.find('article')
+            article_text = htmlarticle.text.strip() if htmlarticle is not None else ""
+            if len(article_text) > 250:  # skip pages with no <article> element or too little text
+                article_data['content'] = article_text
+                articles.append(article_data)
 
     print("Newsfeed data retrieved!")
     return articles
@@ -160,84 +173,170 @@ def get_newsfeed(category='general'):
 # Get most interesting news articles with AI
 
 def get_interesting_news(articles):
-    print("Getting interesting news...")
-    interesting_articles = []
-
-    try:
-        client = Groq(api_key=groq_key)
-        completion = client.chat.completions.create(
-            model="gemma-7b-it",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
-                },
-                {
-                    "role": "user",
-                    "content": str(articles)
-                }
-            ],
-            temperature=1.3,
-            max_tokens=1024,
-            top_p=1,
-            stream=False,
-            response_format={"type": "json_object"},
-            stop=None,
-        )
-
-        response = str(completion.choices[0].message.content)
-        response = response.replace("\n", " ")
-        response = json.loads(response)
-    except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
-        try:
-            response = e
-            response = response[18:]
-            response = json.loads(response)
-            response = response['error']['failed_generation']
-            response = response.replace("\n", " ")
-            response = json.loads(response)
-        except:
-            print("Error selecting articles! Using random selection...")
-            response = {
-                "most_interesting": {
-                    "index": 0,
-                    "title": "Interesting"
-                },
-                "second_most_interesting": {
-                    "index": 1,
-                    "title": "Interesting"
-                },
-                "third_most_interesting": {
-                    "index": 2,
-                    "title": "Interesting"
-                }
-            }
 
     selected_articles = []
+    
+    if len(articles) <= 3:
+        print("Not enough articles to select from! Using all articles...")
+        selected_articles = articles
+    else:
+        print("Getting interesting news...")
+
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
+                    },
+                    {
+                        "role": "user",
+                        "content": str(articles)
+                    }
+                ],
+                temperature=1.3,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                response_format={"type": "json_object"},
+                stop=None,
+            )
+
+            response = str(completion.choices[0].message.content)
+            response = response.replace("\n", " ")
+            response = json.loads(response)
+        except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
+            try:
+                response = str(e)  # exceptions are not subscriptable; slice the message text instead
+                response = response[18:]
+                response = json.loads(response)
+                response = response['error']['failed_generation']
+                response = response.replace("\n", " ")
+                response = json.loads(response)
+            except:
+                print("Error selecting articles! Using random selection...")
+                response = {
+                    "most_interesting": {
+                        "index": 0,
+                        "title": "Interesting"
+                    },
+                    "second_most_interesting": {
+                        "index": 1,
+                        "title": "Interesting"
+                    },
+                    "third_most_interesting": {
+                        "index": 2,
+                        "title": "Interesting"
+                    }
+                }
 
-    article_index = [0, 1, 2]
-    try:
-        article_index[0] = response['most_interesting']['index']
-        article_index[1] = response['second_most_interesting']['index']
-        article_index[2] = response['third_most_interesting']['index']
-        print("Selected articles:" + str(article_index))
-    except Exception as e:
-        print(e)
         article_index = [0, 1, 2]
-        print("Using default article selection...")
+        try:
+            article_index = [int(response[key]['index']) for key in ('most_interesting', 'second_most_interesting', 'third_most_interesting')]
+            if not all(0 <= idx < len(articles) for idx in article_index):  # the model may return indexes that do not exist
+                raise IndexError(f"AI returned out-of-range article index: {article_index}")  # handled below: falls back to defaults
+            print("Selected articles:" + str(article_index))
+        except Exception as e:
+            print(e)
+            article_index = [0, 1, 2]
+            print("Using default article selection...")
 
+        for i in article_index:
+            article = articles[i]
+            selected_article = {}
+            selected_article['title'] = article['title']
+            selected_article['author'] = article['author']
+            selected_article['url'] = article['url']
+            selected_article['category'] = article['category']
+            selected_article['timestamp'] = datetime.now()
+            selected_article['content'] = article['content']
+            selected_articles.append(selected_article)
 
-    for i in article_index:
-        article = articles[i]
-        selected_article = {}
-        selected_article['title'] = article['title']
-        selected_article['author'] = article['author']
-        selected_article['url'] = article['url']
-        selected_article['category'] = article['category']
-        selected_article['timestamp'] = datetime.now()
-        selected_articles.append(selected_article)
+        print("Interesting news retrieved!")
 
-    print("Interesting news retrieved!")
+    # Get image & summary for all selected articles
+
+    print("Getting images and summaries for selected articles...")
+
+    for article in selected_articles:
+        img_keywords = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['title']
+                    }
+                ],
+                temperature=0.5,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+
+            img_keywords = str(completion.choices[0].message.content)
+            img_keywords = img_keywords[:99]
+        except Exception as e:
+            print("Could not get image keywords, using defaults...")
+            img_keywords = article['category'] + " News article"
+        
+        try:  # params= URL-encodes the AI-generated keywords; timeout keeps a stalled request from blocking the loop
+            image_response = requests.get("https://pixabay.com/api/", params={"q": img_keywords.strip(), "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+            image_data = image_response.json()
+            article['image'] = image_data['hits'][0]['largeImageURL']
+            print("Image found!")
+        except Exception as e:
+            try:
+                img_keywords = img_keywords.split()[0]  # first real keyword; raises IndexError (-> category fallback) if there is none
+                image_response = requests.get("https://pixabay.com/api/", params={"q": img_keywords, "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+                image_data = image_response.json()
+                article['image'] = image_data['hits'][0]['largeImageURL']
+                print("Image found with shortened prompt!")
+            except Exception as e:
+                try:
+                    image_response = requests.get("https://pixabay.com/api/", params={"q": f"{article['category']} news", "key": pixabayApiKey, "orientation": "horizontal", "per_page": 3}, timeout=10)
+                    image_data = image_response.json()
+                    article['image'] = image_data['hits'][0]['largeImageURL']
+                    print("Image found using category!")
+                except Exception as e:
+                    article['image'] = "https://picsum.photos/800/600"
+
+        summary = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['content']
+                    }
+                ],
+                temperature=1.4,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+
+            summary = str(completion.choices[0].message.content)
+        except Exception as e:
+            print(e)
+            summary = "Read more about this article on the source website."
+        article['summary'] = summary
 
     return selected_articles