From 105d14eebcec4e288250c4f360f1c22231efdde8 Mon Sep 17 00:00:00 2001 From: Notoric Date: Thu, 6 Jun 2024 17:40:03 +0100 Subject: [PATCH] Added AI generated Summaries and images --- Docker/config_template.json | 3 + Docker/docker-compose.yml | 1 + Docker/generate-config.py | 3 +- Docker/requirements.txt | Bin 720 -> 800 bytes Docker/snc.py | 247 ++++++++++++++++++++++++----------- requirements.txt | Bin 720 -> 800 bytes snc.py | 249 +++++++++++++++++++++++++----------- 7 files changed, 350 insertions(+), 153 deletions(-) diff --git a/Docker/config_template.json b/Docker/config_template.json index c805c1a..b973bce 100644 --- a/Docker/config_template.json +++ b/Docker/config_template.json @@ -16,5 +16,8 @@ "country" : "$NEWSAPI_COUNTRY", "article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME", "article_interval": "$NEWSAPI_ARTICLE_INTERVAL" + }, + "pixabay" : { + "api_key" : "$PIXABAY_API_KEY" } } \ No newline at end of file diff --git a/Docker/docker-compose.yml b/Docker/docker-compose.yml index 031fa35..7e5ba9b 100644 --- a/Docker/docker-compose.yml +++ b/Docker/docker-compose.yml @@ -14,6 +14,7 @@ services: - NEWSAPI_COUNTRY=gb - ARTICLE_LIFETIME=6 # in hours - ARTICLE_INTERVAL=1 # in hours + - PIXABAY_API_KEY= mongodb: container_name: notoric-snc-mongo image: mongo diff --git a/Docker/generate-config.py b/Docker/generate-config.py index 36cec46..25445a7 100644 --- a/Docker/generate-config.py +++ b/Docker/generate-config.py @@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \ .replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \ .replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \ .replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \ - .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) + .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \ + .replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY']) \ # Write the config to a file with open('config.json', 'w') as config_file: diff --git a/Docker/requirements.txt b/Docker/requirements.txt index 2e0139172d70b875e22b9247d2ebfdd093cdd974..014b2ccc948234a7b729649a21011826e7ee6dab 100644 GIT binary patch delta 86 zcmcb>x`1thpLP;MDnlYeDMJZECPNyK&S5BK$OqB|3?>Y=Kxo3C$6&}{1SE|&dKxlG X!(@wr>QfoYfEc6-ByT#olqnhjD^n4G delta 14 VcmZ3$c7b(*-^Mx}rpc$6!T>6P1#AES diff --git a/Docker/snc.py b/Docker/snc.py index e70b15a..d827b7b 100644 --- a/Docker/snc.py +++ b/Docker/snc.py @@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr groq_key = config['groq']['api_key'] +pixabayApiKey = config['pixabay']['api_key'] + # Connect to MongoDB print("Connecting to MongoDB...") @@ -125,8 +127,15 @@ def get_newsfeed(category='general'): article_data['author'] = article['author'] article_data['category'] = category article_data['timestamp'] = datetime.now() - if (article['url'].contains("reuters.com") == False): - articles.append(article_data) + + if (article['url'].find("news.google") != -1): + response = requests.get(article['url']) + soup = BeautifulSoup(response.text, 'html.parser') + htmlarticle = soup.find('article') + if htmlarticle != None: + if len(htmlarticle.text.strip()) > 250: + article_data['content'] = htmlarticle.text.strip() + articles.append(article_data) print("Newsfeed data retrieved!") return articles @@ -134,84 +143,170 @@ def get_newsfeed(category='general'): # Get most interesting news articles with AI def get_interesting_news(articles): - print("Getting interesting news...") - interesting_articles = [] - - try: - client = Groq(api_key=groq_key) - completion = client.chat.completions.create( - model="gemma-7b-it", - messages=[ - { - "role": "system", - "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}" - }, - { - "role": "user", - "content": str(articles) - } - ], - temperature=1.3, - max_tokens=1024, - top_p=1, - stream=False, - response_format={"type": "json_object"}, - stop=None, - ) - - response = str(completion.choices[0].message.content) - response = response.replace("\n", " ") - response = json.loads(response) - except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles - try: - response = e - response = response[18:] - response = json.loads(response) - response = response['error']['failed_generation'] - response = response.replace("\n", " ") - response = json.loads(response) - except: - print("Error selecting articles! Using random selection...") - response = { - "most_interesting": { - "index": 0, - "title": "Interesting" - }, - "second_most_interesting": { - "index": 1, - "title": "Interesting" - }, - "third_most_interesting": { - "index": 2, - "title": "Interesting" - } - } selected_articles = [] + + if len(articles) <= 3: + print("Not enough articles to select from! Using all articles...") + selected_articles = articles + else: + print("Getting interesting news...") + + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}" + }, + { + "role": "user", + "content": str(articles) + } + ], + temperature=1.3, + max_tokens=1024, + top_p=1, + stream=False, + response_format={"type": "json_object"}, + stop=None, + ) + + response = str(completion.choices[0].message.content) + response = response.replace("\n", " ") + response = json.loads(response) + except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles + try: + response = e + response = response[18:] + response = json.loads(response) + response = response['error']['failed_generation'] + response = response.replace("\n", " ") + response = json.loads(response) + except: + print("Error selecting articles! Using random selection...") + response = { + "most_interesting": { + "index": 0, + "title": "Interesting" + }, + "second_most_interesting": { + "index": 1, + "title": "Interesting" + }, + "third_most_interesting": { + "index": 2, + "title": "Interesting" + } + } - article_index = [0, 1, 2] - try: - article_index[0] = response['most_interesting']['index'] - article_index[1] = response['second_most_interesting']['index'] - article_index[2] = response['third_most_interesting']['index'] - print("Selected articles:" + str(article_index)) - except Exception as e: - print(e) article_index = [0, 1, 2] - print("Using default article selection...") + try: + article_index[0] = response['most_interesting']['index'] + article_index[1] = response['second_most_interesting']['index'] + article_index[2] = response['third_most_interesting']['index'] + print("Selected articles:" + str(article_index)) + except Exception as e: + print(e) + article_index = [0, 1, 2] + print("Using default article selection...") + for i in article_index: + article = articles[i] + selected_article = {} + selected_article['title'] = article['title'] + selected_article['author'] = article['author'] + selected_article['url'] = article['url'] + selected_article['category'] = article['category'] + selected_article['timestamp'] = datetime.now() + selected_article['content'] = article['content'] + selected_articles.append(selected_article) - for i in article_index: - article = articles[i] - selected_article = {} - selected_article['title'] = article['title'] - selected_article['author'] = article['author'] - selected_article['url'] = article['url'] - selected_article['category'] = article['category'] - selected_article['timestamp'] = datetime.now() - selected_articles.append(selected_article) + print("Interesting news retrieved!") - print("Interesting news retrieved!") + # Get image & summary for all selected articles + + print("Getting images and summaries for selected articles...") + + for article in selected_articles: + img_keywords = "" + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3" + }, + { + "role": "user", + "content": article['title'] + } + ], + temperature=0.5, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, + ) + + img_keywords = str(completion.choices[0].message.content) + img_keywords = img_keywords[:99] + except Exception as e: + print("Could not get image keywords, using defaults...") + img_keywords = article['category'] + " News article" + + try: + image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found!") + except Exception as e: + try: + img_keywords = img_keywords.split(" ")[0] + image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found with shortened prompt!") + except Exception as e: + try: + image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found using category!") + except Exception as e: + article['image'] = "https://picsum.photos/800/600" + + summary = "" + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone" + }, + { + "role": "user", + "content": article['content'] + } + ], + temperature=1.4, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, + ) + + summary = str(completion.choices[0].message.content) + except Exception as e: + print(e) + summary = "Read more about this article on the source website." + article['summary'] = summary return selected_articles @@ -238,9 +333,7 @@ def get_all_news(): def delete_old_news(): print("Deleting old news articles...") - hrs = int(config['news']['article_lifetime']) - - db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }}) + db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }}) print("Old news articles deleted!") @@ -249,7 +342,7 @@ def delete_old_news(): create_collections() schedule.every(5).minutes.do(write_weather) -schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news) +schedule.every(config['news']['article_interval']).hours.do(get_all_news) schedule.every(1).hours.do(delete_old_news) write_weather() diff --git a/requirements.txt b/requirements.txt index 2e0139172d70b875e22b9247d2ebfdd093cdd974..014b2ccc948234a7b729649a21011826e7ee6dab 100644 GIT binary patch delta 86 zcmcb>x`1thpLP;MDnlYeDMJZECPNyK&S5BK$OqB|3?>Y=Kxo3C$6&}{1SE|&dKxlG X!(@wr>QfoYfEc6-ByT#olqnhjD^n4G delta 14 VcmZ3$c7b(*-^Mx}rpc$6!T>6P1#AES diff --git a/snc.py b/snc.py index 87311ea..2dfbf12 100644 --- a/snc.py +++ b/snc.py @@ -4,6 +4,9 @@ import json import pymongo import requests import schedule +import re +import requests +from bs4 import BeautifulSoup from groq import Groq from datetime import datetime, timedelta @@ -37,6 +40,9 @@ if os.path.exists('config.json') == False: "country" : "gb", "article_lifetime": 6, "article_interval": 1 + }, + "pixabay" : { + "api_key" : "" } } @@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr groq_key = config['groq']['api_key'] +pixabayApiKey = config['pixabay']['api_key'] + # Connect to MongoDB print("Connecting to MongoDB...") @@ -149,10 +157,15 @@ def get_newsfeed(category='general'): article_data['author'] = article['author'] article_data['category'] = category article_data['timestamp'] = datetime.now() - if (article['url'].contains("reuters.com") == False): - articles.append(article_data) - - + + if (article['url'].find("news.google") != -1): + response = requests.get(article['url']) + soup = BeautifulSoup(response.text, 'html.parser') + htmlarticle = soup.find('article') + if htmlarticle != None: + if len(htmlarticle.text.strip()) > 250: + article_data['content'] = htmlarticle.text.strip() + articles.append(article_data) print("Newsfeed data retrieved!") return articles @@ -160,84 +173,170 @@ def get_newsfeed(category='general'): # Get most interesting news articles with AI def get_interesting_news(articles): - print("Getting interesting news...") - interesting_articles = [] - - try: - client = Groq(api_key=groq_key) - completion = client.chat.completions.create( - model="gemma-7b-it", - messages=[ - { - "role": "system", - "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}" - }, - { - "role": "user", - "content": str(articles) - } - ], - temperature=1.3, - max_tokens=1024, - top_p=1, - stream=False, - response_format={"type": "json_object"}, - stop=None, - ) - - response = str(completion.choices[0].message.content) - response = response.replace("\n", " ") - response = json.loads(response) - except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles - try: - response = e - response = response[18:] - response = json.loads(response) - response = response['error']['failed_generation'] - response = response.replace("\n", " ") - response = json.loads(response) - except: - print("Error selecting articles! Using random selection...") - response = { - "most_interesting": { - "index": 0, - "title": "Interesting" - }, - "second_most_interesting": { - "index": 1, - "title": "Interesting" - }, - "third_most_interesting": { - "index": 2, - "title": "Interesting" - } - } selected_articles = [] + + if len(articles) <= 3: + print("Not enough articles to select from! Using all articles...") + selected_articles = articles + else: + print("Getting interesting news...") + + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}" + }, + { + "role": "user", + "content": str(articles) + } + ], + temperature=1.3, + max_tokens=1024, + top_p=1, + stream=False, + response_format={"type": "json_object"}, + stop=None, + ) + + response = str(completion.choices[0].message.content) + response = response.replace("\n", " ") + response = json.loads(response) + except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles + try: + response = e + response = response[18:] + response = json.loads(response) + response = response['error']['failed_generation'] + response = response.replace("\n", " ") + response = json.loads(response) + except: + print("Error selecting articles! Using random selection...") + response = { + "most_interesting": { + "index": 0, + "title": "Interesting" + }, + "second_most_interesting": { + "index": 1, + "title": "Interesting" + }, + "third_most_interesting": { + "index": 2, + "title": "Interesting" + } + } - article_index = [0, 1, 2] - try: - article_index[0] = response['most_interesting']['index'] - article_index[1] = response['second_most_interesting']['index'] - article_index[2] = response['third_most_interesting']['index'] - print("Selected articles:" + str(article_index)) - except Exception as e: - print(e) article_index = [0, 1, 2] - print("Using default article selection...") + try: + article_index[0] = response['most_interesting']['index'] + article_index[1] = response['second_most_interesting']['index'] + article_index[2] = response['third_most_interesting']['index'] + print("Selected articles:" + str(article_index)) + except Exception as e: + print(e) + article_index = [0, 1, 2] + print("Using default article selection...") + for i in article_index: + article = articles[i] + selected_article = {} + selected_article['title'] = article['title'] + selected_article['author'] = article['author'] + selected_article['url'] = article['url'] + selected_article['category'] = article['category'] + selected_article['timestamp'] = datetime.now() + selected_article['content'] = article['content'] + selected_articles.append(selected_article) - for i in article_index: - article = articles[i] - selected_article = {} - selected_article['title'] = article['title'] - selected_article['author'] = article['author'] - selected_article['url'] = article['url'] - selected_article['category'] = article['category'] - selected_article['timestamp'] = datetime.now() - selected_articles.append(selected_article) + print("Interesting news retrieved!") - print("Interesting news retrieved!") + # Get image & summary for all selected articles + + print("Getting images and summaries for selected articles...") + + for article in selected_articles: + img_keywords = "" + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3" + }, + { + "role": "user", + "content": article['title'] + } + ], + temperature=0.5, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, + ) + + img_keywords = str(completion.choices[0].message.content) + img_keywords = img_keywords[:99] + except Exception as e: + print("Could not get image keywords, using defaults...") + img_keywords = article['category'] + " News article" + + try: + image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found!") + except Exception as e: + try: + img_keywords = img_keywords.split(" ")[0] + image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found with shortened prompt!") + except Exception as e: + try: + image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3") + image_data = image_response.json() + article['image'] = image_data['hits'][0]['largeImageURL'] + print("Image found using category!") + except Exception as e: + article['image'] = "https://picsum.photos/800/600" + + summary = "" + try: + client = Groq(api_key=groq_key) + completion = client.chat.completions.create( + model="gemma-7b-it", + messages=[ + { + "role": "system", + "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone" + }, + { + "role": "user", + "content": article['content'] + } + ], + temperature=1.4, + max_tokens=1024, + top_p=1, + stream=False, + stop=None, + ) + + summary = str(completion.choices[0].message.content) + except Exception as e: + print(e) + summary = "Read more about this article on the source website." + article['summary'] = summary return selected_articles