Added AI-generated summaries and images

Notoric 2024-06-06 17:40:03 +01:00
parent 0c86bd74ef
commit 105d14eebc
7 changed files with 350 additions and 153 deletions
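
In outline, the commit teaches the feed generator to ask Groq's gemma-7b-it for a few image keywords per headline, look those keywords up on Pixabay, and fall back to progressively vaguer queries (and finally a stock placeholder) when nothing matches; it also has the model write a spoken-style summary of each selected article. A condensed sketch of the image-lookup fallback chain as read from the diff below (the function name and parameters are illustrative assumptions, not code from the commit):

    import requests

    PIXABAY_URL = "https://pixabay.com/api/"

    def find_image(keywords: str, category: str, api_key: str) -> str:
        """Try progressively vaguer Pixabay queries, then a placeholder."""
        # Pixabay requires per_page >= 3; 'hits' is empty when nothing matches,
        # so the [0] lookup raises IndexError and we move to the next query.
        for query in (keywords, keywords.split(" ")[0], f"{category} news"):
            try:
                resp = requests.get(PIXABAY_URL, params={
                    "q": query, "key": api_key,
                    "orientation": "horizontal", "per_page": 3,
                }, timeout=10)
                return resp.json()["hits"][0]["largeImageURL"]
            except Exception:
                continue  # network error or no hits -> try the next query
        return "https://picsum.photos/800/600"  # final placeholder, as in the diff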


@@ -16,5 +16,8 @@
     "country" : "$NEWSAPI_COUNTRY",
     "article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
     "article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
+  },
+  "pixabay" : {
+    "api_key" : "$PIXABAY_API_KEY"
   }
 }


@@ -14,6 +14,7 @@ services:
       - NEWSAPI_COUNTRY=gb
       - ARTICLE_LIFETIME=6 # in hours
       - ARTICLE_INTERVAL=1 # in hours
+      - PIXABAY_API_KEY=
   mongodb:
     container_name: notoric-snc-mongo
     image: mongo


@@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
     .replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
     .replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
     .replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
-    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL'])
+    .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
+    .replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY'])
 
 # Write the config to a file
 with open('config.json', 'w') as config_file:
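
Since the template's placeholders are already in $VAR form, the same rendering could be expressed with Python's string.Template, which substitutes every variable in one pass. A minimal sketch of that alternative (the template filename is an assumption; the variable set is read off the diff above):

    import os
    from string import Template

    # Hypothetical equivalent of the chained .replace() calls above:
    # string.Template substitutes $-style placeholders in one pass.
    with open('config.json.template') as template_file:  # assumed template path
        rendered = Template(template_file.read()).safe_substitute(
            MONGO_HOST=os.environ['MONGO_HOST'],
            NEWSAPI_API_KEY=os.environ['NEWSAPI_API_KEY'],
            NEWSAPI_COUNTRY=os.environ['NEWSAPI_COUNTRY'],
            NEWSAPI_ARTICLE_LIFETIME=os.environ['ARTICLE_LIFETIME'],
            NEWSAPI_ARTICLE_INTERVAL=os.environ['ARTICLE_INTERVAL'],
            PIXABAY_API_KEY=os.environ['PIXABAY_API_KEY'],
        )

    with open('config.json', 'w') as config_file:
        config_file.write(rendered)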

Binary file not shown.


@@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 groq_key = config['groq']['api_key']
+
+pixabayApiKey = config['pixabay']['api_key']
 
 # Connect to MongoDB
 print("Connecting to MongoDB...")
@@ -125,7 +127,14 @@ def get_newsfeed(category='general'):
         article_data['author'] = article['author']
         article_data['category'] = category
         article_data['timestamp'] = datetime.now()
+        if "reuters.com" not in article['url']:
+            if "news.google" in article['url']:
+                response = requests.get(article['url'])
+                soup = BeautifulSoup(response.text, 'html.parser')
+                htmlarticle = soup.find('article')
+                if htmlarticle is not None and len(htmlarticle.text.strip()) > 250:
+                    article_data['content'] = htmlarticle.text.strip()
 
         articles.append(article_data)
 
     print("Newsfeed data retrieved!")
@@ -134,8 +143,14 @@ def get_newsfeed(category='general'):
 
 # Get most interesting news articles with AI
 def get_interesting_news(articles):
-    print("Getting interesting news...")
+    selected_articles = []
+    if len(articles) <= 3:
+        print("Not enough articles to select from! Using all articles...")
+        selected_articles = articles
+    else:
+        print("Getting interesting news...")
+    interesting_articles = []
 
     try:
         client = Groq(api_key=groq_key)
@@ -187,8 +202,6 @@ def get_interesting_news(articles):
             }
         }
 
-    selected_articles = []
-
     article_index = [0, 1, 2]
     try:
         article_index[0] = response['most_interesting']['index']
@@ -200,7 +213,6 @@ def get_interesting_news(articles):
         article_index = [0, 1, 2]
         print("Using default article selection...")
 
-
     for i in article_index:
         article = articles[i]
         selected_article = {}
@@ -209,10 +221,93 @@ def get_interesting_news(articles):
         selected_article['url'] = article['url']
         selected_article['category'] = article['category']
         selected_article['timestamp'] = datetime.now()
+        selected_article['content'] = article['content']
         selected_articles.append(selected_article)
 
     print("Interesting news retrieved!")
 
+    # Get image & summary for all selected articles
+    print("Getting images and summaries for selected articles...")
+    for article in selected_articles:
+        img_keywords = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given a title for an article. Provide a few keywords (around 3 maximum; please only use short, vague and common words) for an image that would match the article (less than 50 characters), in the following format: keyword1 keyword2 keyword3"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['title']
+                    }
+                ],
+                temperature=0.5,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+            img_keywords = str(completion.choices[0].message.content)
+            img_keywords = img_keywords[:99]
+        except Exception:
+            print("Could not get image keywords, using defaults...")
+            img_keywords = article['category'] + " News article"
+
+        try:
+            image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+            image_data = image_response.json()
+            article['image'] = image_data['hits'][0]['largeImageURL']
+            print("Image found!")
+        except Exception:
+            try:
+                img_keywords = img_keywords.split(" ")[0]
+                image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+                image_data = image_response.json()
+                article['image'] = image_data['hits'][0]['largeImageURL']
+                print("Image found with shortened prompt!")
+            except Exception:
+                try:
+                    image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+                    image_data = image_response.json()
+                    article['image'] = image_data['hits'][0]['largeImageURL']
+                    print("Image found using category!")
+                except Exception:
+                    article['image'] = "https://picsum.photos/800/600"
+
+        summary = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the article's content, as a radio announcer would read it out. Assume I know nothing about the subject of the article, so you will need to provide context, and your summary should work as a standalone article. Make sure the summary uses spoken language and is easy to read and understand for everyone."
+                    },
+                    {
+                        "role": "user",
+                        "content": article['content']
+                    }
+                ],
+                temperature=1.4,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+            summary = str(completion.choices[0].message.content)
+        except Exception as e:
+            print(e)
+            summary = "Read more about this article on the source website."
+        article['summary'] = summary
+
     return selected_articles
 
 # Write newsfeed data to MongoDB
@@ -238,9 +333,7 @@ def get_all_news():
 
 def delete_old_news():
     print("Deleting old news articles...")
-    hrs = int(config['news']['article_lifetime'])
-    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
+    db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }})
     print("Old news articles deleted!")
@@ -249,7 +342,7 @@ def delete_old_news():
 create_collections()
 
 schedule.every(5).minutes.do(write_weather)
-schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)
+schedule.every(config['news']['article_interval']).hours.do(get_all_news)
 schedule.every(1).hours.do(delete_old_news)
 
 write_weather()
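
One caveat on the two hunks above: the new lines read article_lifetime and article_interval straight from the config, but a config.json generated from the template stores both values as quoted strings, so timedelta(hours="6") raises a TypeError at runtime. Coercing once at load time would cover both the quoted template values and the integer defaults; a minimal sketch (an assumed hardening, not part of the commit):

    from datetime import datetime, timedelta

    # Normalise the config values once after loading, so "6" (from the
    # template) and 6 (from the built-in defaults) behave identically.
    config['news']['article_lifetime'] = int(config['news']['article_lifetime'])
    config['news']['article_interval'] = int(config['news']['article_interval'])

    cutoff = datetime.now() - timedelta(hours=config['news']['article_lifetime'])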

Binary file not shown.

snc.py

@@ -4,6 +4,9 @@ import json
 import pymongo
 import requests
 import schedule
+import re
+from bs4 import BeautifulSoup
 from groq import Groq
 from datetime import datetime, timedelta
@@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
         "country" : "gb",
         "article_lifetime": 6,
         "article_interval": 1
+    },
+    "pixabay" : {
+        "api_key" : ""
     }
 }
@@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
 groq_key = config['groq']['api_key']
+
+pixabayApiKey = config['pixabay']['api_key']
 
 # Connect to MongoDB
 print("Connecting to MongoDB...")
@@ -149,19 +157,30 @@ def get_newsfeed(category='general'):
         article_data['author'] = article['author']
         article_data['category'] = category
         article_data['timestamp'] = datetime.now()
+        if "reuters.com" not in article['url']:
+            if "news.google" in article['url']:
+                response = requests.get(article['url'])
+                soup = BeautifulSoup(response.text, 'html.parser')
+                htmlarticle = soup.find('article')
+                if htmlarticle is not None and len(htmlarticle.text.strip()) > 250:
+                    article_data['content'] = htmlarticle.text.strip()
 
         articles.append(article_data)
 
     print("Newsfeed data retrieved!")
     return articles
 
 # Get most interesting news articles with AI
 def get_interesting_news(articles):
-    print("Getting interesting news...")
+    selected_articles = []
+    if len(articles) <= 3:
+        print("Not enough articles to select from! Using all articles...")
+        selected_articles = articles
+    else:
+        print("Getting interesting news...")
+    interesting_articles = []
 
     try:
         client = Groq(api_key=groq_key)
@@ -213,8 +232,6 @@ def get_interesting_news(articles):
             }
         }
 
-    selected_articles = []
-
     article_index = [0, 1, 2]
     try:
         article_index[0] = response['most_interesting']['index']
@@ -226,7 +243,6 @@ def get_interesting_news(articles):
         article_index = [0, 1, 2]
         print("Using default article selection...")
 
-
     for i in article_index:
         article = articles[i]
         selected_article = {}
@@ -235,10 +251,93 @@ def get_interesting_news(articles):
         selected_article['url'] = article['url']
         selected_article['category'] = article['category']
         selected_article['timestamp'] = datetime.now()
+        selected_article['content'] = article['content']
         selected_articles.append(selected_article)
 
     print("Interesting news retrieved!")
 
+    # Get image & summary for all selected articles
+    print("Getting images and summaries for selected articles...")
+    for article in selected_articles:
+        img_keywords = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given a title for an article. Provide a few keywords (around 3 maximum; please only use short, vague and common words) for an image that would match the article (less than 50 characters), in the following format: keyword1 keyword2 keyword3"
+                    },
+                    {
+                        "role": "user",
+                        "content": article['title']
+                    }
+                ],
+                temperature=0.5,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+            img_keywords = str(completion.choices[0].message.content)
+            img_keywords = img_keywords[:99]
+        except Exception:
+            print("Could not get image keywords, using defaults...")
+            img_keywords = article['category'] + " News article"
+
+        try:
+            image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+            image_data = image_response.json()
+            article['image'] = image_data['hits'][0]['largeImageURL']
+            print("Image found!")
+        except Exception:
+            try:
+                img_keywords = img_keywords.split(" ")[0]
+                image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+                image_data = image_response.json()
+                article['image'] = image_data['hits'][0]['largeImageURL']
+                print("Image found with shortened prompt!")
+            except Exception:
+                try:
+                    image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
+                    image_data = image_response.json()
+                    article['image'] = image_data['hits'][0]['largeImageURL']
+                    print("Image found using category!")
+                except Exception:
+                    article['image'] = "https://picsum.photos/800/600"
+
+        summary = ""
+        try:
+            client = Groq(api_key=groq_key)
+            completion = client.chat.completions.create(
+                model="gemma-7b-it",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the article's content, as a radio announcer would read it out. Assume I know nothing about the subject of the article, so you will need to provide context, and your summary should work as a standalone article. Make sure the summary uses spoken language and is easy to read and understand for everyone."
+                    },
+                    {
+                        "role": "user",
+                        "content": article['content']
+                    }
+                ],
+                temperature=1.4,
+                max_tokens=1024,
+                top_p=1,
+                stream=False,
+                stop=None,
+            )
+            summary = str(completion.choices[0].message.content)
+        except Exception as e:
+            print(e)
+            summary = "Read more about this article on the source website."
+        article['summary'] = summary
+
     return selected_articles
 
 # Write newsfeed data to MongoDB
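
For a quick manual check of the new path, the functions from this diff can be chained roughly like this (a smoke-test sketch, not part of the commit; the category value is an arbitrary example):

    # Fetch headlines, let the model pick three, then inspect the results.
    articles = get_newsfeed('technology')        # NewsAPI metadata + scraped content
    selected = get_interesting_news(articles)    # AI selection, image and summary
    for article in selected:
        print(article['title'])
        print(article['image'])
        print(article['summary'])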