Added AI-generated summaries and images

This commit is contained in:
Notoric 2024-06-06 17:40:03 +01:00
parent 0c86bd74ef
commit 105d14eebc
7 changed files with 350 additions and 153 deletions

View File

@ -16,5 +16,8 @@
"country" : "$NEWSAPI_COUNTRY", "country" : "$NEWSAPI_COUNTRY",
"article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME", "article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
"article_interval": "$NEWSAPI_ARTICLE_INTERVAL" "article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
},
"pixabay" : {
"api_key" : "$PIXABAY_API_KEY"
} }
} }

View File

@ -14,6 +14,7 @@ services:
- NEWSAPI_COUNTRY=gb - NEWSAPI_COUNTRY=gb
- ARTICLE_LIFETIME=6 # in hours - ARTICLE_LIFETIME=6 # in hours
- ARTICLE_INTERVAL=1 # in hours - ARTICLE_INTERVAL=1 # in hours
- PIXABAY_API_KEY=
mongodb: mongodb:
container_name: notoric-snc-mongo container_name: notoric-snc-mongo
image: mongo image: mongo

View File

@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
.replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \ .replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
.replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \ .replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
.replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \ .replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
.replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) .replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
.replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY']) \
# Write the config to a file # Write the config to a file
with open('config.json', 'w') as config_file: with open('config.json', 'w') as config_file:

Binary file not shown.

View File

@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
groq_key = config['groq']['api_key'] groq_key = config['groq']['api_key']
pixabayApiKey = config['pixabay']['api_key']
# Connect to MongoDB # Connect to MongoDB
print("Connecting to MongoDB...") print("Connecting to MongoDB...")
@ -125,8 +127,15 @@ def get_newsfeed(category='general'):
article_data['author'] = article['author'] article_data['author'] = article['author']
article_data['category'] = category article_data['category'] = category
article_data['timestamp'] = datetime.now() article_data['timestamp'] = datetime.now()
if (article['url'].contains("reuters.com") == False):
articles.append(article_data) if (article['url'].find("news.google") != -1):
response = requests.get(article['url'])
soup = BeautifulSoup(response.text, 'html.parser')
htmlarticle = soup.find('article')
if htmlarticle != None:
if len(htmlarticle.text.strip()) > 250:
article_data['content'] = htmlarticle.text.strip()
articles.append(article_data)
print("Newsfeed data retrieved!") print("Newsfeed data retrieved!")
return articles return articles
@ -134,84 +143,170 @@ def get_newsfeed(category='general'):
# Get most interesting news articles with AI # Get most interesting news articles with AI
def get_interesting_news(articles): def get_interesting_news(articles):
print("Getting interesting news...")
interesting_articles = []
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
selected_articles = [] selected_articles = []
article_index = [0, 1, 2] if len(articles) <= 3:
try: print("Not enough articles to select from! Using all articles...")
article_index[0] = response['most_interesting']['index'] selected_articles = articles
article_index[1] = response['second_most_interesting']['index'] else:
article_index[2] = response['third_most_interesting']['index'] print("Getting interesting news...")
print("Selected articles:" + str(article_index))
except Exception as e: try:
print(e) client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
article_index = [0, 1, 2] article_index = [0, 1, 2]
print("Using default article selection...") try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_article['content'] = article['content']
selected_articles.append(selected_article)
for i in article_index: print("Interesting news retrieved!")
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_articles.append(selected_article)
print("Interesting news retrieved!") # Get image & summary for all selected articles
print("Getting images and summaries for selected articles...")
for article in selected_articles:
img_keywords = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
},
{
"role": "user",
"content": article['title']
}
],
temperature=0.5,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
img_keywords = str(completion.choices[0].message.content)
img_keywords = img_keywords[:99]
except Exception as e:
print("Could not get image keywords, using defaults...")
img_keywords = article['category'] + " News article"
try:
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found!")
except Exception as e:
try:
img_keywords = img_keywords.split(" ")[0]
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found with shortened prompt!")
except Exception as e:
try:
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found using category!")
except Exception as e:
article['image'] = "https://picsum.photos/800/600"
summary = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
},
{
"role": "user",
"content": article['content']
}
],
temperature=1.4,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
summary = str(completion.choices[0].message.content)
except Exception as e:
print(e)
summary = "Read more about this article on the source website."
article['summary'] = summary
return selected_articles return selected_articles
@ -238,9 +333,7 @@ def get_all_news():
def delete_old_news(): def delete_old_news():
print("Deleting old news articles...") print("Deleting old news articles...")
hrs = int(config['news']['article_lifetime']) db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }})
db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
print("Old news articles deleted!") print("Old news articles deleted!")
@ -249,7 +342,7 @@ def delete_old_news():
create_collections() create_collections()
schedule.every(5).minutes.do(write_weather) schedule.every(5).minutes.do(write_weather)
schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news) schedule.every(config['news']['article_interval']).hours.do(get_all_news)
schedule.every(1).hours.do(delete_old_news) schedule.every(1).hours.do(delete_old_news)
write_weather() write_weather()

Binary file not shown.

247
snc.py
View File

@ -4,6 +4,9 @@ import json
import pymongo import pymongo
import requests import requests
import schedule import schedule
import re
import requests
from bs4 import BeautifulSoup
from groq import Groq from groq import Groq
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
"country" : "gb", "country" : "gb",
"article_lifetime": 6, "article_lifetime": 6,
"article_interval": 1 "article_interval": 1
},
"pixabay" : {
"api_key" : ""
} }
} }
@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
groq_key = config['groq']['api_key'] groq_key = config['groq']['api_key']
pixabayApiKey = config['pixabay']['api_key']
# Connect to MongoDB # Connect to MongoDB
print("Connecting to MongoDB...") print("Connecting to MongoDB...")
@ -149,10 +157,15 @@ def get_newsfeed(category='general'):
article_data['author'] = article['author'] article_data['author'] = article['author']
article_data['category'] = category article_data['category'] = category
article_data['timestamp'] = datetime.now() article_data['timestamp'] = datetime.now()
if (article['url'].contains("reuters.com") == False):
articles.append(article_data)
if (article['url'].find("news.google") != -1):
response = requests.get(article['url'])
soup = BeautifulSoup(response.text, 'html.parser')
htmlarticle = soup.find('article')
if htmlarticle != None:
if len(htmlarticle.text.strip()) > 250:
article_data['content'] = htmlarticle.text.strip()
articles.append(article_data)
print("Newsfeed data retrieved!") print("Newsfeed data retrieved!")
return articles return articles
@ -160,84 +173,170 @@ def get_newsfeed(category='general'):
# Get most interesting news articles with AI # Get most interesting news articles with AI
def get_interesting_news(articles): def get_interesting_news(articles):
print("Getting interesting news...")
interesting_articles = []
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
selected_articles = [] selected_articles = []
article_index = [0, 1, 2] if len(articles) <= 3:
try: print("Not enough articles to select from! Using all articles...")
article_index[0] = response['most_interesting']['index'] selected_articles = articles
article_index[1] = response['second_most_interesting']['index'] else:
article_index[2] = response['third_most_interesting']['index'] print("Getting interesting news...")
print("Selected articles:" + str(article_index))
except Exception as e: try:
print(e) client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If ai doesnt return a valid response, check anyway, if not use the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
article_index = [0, 1, 2] article_index = [0, 1, 2]
print("Using default article selection...") try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_article['content'] = article['content']
selected_articles.append(selected_article)
for i in article_index: print("Interesting news retrieved!")
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_articles.append(selected_article)
print("Interesting news retrieved!") # Get image & summary for all selected articles
print("Getting images and summaries for selected articles...")
for article in selected_articles:
img_keywords = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
},
{
"role": "user",
"content": article['title']
}
],
temperature=0.5,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
img_keywords = str(completion.choices[0].message.content)
img_keywords = img_keywords[:99]
except Exception as e:
print("Could not get image keywords, using defaults...")
img_keywords = article['category'] + " News article"
try:
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found!")
except Exception as e:
try:
img_keywords = img_keywords.split(" ")[0]
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found with shortened prompt!")
except Exception as e:
try:
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found using category!")
except Exception as e:
article['image'] = "https://picsum.photos/800/600"
summary = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
},
{
"role": "user",
"content": article['content']
}
],
temperature=1.4,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
summary = str(completion.choices[0].message.content)
except Exception as e:
print(e)
summary = "Read more about this article on the source website."
article['summary'] = summary
return selected_articles return selected_articles