Added AI-generated summaries and images

Notoric 2024-06-06 17:40:03 +01:00
parent 0c86bd74ef
commit 105d14eebc
7 changed files with 350 additions and 153 deletions


@@ -16,5 +16,8 @@
"country" : "$NEWSAPI_COUNTRY",
"article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
"article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
},
"pixabay" : {
"api_key" : "$PIXABAY_API_KEY"
}
}


@@ -14,6 +14,7 @@ services:
- NEWSAPI_COUNTRY=gb
- ARTICLE_LIFETIME=6 # in hours
- ARTICLE_INTERVAL=1 # in hours
- PIXABAY_API_KEY=
mongodb:
container_name: notoric-snc-mongo
image: mongo


@@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
.replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
.replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
.replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
.replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL'])
.replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
.replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY']) \
# Write the config to a file
with open('config.json', 'w') as config_file:
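
The setup step above fills a JSON template with environment variables and writes the result to config.json, but the hunk is cut off before the write. A minimal self-contained sketch of the same pattern follows; the template filename and the final write call are assumptions, since neither is shown in the diff.

import json
import os

# Sketch of the env-var templating step (assumed shape; the template filename
# and the write call are not visible in the hunk above).
with open('config.template.json') as template_file:
    config_template = template_file.read()

config = config_template \
    .replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY'])

with open('config.json', 'w') as config_file:
    config_file.write(config)  # assumption: plain write of the substituted text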

Binary file not shown.


@@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
groq_key = config['groq']['api_key']
pixabayApiKey = config['pixabay']['api_key']
# Connect to MongoDB
print("Connecting to MongoDB...")
@@ -125,8 +127,15 @@ def get_newsfeed(category='general'):
article_data['author'] = article['author']
article_data['category'] = category
article_data['timestamp'] = datetime.now()
if (article['url'].contains("reuters.com") == False):
articles.append(article_data)
if (article['url'].find("news.google") != -1):
response = requests.get(article['url'])
soup = BeautifulSoup(response.text, 'html.parser')
htmlarticle = soup.find('article')
if htmlarticle != None:
if len(htmlarticle.text.strip()) > 250:
article_data['content'] = htmlarticle.text.strip()
articles.append(article_data)
print("Newsfeed data retrieved!")
return articles
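
The new lines above fetch Google News article pages and keep the text of the page's <article> tag when it is long enough to be useful. A compact standalone version of that scraping step could look like the sketch below; the helper name and the timeout are illustrative, while the 250-character threshold mirrors the diff.

import requests
from bs4 import BeautifulSoup

def extract_article_text(url, min_length=250):
    # Fetch the page and parse it with BeautifulSoup (hypothetical helper).
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep the <article> body only if it is long enough to be useful.
    html_article = soup.find('article')
    if html_article is not None:
        text = html_article.text.strip()
        if len(text) > min_length:
            return text
    return None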
@@ -134,84 +143,170 @@ def get_newsfeed(category='general'):
# Get most interesting news articles with AI
def get_interesting_news(articles):
print("Getting interesting news...")
interesting_articles = []
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If the AI doesn't return valid JSON, try to recover the generation from the error; otherwise fall back to the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
selected_articles = []
if len(articles) <= 3:
print("Not enough articles to select from! Using all articles...")
selected_articles = articles
else:
print("Getting interesting news...")
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If the AI doesn't return valid JSON, try to recover the generation from the error; otherwise fall back to the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
article_index = [0, 1, 2]
try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_article['content'] = article['content']
selected_articles.append(selected_article)
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_articles.append(selected_article)
print("Interesting news retrieved!")
print("Interesting news retrieved!")
# Get image & summary for all selected articles
print("Getting images and summaries for selected articles...")
for article in selected_articles:
img_keywords = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
},
{
"role": "user",
"content": article['title']
}
],
temperature=0.5,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
img_keywords = str(completion.choices[0].message.content)
img_keywords = img_keywords[:99]
except Exception as e:
print("Could not get image keywords, using defaults...")
img_keywords = article['category'] + " News article"
try:
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found!")
except Exception as e:
try:
img_keywords = img_keywords.split(" ")[0]
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found with shortened prompt!")
except Exception as e:
try:
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found using category!")
except Exception as e:
article['image'] = "https://picsum.photos/800/600"
summary = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
},
{
"role": "user",
"content": article['content']
}
],
temperature=1.4,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
summary = str(completion.choices[0].message.content)
except Exception as e:
print(e)
summary = "Read more about this article on the source website."
article['summary'] = summary
return selected_articles
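
The image lookup above tries progressively simpler Pixabay queries (the AI-generated keywords, then just the first keyword, then the article category) before falling back to a placeholder image. The same cascade can be written as a single loop; this is only a sketch, the helper name is made up, and the query parameters are taken from the diff.

import requests

PLACEHOLDER_IMAGE = "https://picsum.photos/800/600"

def find_image(queries, api_key):
    # Try each query in turn and return the first Pixabay hit, if any.
    for query in queries:
        try:
            response = requests.get(
                "https://pixabay.com/api/",
                params={"q": query, "key": api_key,
                        "orientation": "horizontal", "per_page": 3},
            )
            hits = response.json().get("hits", [])
            if hits:
                return hits[0]["largeImageURL"]
        except Exception:
            continue  # fall through to the next, simpler query
    return PLACEHOLDER_IMAGE

# Usage mirroring the diff: full keywords, first keyword only, then the category.
# article['image'] = find_image(
#     [img_keywords, img_keywords.split(" ")[0], f"{article['category']} news"],
#     pixabayApiKey)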
@@ -238,9 +333,7 @@ def get_all_news():
def delete_old_news():
print("Deleting old news articles...")
hrs = int(config['news']['article_lifetime'])
db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }})
print("Old news articles deleted!")
@@ -249,7 +342,7 @@ def delete_old_news():
create_collections()
schedule.every(5).minutes.do(write_weather)
schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)
schedule.every(config['news']['article_interval']).hours.do(get_all_news)
schedule.every(1).hours.do(delete_old_news)
write_weather()
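
The scheduling hunk registers the jobs but ends before the dispatch loop, which sits outside the diff context. With the schedule library, registered jobs are typically driven by a small polling loop like this sketch; the sleep interval is arbitrary.

import time
import schedule

# Sketch of the polling loop that actually runs the registered jobs.
while True:
    schedule.run_pending()
    time.sleep(30)  # arbitrary poll interval; any short sleep works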

Binary file not shown.

snc.py

@@ -4,6 +4,9 @@ import json
import pymongo
import requests
import schedule
import re
import requests
from bs4 import BeautifulSoup
from groq import Groq
from datetime import datetime, timedelta
@@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
"country" : "gb",
"article_lifetime": 6,
"article_interval": 1
},
"pixabay" : {
"api_key" : ""
}
}
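
For context, this hunk extends the default config that snc.py creates when config.json is missing, adding an empty pixabay section. The surrounding create-if-missing code is not shown; a minimal sketch of that step, trimmed to the new part, might look like the following (the json.dump call and indent are assumptions).

import json
import os

default_config = {
    "pixabay": {
        "api_key": ""  # filled in by the user or by the setup templating step
    }
}

# Sketch (assumed shape): write defaults only when no config exists yet.
if not os.path.exists('config.json'):
    with open('config.json', 'w') as config_file:
        json.dump(default_config, config_file, indent=4)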
@@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
groq_key = config['groq']['api_key']
pixabayApiKey = config['pixabay']['api_key']
# Connect to MongoDB
print("Connecting to MongoDB...")
@@ -149,10 +157,15 @@ def get_newsfeed(category='general'):
article_data['author'] = article['author']
article_data['category'] = category
article_data['timestamp'] = datetime.now()
if (article['url'].contains("reuters.com") == False):
articles.append(article_data)
if (article['url'].find("news.google") != -1):
response = requests.get(article['url'])
soup = BeautifulSoup(response.text, 'html.parser')
htmlarticle = soup.find('article')
if htmlarticle != None:
if len(htmlarticle.text.strip()) > 250:
article_data['content'] = htmlarticle.text.strip()
articles.append(article_data)
print("Newsfeed data retrieved!")
return articles
@@ -160,84 +173,170 @@ def get_newsfeed(category='general'):
# Get most interesting news articles with AI
def get_interesting_news(articles):
print("Getting interesting news...")
interesting_articles = []
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If the AI doesn't return valid JSON, try to recover the generation from the error; otherwise fall back to the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
selected_articles = []
if len(articles) <= 3:
print("Not enough articles to select from! Using all articles...")
selected_articles = articles
else:
print("Getting interesting news...")
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given an array of json elements, please provide the 3 indexes of the most interesting, important and notable news headlines that a mid-twenties person would like to read in the following format: {\"most_interesting\": {\"index\": index,\"title\": title},\"second_most_interesting\": {\"index\": index,\"title\": title},\"third_most_interesting\": {\"index\": index,\"title\": title}}"
},
{
"role": "user",
"content": str(articles)
}
],
temperature=1.3,
max_tokens=1024,
top_p=1,
stream=False,
response_format={"type": "json_object"},
stop=None,
)
response = str(completion.choices[0].message.content)
response = response.replace("\n", " ")
response = json.loads(response)
except Exception as e: # If the AI doesn't return valid JSON, try to recover the generation from the error; otherwise fall back to the first 3 articles
try:
response = e
response = response[18:]
response = json.loads(response)
response = response['error']['failed_generation']
response = response.replace("\n", " ")
response = json.loads(response)
except:
print("Error selecting articles! Using random selection...")
response = {
"most_interesting": {
"index": 0,
"title": "Interesting"
},
"second_most_interesting": {
"index": 1,
"title": "Interesting"
},
"third_most_interesting": {
"index": 2,
"title": "Interesting"
}
}
article_index = [0, 1, 2]
try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
try:
article_index[0] = response['most_interesting']['index']
article_index[1] = response['second_most_interesting']['index']
article_index[2] = response['third_most_interesting']['index']
print("Selected articles:" + str(article_index))
except Exception as e:
print(e)
article_index = [0, 1, 2]
print("Using default article selection...")
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_article['content'] = article['content']
selected_articles.append(selected_article)
for i in article_index:
article = articles[i]
selected_article = {}
selected_article['title'] = article['title']
selected_article['author'] = article['author']
selected_article['url'] = article['url']
selected_article['category'] = article['category']
selected_article['timestamp'] = datetime.now()
selected_articles.append(selected_article)
print("Interesting news retrieved!")
print("Interesting news retrieved!")
# Get image & summary for all selected articles
print("Getting images and summaries for selected articles...")
for article in selected_articles:
img_keywords = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
},
{
"role": "user",
"content": article['title']
}
],
temperature=0.5,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
img_keywords = str(completion.choices[0].message.content)
img_keywords = img_keywords[:99]
except Exception as e:
print("Could not get image keywords, using defaults...")
img_keywords = article['category'] + " News article"
try:
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found!")
except Exception as e:
try:
img_keywords = img_keywords.split(" ")[0]
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found with shortened prompt!")
except Exception as e:
try:
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
image_data = image_response.json()
article['image'] = image_data['hits'][0]['largeImageURL']
print("Image found using category!")
except Exception as e:
article['image'] = "https://picsum.photos/800/600"
summary = ""
try:
client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
model="gemma-7b-it",
messages=[
{
"role": "system",
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
},
{
"role": "user",
"content": article['content']
}
],
temperature=1.4,
max_tokens=1024,
top_p=1,
stream=False,
stop=None,
)
summary = str(completion.choices[0].message.content)
except Exception as e:
print(e)
summary = "Read more about this article on the source website."
article['summary'] = summary
return selected_articles
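
The selection logic above asks a Groq chat model (gemma-7b-it) in JSON mode for the three most interesting headline indexes, then falls back to indexes 0-2 when the response cannot be parsed. A trimmed-down sketch of that request-plus-fallback pattern is below; the prompt is abbreviated, the function name is made up, and the fallback simply uses the first three indexes instead of trying to salvage the failed generation from the error text.

import json
from groq import Groq

def pick_top_indexes(articles, api_key):
    # Ask the model for the three most interesting article indexes as JSON.
    try:
        client = Groq(api_key=api_key)
        completion = client.chat.completions.create(
            model="gemma-7b-it",
            messages=[
                {"role": "system",
                 "content": "Given an array of JSON articles, return JSON of the form "
                            "{\"most_interesting\": {\"index\": 0}, "
                            "\"second_most_interesting\": {\"index\": 1}, "
                            "\"third_most_interesting\": {\"index\": 2}}"},
                {"role": "user", "content": str(articles)},
            ],
            response_format={"type": "json_object"},
        )
        response = json.loads(completion.choices[0].message.content)
        return [response["most_interesting"]["index"],
                response["second_most_interesting"]["index"],
                response["third_most_interesting"]["index"]]
    except Exception:
        # Fallback mirroring the diff: just take the first three articles.
        return [0, 1, 2]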