Added AI generated Summaries and images
This commit is contained in:
parent
0c86bd74ef
commit
105d14eebc
|
@ -16,5 +16,8 @@
|
||||||
"country" : "$NEWSAPI_COUNTRY",
|
"country" : "$NEWSAPI_COUNTRY",
|
||||||
"article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
|
"article_lifetime": "$NEWSAPI_ARTICLE_LIFETIME",
|
||||||
"article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
|
"article_interval": "$NEWSAPI_ARTICLE_INTERVAL"
|
||||||
|
},
|
||||||
|
"pixabay" : {
|
||||||
|
"api_key" : "$PIXABAY_API_KEY"
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -14,6 +14,7 @@ services:
|
||||||
- NEWSAPI_COUNTRY=gb
|
- NEWSAPI_COUNTRY=gb
|
||||||
- ARTICLE_LIFETIME=6 # in hours
|
- ARTICLE_LIFETIME=6 # in hours
|
||||||
- ARTICLE_INTERVAL=1 # in hours
|
- ARTICLE_INTERVAL=1 # in hours
|
||||||
|
- PIXABAY_API_KEY=
|
||||||
mongodb:
|
mongodb:
|
||||||
container_name: notoric-snc-mongo
|
container_name: notoric-snc-mongo
|
||||||
image: mongo
|
image: mongo
|
||||||
|
|
|
@ -15,7 +15,8 @@ config = config_template.replace('$MONGO_HOST', os.environ['MONGO_HOST']) \
|
||||||
.replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
|
.replace('$NEWSAPI_API_KEY', os.environ['NEWSAPI_API_KEY']) \
|
||||||
.replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
|
.replace('$NEWSAPI_COUNTRY', os.environ['NEWSAPI_COUNTRY']) \
|
||||||
.replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
|
.replace('$NEWSAPI_ARTICLE_LIFETIME', os.environ['ARTICLE_LIFETIME']) \
|
||||||
.replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL'])
|
.replace('$NEWSAPI_ARTICLE_INTERVAL', os.environ['ARTICLE_INTERVAL']) \
|
||||||
|
.replace('$PIXABAY_API_KEY', os.environ['PIXABAY_API_KEY']) \
|
||||||
|
|
||||||
# Write the config to a file
|
# Write the config to a file
|
||||||
with open('config.json', 'w') as config_file:
|
with open('config.json', 'w') as config_file:
|
||||||
|
|
Binary file not shown.
111
Docker/snc.py
111
Docker/snc.py
|
@ -35,6 +35,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
|
||||||
|
|
||||||
groq_key = config['groq']['api_key']
|
groq_key = config['groq']['api_key']
|
||||||
|
|
||||||
|
pixabayApiKey = config['pixabay']['api_key']
|
||||||
|
|
||||||
# Connect to MongoDB
|
# Connect to MongoDB
|
||||||
print("Connecting to MongoDB...")
|
print("Connecting to MongoDB...")
|
||||||
|
|
||||||
|
@ -125,7 +127,14 @@ def get_newsfeed(category='general'):
|
||||||
article_data['author'] = article['author']
|
article_data['author'] = article['author']
|
||||||
article_data['category'] = category
|
article_data['category'] = category
|
||||||
article_data['timestamp'] = datetime.now()
|
article_data['timestamp'] = datetime.now()
|
||||||
if (article['url'].contains("reuters.com") == False):
|
|
||||||
|
if (article['url'].find("news.google") != -1):
|
||||||
|
response = requests.get(article['url'])
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
htmlarticle = soup.find('article')
|
||||||
|
if htmlarticle != None:
|
||||||
|
if len(htmlarticle.text.strip()) > 250:
|
||||||
|
article_data['content'] = htmlarticle.text.strip()
|
||||||
articles.append(article_data)
|
articles.append(article_data)
|
||||||
|
|
||||||
print("Newsfeed data retrieved!")
|
print("Newsfeed data retrieved!")
|
||||||
|
@ -134,8 +143,14 @@ def get_newsfeed(category='general'):
|
||||||
# Get most interesting news articles with AI
|
# Get most interesting news articles with AI
|
||||||
|
|
||||||
def get_interesting_news(articles):
|
def get_interesting_news(articles):
|
||||||
|
|
||||||
|
selected_articles = []
|
||||||
|
|
||||||
|
if len(articles) <= 3:
|
||||||
|
print("Not enough articles to select from! Using all articles...")
|
||||||
|
selected_articles = articles
|
||||||
|
else:
|
||||||
print("Getting interesting news...")
|
print("Getting interesting news...")
|
||||||
interesting_articles = []
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = Groq(api_key=groq_key)
|
client = Groq(api_key=groq_key)
|
||||||
|
@ -187,8 +202,6 @@ def get_interesting_news(articles):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
selected_articles = []
|
|
||||||
|
|
||||||
article_index = [0, 1, 2]
|
article_index = [0, 1, 2]
|
||||||
try:
|
try:
|
||||||
article_index[0] = response['most_interesting']['index']
|
article_index[0] = response['most_interesting']['index']
|
||||||
|
@ -200,7 +213,6 @@ def get_interesting_news(articles):
|
||||||
article_index = [0, 1, 2]
|
article_index = [0, 1, 2]
|
||||||
print("Using default article selection...")
|
print("Using default article selection...")
|
||||||
|
|
||||||
|
|
||||||
for i in article_index:
|
for i in article_index:
|
||||||
article = articles[i]
|
article = articles[i]
|
||||||
selected_article = {}
|
selected_article = {}
|
||||||
|
@ -209,10 +221,93 @@ def get_interesting_news(articles):
|
||||||
selected_article['url'] = article['url']
|
selected_article['url'] = article['url']
|
||||||
selected_article['category'] = article['category']
|
selected_article['category'] = article['category']
|
||||||
selected_article['timestamp'] = datetime.now()
|
selected_article['timestamp'] = datetime.now()
|
||||||
|
selected_article['content'] = article['content']
|
||||||
selected_articles.append(selected_article)
|
selected_articles.append(selected_article)
|
||||||
|
|
||||||
print("Interesting news retrieved!")
|
print("Interesting news retrieved!")
|
||||||
|
|
||||||
|
# Get image & summary for all selected articles
|
||||||
|
|
||||||
|
print("Getting images and summaries for selected articles...")
|
||||||
|
|
||||||
|
for article in selected_articles:
|
||||||
|
img_keywords = ""
|
||||||
|
try:
|
||||||
|
client = Groq(api_key=groq_key)
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gemma-7b-it",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": article['title']
|
||||||
|
}
|
||||||
|
],
|
||||||
|
temperature=0.5,
|
||||||
|
max_tokens=1024,
|
||||||
|
top_p=1,
|
||||||
|
stream=False,
|
||||||
|
stop=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
img_keywords = str(completion.choices[0].message.content)
|
||||||
|
img_keywords = img_keywords[:99]
|
||||||
|
except Exception as e:
|
||||||
|
print("Could not get image keywords, using defaults...")
|
||||||
|
img_keywords = article['category'] + " News article"
|
||||||
|
|
||||||
|
try:
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found!")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
img_keywords = img_keywords.split(" ")[0]
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found with shortened prompt!")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found using category!")
|
||||||
|
except Exception as e:
|
||||||
|
article['image'] = "https://picsum.photos/800/600"
|
||||||
|
|
||||||
|
summary = ""
|
||||||
|
try:
|
||||||
|
client = Groq(api_key=groq_key)
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gemma-7b-it",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": article['content']
|
||||||
|
}
|
||||||
|
],
|
||||||
|
temperature=1.4,
|
||||||
|
max_tokens=1024,
|
||||||
|
top_p=1,
|
||||||
|
stream=False,
|
||||||
|
stop=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
summary = str(completion.choices[0].message.content)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
summary = "Read more about this article on the source website."
|
||||||
|
article['summary'] = summary
|
||||||
|
|
||||||
return selected_articles
|
return selected_articles
|
||||||
|
|
||||||
# Write newsfeed data to MongoDB
|
# Write newsfeed data to MongoDB
|
||||||
|
@ -238,9 +333,7 @@ def get_all_news():
|
||||||
def delete_old_news():
|
def delete_old_news():
|
||||||
print("Deleting old news articles...")
|
print("Deleting old news articles...")
|
||||||
|
|
||||||
hrs = int(config['news']['article_lifetime'])
|
db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=config['news']['article_lifetime']) }})
|
||||||
|
|
||||||
db.newsfeed.delete_many({'timestamp': {'$lt': datetime.now() - timedelta(hours=1) }})
|
|
||||||
|
|
||||||
print("Old news articles deleted!")
|
print("Old news articles deleted!")
|
||||||
|
|
||||||
|
@ -249,7 +342,7 @@ def delete_old_news():
|
||||||
create_collections()
|
create_collections()
|
||||||
|
|
||||||
schedule.every(5).minutes.do(write_weather)
|
schedule.every(5).minutes.do(write_weather)
|
||||||
schedule.every(int(config['news']['article_interval'])).hours.do(get_all_news)
|
schedule.every(config['news']['article_interval']).hours.do(get_all_news)
|
||||||
schedule.every(1).hours.do(delete_old_news)
|
schedule.every(1).hours.do(delete_old_news)
|
||||||
|
|
||||||
write_weather()
|
write_weather()
|
||||||
|
|
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
113
snc.py
113
snc.py
|
@ -4,6 +4,9 @@ import json
|
||||||
import pymongo
|
import pymongo
|
||||||
import requests
|
import requests
|
||||||
import schedule
|
import schedule
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from groq import Groq
|
from groq import Groq
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
@ -37,6 +40,9 @@ if os.path.exists('config.json') == False:
|
||||||
"country" : "gb",
|
"country" : "gb",
|
||||||
"article_lifetime": 6,
|
"article_lifetime": 6,
|
||||||
"article_interval": 1
|
"article_interval": 1
|
||||||
|
},
|
||||||
|
"pixabay" : {
|
||||||
|
"api_key" : ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,6 +65,8 @@ news_url = f"http://newsapi.org/v2/top-headlines?country={config['news']['countr
|
||||||
|
|
||||||
groq_key = config['groq']['api_key']
|
groq_key = config['groq']['api_key']
|
||||||
|
|
||||||
|
pixabayApiKey = config['pixabay']['api_key']
|
||||||
|
|
||||||
# Connect to MongoDB
|
# Connect to MongoDB
|
||||||
print("Connecting to MongoDB...")
|
print("Connecting to MongoDB...")
|
||||||
|
|
||||||
|
@ -149,19 +157,30 @@ def get_newsfeed(category='general'):
|
||||||
article_data['author'] = article['author']
|
article_data['author'] = article['author']
|
||||||
article_data['category'] = category
|
article_data['category'] = category
|
||||||
article_data['timestamp'] = datetime.now()
|
article_data['timestamp'] = datetime.now()
|
||||||
if (article['url'].contains("reuters.com") == False):
|
|
||||||
|
if (article['url'].find("news.google") != -1):
|
||||||
|
response = requests.get(article['url'])
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
htmlarticle = soup.find('article')
|
||||||
|
if htmlarticle != None:
|
||||||
|
if len(htmlarticle.text.strip()) > 250:
|
||||||
|
article_data['content'] = htmlarticle.text.strip()
|
||||||
articles.append(article_data)
|
articles.append(article_data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print("Newsfeed data retrieved!")
|
print("Newsfeed data retrieved!")
|
||||||
return articles
|
return articles
|
||||||
|
|
||||||
# Get most interesting news articles with AI
|
# Get most interesting news articles with AI
|
||||||
|
|
||||||
def get_interesting_news(articles):
|
def get_interesting_news(articles):
|
||||||
|
|
||||||
|
selected_articles = []
|
||||||
|
|
||||||
|
if len(articles) <= 3:
|
||||||
|
print("Not enough articles to select from! Using all articles...")
|
||||||
|
selected_articles = articles
|
||||||
|
else:
|
||||||
print("Getting interesting news...")
|
print("Getting interesting news...")
|
||||||
interesting_articles = []
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
client = Groq(api_key=groq_key)
|
client = Groq(api_key=groq_key)
|
||||||
|
@ -213,8 +232,6 @@ def get_interesting_news(articles):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
selected_articles = []
|
|
||||||
|
|
||||||
article_index = [0, 1, 2]
|
article_index = [0, 1, 2]
|
||||||
try:
|
try:
|
||||||
article_index[0] = response['most_interesting']['index']
|
article_index[0] = response['most_interesting']['index']
|
||||||
|
@ -226,7 +243,6 @@ def get_interesting_news(articles):
|
||||||
article_index = [0, 1, 2]
|
article_index = [0, 1, 2]
|
||||||
print("Using default article selection...")
|
print("Using default article selection...")
|
||||||
|
|
||||||
|
|
||||||
for i in article_index:
|
for i in article_index:
|
||||||
article = articles[i]
|
article = articles[i]
|
||||||
selected_article = {}
|
selected_article = {}
|
||||||
|
@ -235,10 +251,93 @@ def get_interesting_news(articles):
|
||||||
selected_article['url'] = article['url']
|
selected_article['url'] = article['url']
|
||||||
selected_article['category'] = article['category']
|
selected_article['category'] = article['category']
|
||||||
selected_article['timestamp'] = datetime.now()
|
selected_article['timestamp'] = datetime.now()
|
||||||
|
selected_article['content'] = article['content']
|
||||||
selected_articles.append(selected_article)
|
selected_articles.append(selected_article)
|
||||||
|
|
||||||
print("Interesting news retrieved!")
|
print("Interesting news retrieved!")
|
||||||
|
|
||||||
|
# Get image & summary for all selected articles
|
||||||
|
|
||||||
|
print("Getting images and summaries for selected articles...")
|
||||||
|
|
||||||
|
for article in selected_articles:
|
||||||
|
img_keywords = ""
|
||||||
|
try:
|
||||||
|
client = Groq(api_key=groq_key)
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gemma-7b-it",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You will be given a title for an article, provide a few keywords (around 3 maximum) (please only use short, vague and common words) for an image that would match the article (less than 50 characters) in the following format: keyword1 keyword2 keyword3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": article['title']
|
||||||
|
}
|
||||||
|
],
|
||||||
|
temperature=0.5,
|
||||||
|
max_tokens=1024,
|
||||||
|
top_p=1,
|
||||||
|
stream=False,
|
||||||
|
stop=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
img_keywords = str(completion.choices[0].message.content)
|
||||||
|
img_keywords = img_keywords[:99]
|
||||||
|
except Exception as e:
|
||||||
|
print("Could not get image keywords, using defaults...")
|
||||||
|
img_keywords = article['category'] + " News article"
|
||||||
|
|
||||||
|
try:
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found!")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
img_keywords = img_keywords.split(" ")[0]
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={img_keywords}&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found with shortened prompt!")
|
||||||
|
except Exception as e:
|
||||||
|
try:
|
||||||
|
image_response = requests.get(f"https://pixabay.com/api/?q={article['category']} news&key={pixabayApiKey}&orientation=horizontal&per_page=3")
|
||||||
|
image_data = image_response.json()
|
||||||
|
article['image'] = image_data['hits'][0]['largeImageURL']
|
||||||
|
print("Image found using category!")
|
||||||
|
except Exception as e:
|
||||||
|
article['image'] = "https://picsum.photos/800/600"
|
||||||
|
|
||||||
|
summary = ""
|
||||||
|
try:
|
||||||
|
client = Groq(api_key=groq_key)
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="gemma-7b-it",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": "You will be given the source code for a webpage. Please respond with a descriptive summary (around 100 words) of the articles content as a radio announcer would read it out, assuming i know nothing about the subject of the article you will need to provide context and your summary should work as a standalone article. Make sure the article is using spoken language and is easy to read and understand for everyone"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": article['content']
|
||||||
|
}
|
||||||
|
],
|
||||||
|
temperature=1.4,
|
||||||
|
max_tokens=1024,
|
||||||
|
top_p=1,
|
||||||
|
stream=False,
|
||||||
|
stop=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
summary = str(completion.choices[0].message.content)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
summary = "Read more about this article on the source website."
|
||||||
|
article['summary'] = summary
|
||||||
|
|
||||||
return selected_articles
|
return selected_articles
|
||||||
|
|
||||||
# Write newsfeed data to MongoDB
|
# Write newsfeed data to MongoDB
|
||||||
|
|
Loading…
Reference in New Issue