Initial project setup

This commit is contained in:
Ayomide
2025-07-07 22:08:02 +01:00
parent c158262a49
commit b76a3e75f3
11 changed files with 208 additions and 0 deletions
+112
View File
@@ -0,0 +1,112 @@
# .gitignore for DS Task AI News Project
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Jupyter Notebook
.ipynb_checkpoints
### Environment ###
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Virtual environment
pythonenv*
### IDE ###
# VS Code
# Ignore the folder's contents rather than the folder itself: git cannot
# re-include a file whose parent directory is excluded.
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
# PyCharm
.idea/
*.iml
*.ipr
*.iws
### Data Files ###
# Raw and processed news
data/raw_news/
data/processed_news/
*.csv
*.json
# The blanket *.json rule above would re-ignore the shared VS Code files
# (the last matching pattern wins), so re-include them after it.
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.parquet
*.feather
*.pkl
*.pickle
*.db
*.sqlite
# Vector database files
*.faiss
*.index
*.bin
*.vec
### Logs ###
*.log
logs/
### OS Generated ###
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
### Groq/Cohere Cache ###
.cache/
model_cache/
### Test Files ###
test_output/
benchmark_results/
### Documentation ###
docs/_build/
View File
+13
View File
@@ -0,0 +1,13 @@
import os
from dotenv import load_dotenv
# Load variables from a .env file (when one exists) into the process
# environment before Config reads them below.
load_dotenv()
class Config:
    """Application settings, resolved once when this module is imported.

    API keys are read from the process environment and are ``None`` when the
    corresponding variable is not set.
    """

    # Credentials for the Cohere and Groq services; None when unset.
    COHERE_API_KEY = os.getenv("COHERE_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")

    # RSS feeds to pull articles from. Both use HTTPS so feed content is not
    # fetched over an unauthenticated plain-text connection.
    RSS_FEEDS = [
        "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "https://feeds.bbci.co.uk/news/technology/rss.xml",
    ]

    # Relative path reserved for the vector index file.
    VECTOR_DB_PATH = "data/vector_db.index"
+8
View File
@@ -0,0 +1,8 @@
import cohere
from backend.config import Config
# Module-level Cohere client, built at import time from Config.COHERE_API_KEY
# and used by get_embeddings below.
co = cohere.Client(Config.COHERE_API_KEY)
def get_embeddings(texts, input_type="search_document"):
    """Embed a batch of texts with Cohere's ``embed-english-v3.0`` model.

    Args:
        texts: Sequence of strings to embed.
        input_type: Purpose of the embeddings, forwarded to the Cohere API.
            The v3 embedding models require it; use ``"search_document"``
            for texts being indexed and ``"search_query"`` for lookup
            queries.

    Returns:
        One embedding per input text, in input order. An empty input yields
        an empty list without contacting the API.
    """
    # Nothing to embed: skip the network round trip (the API rejects an
    # empty batch anyway).
    if not texts:
        return []
    response = co.embed(
        texts=texts,
        model="embed-english-v3.0",
        input_type=input_type,
    )
    return response.embeddings
+20
View File
@@ -0,0 +1,20 @@
from fastapi import FastAPI
from backend.news_fetcher import fetch_news
from backend.recommender import recommend_similar
from backend.config import Config
# Application object; the @app.get decorators below register their handlers on it.
app = FastAPI()
@app.get("/fetch-news")
async def get_latest_news():
    """Fetch every configured RSS feed and return the combined article list.

    ``fetch_news`` downloads and parses a feed synchronously, so each call is
    handed to the event loop's default executor instead of running inside the
    coroutine. This keeps the server responsive and fetches the feeds
    concurrently.

    Returns:
        ``{"news": [...]}`` holding the articles of all feeds, grouped in the
        order the feeds appear in ``Config.RSS_FEEDS``.
    """
    import asyncio

    loop = asyncio.get_running_loop()
    # gather() preserves argument order, so the result order is deterministic.
    per_feed = await asyncio.gather(
        *(loop.run_in_executor(None, fetch_news, feed) for feed in Config.RSS_FEEDS)
    )
    all_news = [article for articles in per_feed for article in articles]
    return {"news": all_news}
@app.get("/recommend")
async def recommend_news(article_id: str):
    """Return the search hits for articles similar to the requested one.

    Args:
        article_id: Identifier of the reference article.
            NOTE(review): not used yet -- the lookup below always runs
            against a fixed sample text; wire this to real article content.

    Returns:
        ``{"similar_articles": [...]}`` where the list holds plain Python
        values so the response can be JSON-encoded.
    """
    import asyncio

    sample_text = "AI breakthroughs in 2024"
    loop = asyncio.get_running_loop()
    # recommend_similar issues a synchronous embedding request; run it off the
    # event loop so other requests are not stalled while it waits.
    similar_ids = await loop.run_in_executor(None, recommend_similar, sample_text)
    # The vector search may hand back a NumPy array, which the JSON encoder
    # rejects; convert it to a plain list first.
    if hasattr(similar_ids, "tolist"):
        similar_ids = similar_ids.tolist()
    else:
        similar_ids = list(similar_ids)
    return {"similar_articles": similar_ids}
+26
View File
@@ -0,0 +1,26 @@
# backend/news_fetcher.py
from datetime import datetime
import feedparser
def _parse_published(raw):
    """Turn an RFC 822 date string into a timezone-aware datetime.

    Falls back to the current UTC time when *raw* is missing or unparsable.
    """
    from datetime import timezone
    from email.utils import parsedate_to_datetime

    if raw:
        try:
            parsed = parsedate_to_datetime(raw)
        except (TypeError, ValueError):
            # Malformed dates raise TypeError before Python 3.10 and
            # ValueError from 3.10 on.
            parsed = None
        if parsed is not None:
            # A "-0000" offset yields a naive datetime; pin it to UTC so every
            # article carries an aware timestamp and they stay comparable.
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
    return datetime.now(timezone.utc)


def fetch_news(rss_url):
    """Download one RSS feed and normalise its entries.

    Args:
        rss_url: Address of the feed, passed straight to ``feedparser.parse``.

    Returns:
        A list with one dict per feed entry, each holding ``"title"``,
        ``"content"`` (the entry description), ``"published"`` (a
        timezone-aware ``datetime``) and ``"source"`` (``rss_url``). Missing
        titles/descriptions become empty strings; a missing or unparsable
        publication date becomes the current UTC time.
    """
    feed = feedparser.parse(rss_url)
    return [
        {
            # .get() keeps one incomplete entry from aborting the whole feed.
            "title": entry.get("title", ""),
            "content": entry.get("description", ""),
            "published": _parse_published(entry.get("published")),
            "source": rss_url,
        }
        for entry in feed.entries
    ]
+8
View File
@@ -0,0 +1,8 @@
from backend.embeddings import get_embeddings
from backend.vector_store import VectorDB
# Module-level VectorDB instance, created at import time and queried by
# recommend_similar below.
db = VectorDB()
def recommend_similar(article_text, top_k=3):
    """Look up the stored vectors nearest to the embedding of ``article_text``.

    Args:
        article_text: Text to embed and use as the search query.
        top_k: Number of neighbours to request from the vector store.

    Returns:
        Whatever ``db.search`` returns for the query embedding.
    """
    embeddings = get_embeddings([article_text])
    query_vector = embeddings[0]
    return db.search(query_vector, k=top_k)
+7
View File
@@ -0,0 +1,7 @@
fastapi
uvicorn
feedparser
cohere
python-dotenv
groq
numpy
# Provides the `faiss` module imported by the vector store
faiss-cpu
+14
View File
@@ -0,0 +1,14 @@
import numpy as np
import faiss
from backend.config import Config
class VectorDB:
    """In-memory FAISS index doing exact L2 (Euclidean) nearest-neighbour search."""

    def __init__(self, dim=1024):
        """Create an empty index.

        Args:
            dim: Dimensionality of the stored vectors. Defaults to 1024, the
                output size of Cohere's ``embed-english-v3.0`` model; vectors
                of any other size are rejected by FAISS.
        """
        self.dim = dim
        self.index = faiss.IndexFlatL2(dim)

    def add_vectors(self, ids, embeddings):
        """Append embeddings to the index.

        Args:
            ids: Identifiers of the embeddings.
                NOTE(review): they are not stored; ``search`` reports
                insertion positions, so callers must keep their own
                position-to-id mapping for now.
            embeddings: 2-D array-like of shape ``(n, dim)``.
        """
        # FAISS only accepts C-contiguous float32 matrices.
        self.index.add(np.ascontiguousarray(embeddings, dtype="float32"))

    def search(self, query_embedding, k=5):
        """Return the positions of the ``k`` vectors closest to the query.

        Args:
            query_embedding: One embedding as a 1-D sequence of floats.
            k: Number of neighbours to retrieve.

        Returns:
            1-D integer array of insertion positions, nearest first. FAISS
            pads the result with ``-1`` when fewer than ``k`` vectors are
            stored.
        """
        # A list of Python floats would become float64, which FAISS rejects;
        # cast explicitly and shape the query as a single-row matrix.
        query = np.ascontiguousarray(query_embedding, dtype="float32").reshape(1, -1)
        _, indices = self.index.search(query, k)
        return indices[0]
View File
View File