Initial project setup
This commit is contained in:
+112
@@ -0,0 +1,112 @@
|
||||
# .gitignore for DS Task AI News Project
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
### Environment ###
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Virtual environment
|
||||
pythonenv*
|
||||
|
||||
### IDE ###
|
||||
# VS Code
|
||||
.vscode/
|
||||
!.vscode/settings.json
|
||||
!.vscode/tasks.json
|
||||
!.vscode/launch.json
|
||||
!.vscode/extensions.json
|
||||
|
||||
# PyCharm
|
||||
.idea/
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
|
||||
### Data Files ###
|
||||
# Raw and processed news
|
||||
data/raw_news/
|
||||
data/processed_news/
|
||||
*.csv
|
||||
*.json
|
||||
*.parquet
|
||||
*.feather
|
||||
*.pkl
|
||||
*.pickle
|
||||
*.db
|
||||
*.sqlite
|
||||
|
||||
# Vector database files
|
||||
*.faiss
|
||||
*.index
|
||||
*.bin
|
||||
*.vec
|
||||
|
||||
### Logs ###
|
||||
*.log
|
||||
logs/
|
||||
|
||||
### OS Generated ###
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
### Groq/Cohere Cache ###
|
||||
.cache/
|
||||
model_cache/
|
||||
|
||||
### Test Files ###
|
||||
test_output/
|
||||
benchmark_results/
|
||||
|
||||
### Documentation ###
|
||||
docs/_build/
|
||||
@@ -0,0 +1,13 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load variables from a local .env file (if one exists) into the process
# environment before Config reads the API keys with os.getenv below.
load_dotenv()
|
||||
|
||||
class Config:
    """Static application settings, evaluated once when this module is imported.

    The API keys are read from the process environment and are ``None`` when
    the corresponding variable is not set; nothing here validates them.
    """

    # Credentials for the Cohere and Groq APIs (None if the variable is unset).
    COHERE_API_KEY = os.getenv("COHERE_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")

    # RSS feeds that are polled for technology news. Both are fetched over
    # HTTPS so feed content cannot be read or tampered with in transit.
    RSS_FEEDS = [
        "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "https://feeds.bbci.co.uk/news/technology/rss.xml",
    ]

    # Intended on-disk location of the vector index.
    # NOTE(review): no code visible in this commit reads this path yet.
    VECTOR_DB_PATH = "data/vector_db.index"
|
||||
@@ -0,0 +1,8 @@
|
||||
import cohere
|
||||
from backend.config import Config
|
||||
|
||||
# Module-level Cohere client used by get_embeddings. It is constructed at
# import time from Config.COHERE_API_KEY, which is None when the environment
# variable is not set.
co = cohere.Client(Config.COHERE_API_KEY)
|
||||
|
||||
def get_embeddings(texts, input_type="search_document"):
    """Embed *texts* with Cohere's ``embed-english-v3.0`` model.

    Args:
        texts: List of strings to embed.
        input_type: Purpose of the embeddings, forwarded to the Cohere API.
            The v3 embed models reject requests that omit it. Documented
            values include ``"search_document"`` (the default, for text that
            is stored and compared) and ``"search_query"``.

    Returns:
        ``response.embeddings`` from the Cohere client (one vector per input
        text according to the Cohere API), or an empty list when *texts* is
        empty.
    """
    if not texts:
        # Nothing to embed: skip the network round trip entirely.
        return []
    response = co.embed(
        texts=texts,
        model="embed-english-v3.0",
        input_type=input_type,
    )
    return response.embeddings
|
||||
@@ -0,0 +1,20 @@
|
||||
from fastapi import FastAPI
|
||||
from backend.news_fetcher import fetch_news
|
||||
from backend.recommender import recommend_similar
|
||||
from backend.config import Config
|
||||
|
||||
# Application object; the @app.get decorators below register their handlers
# on it.
app = FastAPI()
|
||||
|
||||
@app.get("/fetch-news")
async def get_latest_news():
    """Fetch every feed in ``Config.RSS_FEEDS`` and return the combined articles.

    ``fetch_news`` performs blocking network I/O, so each call is handed to
    the event loop's default executor instead of running on the loop itself.
    The feeds are fetched concurrently; results are concatenated in the order
    the feeds are listed in ``Config.RSS_FEEDS``.

    Returns:
        ``{"news": [...]}`` where the list holds the article dicts produced
        by ``fetch_news`` for each feed.
    """
    import asyncio  # local import: asyncio is not imported at file level

    loop = asyncio.get_running_loop()
    per_feed = await asyncio.gather(
        *(loop.run_in_executor(None, fetch_news, feed) for feed in Config.RSS_FEEDS)
    )

    all_news = []
    for articles in per_feed:
        all_news.extend(articles)
    return {"news": all_news}
|
||||
|
||||
|
||||
@app.get("/recommend")
async def recommend_news(article_id: str):
    """Return the indices of stored articles most similar to a query text.

    NOTE: ``article_id`` is accepted but not used yet; the query is a
    hard-coded sample string until article lookup exists.

    Args:
        article_id: Identifier of the article to find neighbours for
            (currently ignored).

    Returns:
        ``{"similar_articles": [...]}`` where the list contains plain Python
        ints, so the response can be serialised to JSON.
    """
    import asyncio  # local import: asyncio is not imported at file level

    # TODO: look up the text for ``article_id`` instead of this placeholder.
    sample_text = "AI breakthroughs in 2024"

    # recommend_similar calls the embedding API (blocking network I/O), so it
    # is run on the default executor to keep the event loop free.
    loop = asyncio.get_running_loop()
    similar_ids = await loop.run_in_executor(None, recommend_similar, sample_text)

    # The search result may be a NumPy array, which cannot be serialised to
    # JSON; convert to ints and drop negative values, which FAISS uses as
    # padding when it has fewer matches than requested.
    as_ints = [int(i) for i in similar_ids]
    return {"similar_articles": [i for i in as_ints if i >= 0]}
|
||||
@@ -0,0 +1,26 @@
|
||||
# backend/news_fetcher.py
|
||||
from datetime import datetime
|
||||
import feedparser
|
||||
|
||||
def _parse_published(raw):
    """Convert an RSS date string into a timezone-aware ``datetime``.

    Falls back to the current UTC time when *raw* is missing or cannot be
    parsed, so the result is always timezone-aware.
    """
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if raw:
        try:
            parsed = parsedate_to_datetime(raw)
        except (TypeError, ValueError):
            # Malformed date (older Python versions raise TypeError here).
            parsed = None
        if parsed is not None:
            if parsed.tzinfo is None:
                # A "-0000" offset is returned as a naive datetime; treat it
                # as UTC so naive and aware values are never mixed.
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
    return datetime.now(timezone.utc)


def fetch_news(rss_url):
    """Download and parse one RSS feed into a list of article dicts.

    Args:
        rss_url: URL of the RSS feed to fetch.

    Returns:
        A list with one dict per feed entry, with the keys ``"title"``,
        ``"content"`` (the entry description), ``"published"`` (a
        timezone-aware ``datetime``) and ``"source"`` (*rss_url*). Missing
        title or description fields become empty strings rather than
        raising.
    """
    feed = feedparser.parse(rss_url)
    return [
        {
            "title": entry.get("title", ""),
            "content": entry.get("description", ""),
            "published": _parse_published(entry.get("published")),
            "source": rss_url,
        }
        for entry in feed.entries
    ]
|
||||
@@ -0,0 +1,8 @@
|
||||
from backend.embeddings import get_embeddings
|
||||
from backend.vector_store import VectorDB
|
||||
|
||||
# Module-level vector store shared by every recommend_similar call.
# NOTE(review): nothing visible in this commit adds vectors to it, so it
# appears to start out empty — confirm where it gets populated.
db = VectorDB()
|
||||
|
||||
def recommend_similar(article_text, top_k=3):
    """Search the vector store for neighbours of *article_text*.

    The text is embedded through ``get_embeddings`` and the resulting vector
    is passed to the module-level ``db`` for a nearest-neighbour search.

    Args:
        article_text: Text used as the similarity query.
        top_k: Number of neighbours to request from the vector store.

    Returns:
        Whatever ``db.search`` returns for the query vector.
    """
    vectors = get_embeddings([article_text])
    query_vector = vectors[0]
    return db.search(query_vector, k=top_k)
|
||||
@@ -0,0 +1,7 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
feedparser
|
||||
cohere
|
||||
python-dotenv
|
||||
groq
|
||||
numpy
|
||||
@@ -0,0 +1,14 @@
|
||||
import numpy as np
|
||||
import faiss
|
||||
from backend.config import Config
|
||||
|
||||
class VectorDB:
    """In-memory FAISS index (flat, L2 distance) over embedding vectors.

    NOTE(review): ``add_vectors`` accepts ``ids`` but does not store them, so
    ``search`` returns positions in insertion order rather than caller ids.
    """

    def __init__(self, dim=1024):
        """Create an empty flat L2 index.

        Args:
            dim: Dimensionality of the vectors to store. Defaults to 1024,
                the output size of Cohere's ``embed-english-v3.0`` model.
        """
        self.index = faiss.IndexFlatL2(dim)

    def add_vectors(self, ids, embeddings):
        """Append *embeddings* to the index.

        Args:
            ids: Identifiers for the vectors (currently unused).
            embeddings: One vector or a sequence of vectors; converted to a
                2-D float32 array before being added.
        """
        vectors = np.atleast_2d(np.asarray(embeddings, dtype="float32"))
        if vectors.size == 0:
            # Nothing to add; avoid handing FAISS an empty array.
            return
        self.index.add(vectors)

    def search(self, query_embedding, k=5):
        """Return the positions of up to *k* nearest stored vectors.

        Args:
            query_embedding: A single query vector.
            k: Maximum number of neighbours to return.

        Returns:
            A 1-D integer NumPy array of index positions, nearest first.
            FAISS pads with ``-1`` when fewer than *k* vectors are stored;
            those entries are removed.
        """
        # FAISS works on float32 input, matching the cast in add_vectors.
        query = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
        distances, indices = self.index.search(query, k)
        hits = indices[0]
        return hits[hits >= 0]
|
||||
Reference in New Issue
Block a user