diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fe1f278 --- /dev/null +++ b/.gitignore @@ -0,0 +1,112 @@ +# .gitignore for DS Task AI News Project + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +### Environment ### +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Virtual environment +pythonenv* + +### IDE ### +# VS Code +.vscode/ +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +# PyCharm +.idea/ +*.iml +*.ipr +*.iws + +### Data Files ### +# Raw and processed news +data/raw_news/ +data/processed_news/ +*.csv +*.json +*.parquet +*.feather +*.pkl +*.pickle +*.db +*.sqlite + +# Vector database files +*.faiss +*.index +*.bin +*.vec + +### Logs ### +*.log +logs/ + +### OS Generated ### +.DS_Store +.DS_Store? 
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+### Groq/Cohere Cache ###
+.cache/
+model_cache/
+
+### Test Files ###
+test_output/
+benchmark_results/
+
+### Documentation ###
+docs/_build/
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e69de29
diff --git a/backend/config.py b/backend/config.py
new file mode 100644
index 0000000..26ce2d3
--- /dev/null
+++ b/backend/config.py
@@ -0,0 +1,22 @@
+"""Application settings read from the environment."""
+import os
+from dotenv import load_dotenv
+
+# Load variables from a local .env file, if present, before the settings
+# below are evaluated.
+load_dotenv()
+
+
+class Config:
+    """Static settings shared by the backend modules."""
+
+    # API keys; os.getenv yields None when a variable is not set.
+    COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+    # Feeds read by the /fetch-news endpoint, fetched over HTTPS.
+    RSS_FEEDS = [
+        "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
+        "https://feeds.bbci.co.uk/news/technology/rss.xml",
+    ]
+    # Path for the vector index file; no module in this patch reads it yet.
+    VECTOR_DB_PATH = "data/vector_db.index"
diff --git a/backend/embeddings.py b/backend/embeddings.py
new file mode 100644
index 0000000..0d02a6b
--- /dev/null
+++ b/backend/embeddings.py
@@ -0,0 +1,29 @@
+"""Thin wrapper around the Cohere embedding endpoint."""
+import cohere
+from backend.config import Config
+
+co = cohere.Client(Config.COHERE_API_KEY)
+
+
+def get_embeddings(texts, input_type="search_document"):
+    """Embed texts with Cohere's ``embed-english-v3.0`` model.
+
+    Args:
+        texts: List of strings to embed.
+        input_type: How the embeddings will be used. Cohere's v3 embedding
+            models require this field, e.g. ``"search_document"`` or
+            ``"search_query"``.
+
+    Returns:
+        ``response.embeddings`` from the Cohere client, or an empty list
+        when ``texts`` is empty.
+    """
+    if not texts:
+        # Nothing to embed; skip the API call.
+        return []
+    response = co.embed(
+        texts=texts,
+        model="embed-english-v3.0",
+        input_type=input_type,
+    )
+    return response.embeddings
diff --git a/backend/main.py b/backend/main.py
new file mode 100644
index 0000000..cc5701d
--- /dev/null
+++ b/backend/main.py
@@ -0,0 +1,34 @@
+"""FastAPI application exposing the news and recommendation endpoints."""
+from fastapi import FastAPI
+from fastapi.concurrency import run_in_threadpool
+from backend.news_fetcher import fetch_news
+from backend.recommender import recommend_similar
+from backend.config import Config
+
+app = FastAPI()
+
+
+@app.get("/fetch-news")
+async def get_latest_news():
+    """Return the entries of every feed listed in ``Config.RSS_FEEDS``."""
+    all_news = []
+    for feed in Config.RSS_FEEDS:
+        # fetch_news does blocking network I/O, so run it in a worker
+        # thread instead of stalling the event loop.
+        all_news.extend(await run_in_threadpool(fetch_news, feed))
+    return {"news": all_news}
+
+
+@app.get("/recommend")
+async def recommend_news(article_id: str):
+    """Return vector-index positions of articles similar to a sample text.
+
+    NOTE(review): ``article_id`` is accepted but not used yet; the query is
+    a hard-coded placeholder until articles can be looked up by id.
+    """
+    sample_text = "AI breakthroughs in 2024"
+    similar_ids = await run_in_threadpool(recommend_similar, sample_text)
+    # Convert to built-in ints so the response is JSON serializable, and
+    # drop negative values, which FAISS uses to pad missing results.
+    matches = [int(i) for i in similar_ids if int(i) >= 0]
+    return {"similar_articles": matches}
diff --git a/backend/news_fetcher.py b/backend/news_fetcher.py
new file mode 100644
index 0000000..eafe32c
--- /dev/null
+++ b/backend/news_fetcher.py
@@ -0,0 +1,41 @@
+# backend/news_fetcher.py
+"""Download RSS feeds and normalise their entries."""
+import calendar
+from datetime import datetime, timezone
+import feedparser
+
+
+def _published_at(entry):
+    """Return the entry's publication time as an aware UTC datetime.
+
+    feedparser stores the parsed date in ``published_parsed`` as a UTC
+    ``time.struct_time``. If the entry has no date, or feedparser could not
+    parse it, the current UTC time is used instead.
+    """
+    parsed = entry.get("published_parsed")
+    if parsed:
+        return datetime.fromtimestamp(calendar.timegm(parsed), tz=timezone.utc)
+    return datetime.now(timezone.utc)
+
+
+def fetch_news(rss_url):
+    """Parse the feed at ``rss_url`` and return its entries as dicts.
+
+    Args:
+        rss_url: URL of the RSS feed.
+
+    Returns:
+        A list with one dict per entry, with the keys ``title``,
+        ``content``, ``published`` (aware UTC datetime) and ``source``
+        (the feed URL). A missing title or description becomes ``""``.
+    """
+    feed = feedparser.parse(rss_url)
+    return [
+        {
+            "title": entry.get("title", ""),
+            "content": entry.get("description", ""),
+            "published": _published_at(entry),
+            "source": rss_url,
+        }
+        for entry in feed.entries
+    ]
diff --git a/backend/recommender.py b/backend/recommender.py
new file mode 100644
index 0000000..10b19d4
--- /dev/null
+++ b/backend/recommender.py
@@ -0,0 +1,21 @@
+"""Similarity search over the article vector index."""
+from backend.embeddings import get_embeddings
+from backend.vector_store import VectorDB
+
+# Module-level index shared by every call. NOTE(review): nothing in this
+# module adds vectors to it, so it stays empty unless populated elsewhere.
+db = VectorDB()
+
+
+def recommend_similar(article_text, top_k=3):
+    """Return index positions of the stored vectors closest to a text.
+
+    Args:
+        article_text: Text to embed and use as the search query.
+        top_k: Maximum number of neighbours to return.
+
+    Returns:
+        The result of ``VectorDB.search`` for the embedded text.
+    """
+    query_embed = get_embeddings([article_text])[0]
+    return db.search(query_embed, k=top_k)
diff --git a/backend/requirements.txt b/backend/requirements.txt
new file mode 100644
index 0000000..b679c0d
--- /dev/null
+++ b/backend/requirements.txt
@@ -0,0 +1,8 @@
+fastapi
+uvicorn
+feedparser
+cohere
+faiss-cpu
+python-dotenv
+groq
+numpy
diff --git a/backend/vector_store.py b/backend/vector_store.py
new file mode 100644
index 0000000..eb49041
--- /dev/null
+++ b/backend/vector_store.py
@@ -0,0 +1,55 @@
+"""In-memory FAISS index for article embeddings."""
+import numpy as np
+import faiss
+from backend.config import Config
+
+
+class VectorDB:
+    """Flat L2 FAISS index that also remembers the ids of its vectors."""
+
+    def __init__(self, dim=1024):
+        """Create an empty index.
+
+        Args:
+            dim: Length of the stored vectors. The default of 1024 is the
+                output size of Cohere's ``embed-english-v3.0`` model.
+        """
+        self.index = faiss.IndexFlatL2(dim)
+        # ids[i] is the caller-supplied id of the vector at position i.
+        self.ids = []
+
+    def add_vectors(self, ids, embeddings):
+        """Append ``embeddings`` to the index and record their ``ids``.
+
+        Args:
+            ids: One identifier per embedding, in the same order.
+            embeddings: Sequence of vectors matching the index dimension.
+
+        Raises:
+            ValueError: If ``ids`` and ``embeddings`` differ in length.
+        """
+        ids = list(ids)
+        vectors = np.asarray(embeddings, dtype="float32")
+        if len(ids) != len(vectors):
+            raise ValueError("ids and embeddings must have the same length")
+        if not ids:
+            return  # an empty batch has no 2-D shape for FAISS to add
+        self.index.add(vectors)
+        self.ids.extend(ids)
+
+    def search(self, query_embedding, k=5):
+        """Return index positions of up to ``k`` vectors nearest the query.
+
+        Args:
+            query_embedding: A single query vector.
+            k: Maximum number of neighbours to return.
+
+        Returns:
+            NumPy array of positions, nearest first; ``self.ids[position]``
+            is the id that was passed to ``add_vectors``.
+        """
+        # FAISS expects a float32 matrix with one query per row.
+        query = np.asarray([query_embedding], dtype="float32")
+        _distances, indices = self.index.search(query, k)
+        # FAISS fills missing results with -1 when fewer than k vectors exist.
+        return indices[0][indices[0] >= 0]
diff --git a/docs/API_Documentation.md b/docs/API_Documentation.md
new file mode 100644
index 0000000..e69de29
diff --git a/README.md b/docs/README.md
similarity index 100%
rename from README.md
rename to docs/README.md