feat: Implement stock listing extraction and database population
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
This commit is contained in:
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
Populate database with existing JSON data
|
||||
This script reads all existing JSON files and inserts data into the database
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime
|
||||
from database import StockDatabase
|
||||
|
||||
|
||||
def _iter_ticker_files(directory, suffix):
    """Yield ``(ticker, filepath)`` for every file in *directory* ending in *suffix*.

    The ticker is the filename with *suffix* stripped from its end. A missing
    directory (or a path that is not a directory) yields nothing. Names are
    sorted so the import order and console output are deterministic.
    """
    if not os.path.isdir(directory):
        return
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(suffix):
            yield filename[:-len(suffix)], os.path.join(directory, filename)


def _read_json(filepath):
    """Load one JSON document, always decoding it as UTF-8."""
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) can
    # fail on, or silently mangle, non-ASCII titles and snippets.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def _insert_news_items(db, ticker, items, stats, url_key, default_source=''):
    """Insert each news item for *ticker*; bump ``stats['news']`` per success."""
    for item in items:
        success = db.insert_news_article(
            ticker=ticker,
            title=item.get('title', ''),
            source=item.get('source', default_source),
            published_date=item.get('date', ''),
            url=item.get(url_key, ''),
            snippet=item.get('snippet', ''),
        )
        if success:
            stats['news'] += 1


def _import_news_dir(db, directory, suffix, stats, url_key, article_label):
    """Import articles and press releases from every matching file in *directory*.

    *url_key* is the item key holding the link ('url' for the regular scraper
    output, 'link' for SerpAPI output). *article_label* only affects the
    per-ticker progress line.
    """
    for ticker, filepath in _iter_ticker_files(directory, suffix):
        # Best-effort per file: one unreadable file must not stop the import.
        try:
            data = _read_json(filepath)

            articles = data.get('news_articles', [])
            _insert_news_items(db, ticker, articles, stats, url_key)

            prs = data.get('press_releases', [])
            _insert_news_items(db, ticker, prs, stats, url_key, 'Press Release')

            if articles or prs:
                print(f" ✓ {ticker}: {len(articles)} {article_label}, {len(prs)} PRs")
        except Exception as e:
            print(f" ✗ {ticker}: {e}")


def _insert_filings(db, ticker, records, stats, *, date_key, type_key, title_of, source):
    """Insert each filing record for *ticker* and count it in ``stats['filings']``.

    *title_of* maps a record to the title stored with the filing.
    """
    for record in records:
        db.insert_filing(
            ticker=ticker,
            filing_date=record.get(date_key, ''),
            filing_type=record.get(type_key, ''),
            title=title_of(record),
            document_url=record.get('url', ''),
            source=source,
        )
        # NOTE(review): every call is counted, as before. If insert_filing
        # reports success/failure via its return value, this should check it.
        stats['filings'] += 1


def populate_from_existing_data(data_dir="data", db=None):
    """Read all existing JSON files under *data_dir* and populate the database.

    Sub-directories scanned (each is skipped when absent):
    ``metrics``, ``news``, ``serpapi_news``, ``sec_filings``, ``sedar_filings``.

    Args:
        data_dir: Root directory holding the JSON sub-directories.
        db: Database object to insert into. When omitted, a ``StockDatabase``
            is created here and closed before returning; a caller-supplied
            object is left open for the caller to manage.

    Returns:
        Dict with the number of ``'metrics'``, ``'news'`` and ``'filings'``
        rows counted during the run.
    """
    print("\n" + "=" * 70)
    print("POPULATING DATABASE FROM EXISTING JSON FILES")
    print("=" * 70)

    owns_db = db is None
    if owns_db:
        db = StockDatabase()

    stats = {
        'metrics': 0,
        'news': 0,
        'filings': 0
    }

    try:
        # 1. Import calculated metrics
        print("\n📊 Importing financial metrics...")
        current_year = datetime.now().year
        metrics_dir = os.path.join(data_dir, "metrics")
        for ticker, filepath in _iter_ticker_files(metrics_dir, '_calculated_metrics.json'):
            try:
                metrics = _read_json(filepath)
                success = db.insert_financial_metrics(ticker, current_year, metrics, is_ttm=True)
                if success:
                    stats['metrics'] += 1
                    print(f" ✓ {ticker}: {len(metrics)} metrics")
                else:
                    # Previously a rejected insert was completely silent.
                    print(f" ✗ {ticker}: metrics insert reported failure")
            except Exception as e:
                print(f" ✗ {ticker}: {e}")

        # 2. Import news articles (from both regular scraping and SerpAPI)
        print("\n📰 Importing news articles...")
        _import_news_dir(
            db, os.path.join(data_dir, "news"), '_news_pr.json',
            stats, url_key='url', article_label='articles',
        )
        _import_news_dir(
            db, os.path.join(data_dir, "serpapi_news"), '_serpapi.json',
            stats, url_key='link', article_label='SerpAPI articles',
        )

        # 3. Import SEC filings
        print("\n📄 Importing SEC EDGAR filings...")
        sec_dir = os.path.join(data_dir, "sec_filings")
        for ticker, filepath in _iter_ticker_files(sec_dir, '_sec_filings.json'):
            try:
                data = _read_json(filepath)

                filings = data.get('filings', [])
                _insert_filings(
                    db, ticker, filings, stats,
                    date_key='filing_date', type_key='form_type',
                    title_of=lambda rec: rec.get('description', ''),
                    source='SEC EDGAR',
                )

                ownership = data.get('insider_ownership', [])
                _insert_filings(
                    db, ticker, ownership, stats,
                    date_key='filing_date', type_key='form_type',
                    title_of=lambda rec: f"Insider Transaction - {rec.get('owner', '')}",
                    source='SEC EDGAR - Ownership',
                )

                if filings or ownership:
                    print(f" ✓ {ticker}: {len(filings)} filings, {len(ownership)} ownership")
            except Exception as e:
                print(f" ✗ {ticker}: {e}")

        # 4. Import SEDAR+ filings
        print("\n📄 Importing SEDAR+ filings...")
        sedar_dir = os.path.join(data_dir, "sedar_filings")
        for ticker, filepath in _iter_ticker_files(sedar_dir, '_sedar_data.json'):
            try:
                data = _read_json(filepath)

                filings = data.get('filings', [])
                _insert_filings(
                    db, ticker, filings, stats,
                    date_key='date', type_key='type',
                    title_of=lambda rec: rec.get('title', ''),
                    source='SEDAR+',
                )

                if filings:
                    print(f" ✓ {ticker}: {len(filings)} SEDAR+ filings")
            except Exception as e:
                print(f" ✗ {ticker}: {e}")

        # Print final stats
        print("\n" + "=" * 70)
        print("DATABASE POPULATION COMPLETE")
        print("=" * 70)
        print(f"Financial metrics inserted: {stats['metrics']}")
        print(f"News articles inserted: {stats['news']}")
        print(f"Filings inserted: {stats['filings']}")
        print("=" * 70)
    finally:
        # Release the connection even if something above raised, but only
        # when this function opened it.
        if owns_db:
            db.close()

    return stats
|
||||
|
||||
|
||||
# Script entry point: run the import only when this file is executed
# directly, not when it is imported by another module.
if __name__ == "__main__":
    populate_from_existing_data()
|
||||
Reference in New Issue
Block a user