80ee708348
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright. - Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation. - Developed `populate_database.py` to populate the database with existing JSON data. - Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks. - Added `setup.py` for initial setup and testing of the system. - Created `watchlist.txt` template for user-defined stock tracking. - Generated `final_test_output.txt` to log the results of the test run.
236 lines
9.5 KiB
Python
236 lines
9.5 KiB
Python
"""
|
|
Populate database with existing JSON data
|
|
This script reads all existing JSON files and inserts data into the database
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
from datetime import datetime
|
|
from database import StockDatabase
|
|
|
|
|
|
def _iter_ticker_files(directory, suffix):
    """Yield ``(ticker, filepath)`` for every file in ``directory`` ending with ``suffix``.

    The ticker is the filename with the trailing ``suffix`` removed. Nothing is
    yielded when ``directory`` is not an existing directory. Filenames are
    visited in sorted order so output and insert order are deterministic.
    """
    if not os.path.isdir(directory):
        return
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(suffix):
            # Strip only the trailing suffix (str.replace would also remove
            # an occurrence in the middle of the name).
            yield filename[:-len(suffix)], os.path.join(directory, filename)


def _load_json(filepath):
    """Return the decoded contents of the JSON file at ``filepath``.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default text encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def _insert_news_items(db, stats, ticker, items, url_key, default_source):
    """Insert each dict in ``items`` via ``db.insert_news_article``.

    ``stats['news']`` is incremented once per insert that returns a truthy
    value. ``url_key`` names the field holding the article URL and
    ``default_source`` is used when an item has no ``'source'`` field.
    """
    for item in items:
        success = db.insert_news_article(
            ticker=ticker,
            title=item.get('title', ''),
            source=item.get('source', default_source),
            published_date=item.get('date', ''),
            url=item.get(url_key, ''),
            snippet=item.get('snippet', ''),
        )
        if success:
            stats['news'] += 1


def _insert_filings(db, stats, ticker, records, date_key, type_key, title_of, source):
    """Insert each dict in ``records`` via ``db.insert_filing``.

    ``date_key`` / ``type_key`` name the fields holding the filing date and
    type, ``title_of`` maps a record to its title string, and ``source`` is
    passed through unchanged.
    """
    for record in records:
        result = db.insert_filing(
            ticker=ticker,
            filing_date=record.get(date_key, ''),
            filing_type=record.get(type_key, ''),
            title=title_of(record),
            document_url=record.get('url', ''),
            source=source,
        )
        # NOTE(review): insert_filing's return value is not visible here.
        # Count every call unless it explicitly reports False, so a
        # None-returning implementation is counted exactly as before.
        if result is not False:
            stats['filings'] += 1


def _import_metrics(db, stats, metrics_dir):
    """Insert every ``*_calculated_metrics.json`` file in ``metrics_dir``.

    Each file's contents are passed to ``db.insert_financial_metrics`` with
    the current calendar year and ``is_ttm=True``.
    """
    current_year = datetime.now().year
    for ticker, filepath in _iter_ticker_files(metrics_dir, '_calculated_metrics.json'):
        try:
            metrics = _load_json(filepath)
            if db.insert_financial_metrics(ticker, current_year, metrics, is_ttm=True):
                stats['metrics'] += 1
                print(f" ✓ {ticker}: {len(metrics)} metrics")
        except Exception as e:
            # Best effort: one bad file must not stop the remaining imports.
            print(f" ✗ {ticker}: {e}")


def _import_news(db, stats, news_dir, suffix, url_key, article_label):
    """Insert news articles and press releases from every ``*<suffix>`` file.

    ``url_key`` is the per-item field that holds the URL and ``article_label``
    is the wording used for articles in the per-ticker progress line.
    """
    for ticker, filepath in _iter_ticker_files(news_dir, suffix):
        try:
            data = _load_json(filepath)

            # ``or []`` tolerates an explicit JSON null for either list.
            articles = data.get('news_articles') or []
            _insert_news_items(db, stats, ticker, articles, url_key, '')

            prs = data.get('press_releases') or []
            _insert_news_items(db, stats, ticker, prs, url_key, 'Press Release')

            if articles or prs:
                print(f" ✓ {ticker}: {len(articles)} {article_label}, {len(prs)} PRs")
        except Exception as e:
            # Best effort: one bad file must not stop the remaining imports.
            print(f" ✗ {ticker}: {e}")


def _import_sec_filings(db, stats, sec_dir):
    """Insert filings and insider-ownership forms from ``*_sec_filings.json`` files."""
    for ticker, filepath in _iter_ticker_files(sec_dir, '_sec_filings.json'):
        try:
            data = _load_json(filepath)

            filings = data.get('filings') or []
            _insert_filings(
                db, stats, ticker, filings,
                'filing_date', 'form_type',
                lambda filing: filing.get('description', ''),
                'SEC EDGAR',
            )

            ownership = data.get('insider_ownership') or []
            _insert_filings(
                db, stats, ticker, ownership,
                'filing_date', 'form_type',
                lambda form: f"Insider Transaction - {form.get('owner', '')}",
                'SEC EDGAR - Ownership',
            )

            if filings or ownership:
                print(f" ✓ {ticker}: {len(filings)} filings, {len(ownership)} ownership")
        except Exception as e:
            # Best effort: one bad file must not stop the remaining imports.
            print(f" ✗ {ticker}: {e}")


def _import_sedar_filings(db, stats, sedar_dir):
    """Insert filings from every ``*_sedar_data.json`` file in ``sedar_dir``."""
    for ticker, filepath in _iter_ticker_files(sedar_dir, '_sedar_data.json'):
        try:
            data = _load_json(filepath)

            filings = data.get('filings') or []
            _insert_filings(
                db, stats, ticker, filings,
                'date', 'type',
                lambda filing: filing.get('title', ''),
                'SEDAR+',
            )

            if filings:
                print(f" ✓ {ticker}: {len(filings)} SEDAR+ filings")
        except Exception as e:
            # Best effort: one bad file must not stop the remaining imports.
            print(f" ✗ {ticker}: {e}")


def populate_from_existing_data(db=None, data_root="data"):
    """Read all existing JSON files and populate the database.

    Walks the ``metrics``, ``news``, ``serpapi_news``, ``sec_filings`` and
    ``sedar_filings`` sub-directories of ``data_root``, inserting the contents
    of each recognised JSON file and printing progress as it goes. A file that
    cannot be read or inserted is reported and skipped.

    Args:
        db: Object providing ``insert_financial_metrics``,
            ``insert_news_article``, ``insert_filing`` and ``close``. When
            omitted, a ``StockDatabase()`` is created and closed by this
            function; a caller-supplied ``db`` is left open.
        data_root: Directory containing the per-category data folders.

    Returns:
        dict with the keys ``'metrics'``, ``'news'`` and ``'filings'`` holding
        the number of records counted as inserted for each category.
    """
    print("\n" + "=" * 70)
    print("POPULATING DATABASE FROM EXISTING JSON FILES")
    print("=" * 70)

    owns_db = db is None
    if owns_db:
        db = StockDatabase()

    stats = {
        'metrics': 0,
        'news': 0,
        'filings': 0,
    }

    try:
        # 1. Import calculated metrics
        print("\n📊 Importing financial metrics...")
        _import_metrics(db, stats, os.path.join(data_root, "metrics"))

        # 2. Import news articles (from both regular scraping and SerpAPI);
        #    the two sources differ only in filename suffix and URL field.
        print("\n📰 Importing news articles...")
        _import_news(db, stats, os.path.join(data_root, "news"),
                     '_news_pr.json', 'url', 'articles')
        _import_news(db, stats, os.path.join(data_root, "serpapi_news"),
                     '_serpapi.json', 'link', 'SerpAPI articles')

        # 3. Import SEC filings
        print("\n📄 Importing SEC EDGAR filings...")
        _import_sec_filings(db, stats, os.path.join(data_root, "sec_filings"))

        # 4. Import SEDAR+ filings
        print("\n📄 Importing SEDAR+ filings...")
        _import_sedar_filings(db, stats, os.path.join(data_root, "sedar_filings"))

        # Print final stats
        print("\n" + "=" * 70)
        print("DATABASE POPULATION COMPLETE")
        print("=" * 70)
        print(f"Financial metrics inserted: {stats['metrics']}")
        print(f"News articles inserted: {stats['news']}")
        print(f"Filings inserted: {stats['filings']}")
        print("=" * 70)
    finally:
        # Release the connection even if a directory scan fails part-way,
        # but only when this function created it.
        if owns_db:
            db.close()

    return stats
|
|
|
|
|
|
# Script entry point: run the import only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    populate_from_existing_data()
|