Files
microcap_scrapping/populate_database.py
T
Aherobo Ovie Victor 80ee708348 feat: Implement stock listing extraction and database population
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
2025-11-06 12:34:01 +01:00

236 lines
9.5 KiB
Python

"""
Populate the database from existing JSON data.

This script reads the existing JSON files under data/ and inserts their
contents into the database.
"""
import os
import json
from datetime import datetime
from database import StockDatabase
def _load_json(filepath):
    """Return the parsed JSON document stored at ``filepath``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def _iter_ticker_files(directory, suffix):
    """Yield ``(ticker, filepath)`` for each file in ``directory`` ending in ``suffix``.

    The ticker is the file name with the trailing ``suffix`` removed. Nothing
    is yielded when ``directory`` is missing or is not a directory.
    """
    # isdir (rather than exists) so a regular file sitting at this path is
    # skipped instead of making os.listdir raise.
    if not os.path.isdir(directory):
        return
    # os.listdir returns names in arbitrary order; sort so runs are repeatable.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(suffix):
            # Slice off only the trailing suffix; str.replace would also
            # remove any earlier occurrence of the same text.
            yield filename[:-len(suffix)], os.path.join(directory, filename)


def _import_metrics(db, stats, directory="data/metrics"):
    """Insert every ``*_calculated_metrics.json`` file found in ``directory``.

    ``stats['metrics']`` is incremented once per file whose insert returned a
    truthy value.
    """
    # Loop-invariant: every file is stored under the year of this run.
    current_year = datetime.now().year
    for ticker, filepath in _iter_ticker_files(directory, '_calculated_metrics.json'):
        # Best effort per file: one unreadable file must not abort the import.
        try:
            metrics = _load_json(filepath)
            if db.insert_financial_metrics(ticker, current_year, metrics, is_ttm=True):
                stats['metrics'] += 1
                print(f"{ticker}: {len(metrics)} metrics")
        except Exception as e:
            print(f"{ticker}: {e}")


def _insert_news_items(db, stats, ticker, items, url_key, default_source=''):
    """Insert each record of ``items`` through ``db.insert_news_article``.

    ``url_key`` names the record field that holds the link, and
    ``default_source`` is used when a record has no ``source`` field.
    ``stats['news']`` is incremented in place for every truthy insert result,
    so a failure part-way through keeps the count of what was already stored.
    """
    for item in items:
        inserted = db.insert_news_article(
            ticker=ticker,
            title=item.get('title', ''),
            source=item.get('source', default_source),
            published_date=item.get('date', ''),
            url=item.get(url_key, ''),
            snippet=item.get('snippet', ''),
        )
        if inserted:
            stats['news'] += 1


def _import_news(db, stats, directory, suffix, *, url_key, article_label):
    """Insert the articles and press releases of every ``*suffix`` file in ``directory``.

    ``article_label`` is the wording used for the article count in the
    per-ticker progress line.
    """
    for ticker, filepath in _iter_ticker_files(directory, suffix):
        try:
            data = _load_json(filepath)
            # "or []" also covers a key that is present but null in the JSON.
            articles = data.get('news_articles') or []
            _insert_news_items(db, stats, ticker, articles, url_key)
            prs = data.get('press_releases') or []
            _insert_news_items(
                db, stats, ticker, prs, url_key, default_source='Press Release'
            )
            if articles or prs:
                print(f"{ticker}: {len(articles)} {article_label}, {len(prs)} PRs")
        except Exception as e:
            print(f"{ticker}: {e}")


def _insert_filing_records(db, stats, ticker, records, *, source, date_key,
                           type_key, title_of):
    """Insert each record of ``records`` through ``db.insert_filing``.

    ``date_key`` and ``type_key`` name the record fields to read, and
    ``title_of`` is a callable that builds the title from one record.
    """
    for record in records:
        db.insert_filing(
            ticker=ticker,
            filing_date=record.get(date_key, ''),
            filing_type=record.get(type_key, ''),
            title=title_of(record),
            document_url=record.get('url', ''),
            source=source,
        )
        # NOTE(review): unlike the news and metrics inserts, the result of
        # insert_filing is not checked, so this counts attempted inserts.
        # Confirm whether insert_filing reports success before changing it.
        stats['filings'] += 1


def _import_sec_filings(db, stats, directory="data/sec_filings"):
    """Insert filings and insider-ownership forms from ``*_sec_filings.json`` files."""
    for ticker, filepath in _iter_ticker_files(directory, '_sec_filings.json'):
        try:
            data = _load_json(filepath)
            filings = data.get('filings') or []
            _insert_filing_records(
                db, stats, ticker, filings,
                source='SEC EDGAR',
                date_key='filing_date',
                type_key='form_type',
                title_of=lambda filing: filing.get('description', ''),
            )
            ownership = data.get('insider_ownership') or []
            _insert_filing_records(
                db, stats, ticker, ownership,
                source='SEC EDGAR - Ownership',
                date_key='filing_date',
                type_key='form_type',
                title_of=lambda form: f"Insider Transaction - {form.get('owner', '')}",
            )
            if filings or ownership:
                print(f"{ticker}: {len(filings)} filings, {len(ownership)} ownership")
        except Exception as e:
            print(f"{ticker}: {e}")


def _import_sedar_filings(db, stats, directory="data/sedar_filings"):
    """Insert filings from every ``*_sedar_data.json`` file in ``directory``."""
    for ticker, filepath in _iter_ticker_files(directory, '_sedar_data.json'):
        try:
            data = _load_json(filepath)
            filings = data.get('filings') or []
            _insert_filing_records(
                db, stats, ticker, filings,
                source='SEDAR+',
                date_key='date',
                type_key='type',
                title_of=lambda filing: filing.get('title', ''),
            )
            if filings:
                print(f"{ticker}: {len(filings)} SEDAR+ filings")
        except Exception as e:
            print(f"{ticker}: {e}")


def populate_from_existing_data():
    """Read the existing JSON files under ``data/`` and insert them into the database.

    Four sources are processed in order: calculated metrics
    (``data/metrics``), news and press releases (``data/news`` and
    ``data/serpapi_news``), SEC EDGAR filings (``data/sec_filings``) and
    SEDAR+ filings (``data/sedar_filings``). Missing directories are skipped.
    A file that cannot be read or inserted is reported on stdout and the
    import continues with the next file. A summary of the counts is printed
    at the end.

    Returns:
        None. Progress and totals are written to stdout.
    """
    print("\n" + "=" * 70)
    print("POPULATING DATABASE FROM EXISTING JSON FILES")
    print("=" * 70)

    db = StockDatabase()
    stats = {
        'metrics': 0,
        'news': 0,
        'filings': 0,
    }

    # try/finally so the database handle is closed even if a step raises
    # outside the per-file error handling.
    try:
        # 1. Import calculated metrics
        print("\n📊 Importing financial metrics...")
        _import_metrics(db, stats)

        # 2. Import news articles (from both regular scraping and SerpAPI).
        # The two sources differ only in directory, file suffix and the name
        # of the field that carries the link.
        print("\n📰 Importing news articles...")
        _import_news(
            db, stats, "data/news", '_news_pr.json',
            url_key='url', article_label='articles',
        )
        _import_news(
            db, stats, "data/serpapi_news", '_serpapi.json',
            url_key='link', article_label='SerpAPI articles',
        )

        # 3. Import SEC filings
        print("\n📄 Importing SEC EDGAR filings...")
        _import_sec_filings(db, stats)

        # 4. Import SEDAR+ filings
        print("\n📄 Importing SEDAR+ filings...")
        _import_sedar_filings(db, stats)

        # Print final stats
        print("\n" + "=" * 70)
        print("DATABASE POPULATION COMPLETE")
        print("=" * 70)
        print(f"Financial metrics inserted: {stats['metrics']}")
        print(f"News articles inserted: {stats['news']}")
        print(f"Filings inserted: {stats['filings']}")
        print("=" * 70)
    finally:
        db.close()
# Script entry point: run the import only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    populate_from_existing_data()