feat: Implement stock listing extraction and database population
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
This commit is contained in:
+275
@@ -0,0 +1,275 @@
|
||||
"""
|
||||
Example analysis script - What you can do with the collected data
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
class StockAnalyzer:
    """Console reporting helper over the collected stock data.

    Reads the ``stocks_master`` and ``coverage_report`` tables of a SQLite
    database plus per-ticker JSON files stored under ``<data_dir>/news``,
    ``<data_dir>/financials`` and ``<data_dir>/reports``. Every public
    method prints its result to stdout and returns ``None``.

    The instance can be used as a context manager so the connection is
    closed even when a report raises::

        with StockAnalyzer() as analyzer:
            analyzer.get_summary_stats()
    """

    def __init__(self, db_path="data/stocks.db", data_dir="data"):
        """Open the database.

        Args:
            db_path: Path of the SQLite database file.
            data_dir: Directory holding the ``news``, ``financials`` and
                ``reports`` sub-directories. Defaults to ``"data"``, the
                location that was previously hard-coded.
        """
        self.data_dir = data_dir
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow the exception that ended the ``with`` body.
        return False

    @staticmethod
    def _print_banner(title):
        """Print *title* framed by two 70-character rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    @staticmethod
    def _load_json(path):
        """Return the JSON object stored at *path*, or ``None`` if unusable.

        A missing, undecodable, malformed or non-object file is reported on
        stdout instead of raising, so one bad file cannot abort a report.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f" Skipping unreadable file {path}: {exc}")
            return None
        if not isinstance(data, dict):
            print(f" Skipping {path}: expected a JSON object")
            return None
        return data

    @staticmethod
    def _items(data, key):
        """Return ``data[key]`` when it is a list, otherwise an empty list."""
        value = data.get(key)
        return value if isinstance(value, list) else []

    def get_summary_stats(self):
        """Get overall statistics"""
        self._print_banner("DATABASE SUMMARY STATISTICS")

        # Total stocks
        self.cursor.execute("SELECT COUNT(*) FROM stocks_master")
        total_stocks = self.cursor.fetchone()[0]
        print(f"\nTotal stocks tracked: {total_stocks}")

        # By exchange
        self.cursor.execute("""
            SELECT exchange, COUNT(*) as count
            FROM stocks_master
            GROUP BY exchange
            ORDER BY count DESC
        """)
        print("\nStocks by exchange:")
        for exchange, count in self.cursor.fetchall():
            print(f" {exchange}: {count}")

        # By sector (if available)
        self.cursor.execute("""
            SELECT sector, COUNT(*) as count
            FROM stocks_master
            WHERE sector IS NOT NULL AND sector != ''
            GROUP BY sector
            ORDER BY count DESC
            LIMIT 10
        """)
        sectors = self.cursor.fetchall()
        if sectors:
            print("\nTop 10 sectors:")
            for sector, count in sectors:
                print(f" {sector}: {count}")

        # Coverage stats
        self.cursor.execute("""
            SELECT
                SUM(CASE WHEN has_financials = 1 THEN 1 ELSE 0 END) as with_financials,
                SUM(CASE WHEN has_news = 1 THEN 1 ELSE 0 END) as with_news,
                SUM(CASE WHEN has_press_releases = 1 THEN 1 ELSE 0 END) as with_pr,
                SUM(CASE WHEN has_financials = 1 AND has_news = 1 AND has_press_releases = 1 THEN 1 ELSE 0 END) as complete
            FROM coverage_report
        """)
        # SUM() over an empty table is NULL; report 0 rather than "None".
        fin, news, pr, complete = (value or 0 for value in self.cursor.fetchone())

        print("\nData coverage:")
        print(f" Stocks with financials: {fin}")
        print(f" Stocks with news: {news}")
        print(f" Stocks with press releases: {pr}")
        print(f" Fully covered stocks: {complete}")

    def find_recent_news_activity(self, limit=20):
        """Rank tickers by stored news articles plus press releases.

        Args:
            limit: Maximum number of rows to print.
        """
        self._print_banner(f"TOP {limit} STOCKS BY NEWS ACTIVITY")

        # Load news files and count articles
        news_dir = f"{self.data_dir}/news"
        if not os.path.isdir(news_dir):
            print("No news data available yet")
            return

        suffix = '_news_pr.json'
        stock_news_count = []

        for filename in os.listdir(news_dir):
            if not filename.endswith(suffix):
                continue
            data = self._load_json(os.path.join(news_dir, filename))
            if data is None:
                continue

            news_count = len(self._items(data, 'news_articles'))
            pr_count = len(self._items(data, 'press_releases'))
            if news_count > 0 or pr_count > 0:
                stock_news_count.append({
                    # Strip only the trailing suffix to recover the ticker.
                    'ticker': filename[:-len(suffix)],
                    'news': news_count,
                    'pr': pr_count,
                    'total': news_count + pr_count,
                })

        # Sort by total; the ticker tie-break keeps the listing stable
        # regardless of the order os.listdir() happens to return.
        stock_news_count.sort(key=lambda item: (-item['total'], item['ticker']))

        print(f"\n{'Ticker':<10} {'News':<10} {'PR':<10} {'Total':<10}")
        print("-" * 40)
        for stock in stock_news_count[:limit]:
            print(f"{stock['ticker']:<10} {stock['news']:<10} {stock['pr']:<10} {stock['total']:<10}")

    def find_stocks_by_sector(self, sector):
        """Find all stocks whose sector contains *sector* (SQL ``LIKE``)."""
        self._print_banner(f"STOCKS IN SECTOR: {sector.upper()}")

        self.cursor.execute("""
            SELECT symbol, company_name, exchange
            FROM stocks_master
            WHERE sector LIKE ?
            ORDER BY symbol
        """, (f"%{sector}%",))

        stocks = self.cursor.fetchall()

        if stocks:
            print(f"\nFound {len(stocks)} stocks:")
            for symbol, name, exchange in stocks:
                # NULL columns arrive as None, and None rejects a width
                # spec such as ``:<50``; substitute printable text first.
                symbol_text = '' if symbol is None else symbol
                name_text = 'N/A' if name is None else name
                print(f" {symbol_text:<8} {name_text:<50} [{exchange}]")
        else:
            print(f"\nNo stocks found in sector: {sector}")

    def get_stock_report(self, ticker):
        """Print everything known about *ticker*.

        Combines the ``stocks_master`` row, the ``coverage_report`` flags
        and, when the files exist, the ticker's financial JSON, news JSON
        and the location of its text report.
        """
        self._print_banner(f"STOCK REPORT: {ticker}")

        # Get basic info
        self.cursor.execute("""
            SELECT company_name, exchange, sector, industry, listing_date
            FROM stocks_master
            WHERE symbol = ?
        """, (ticker,))

        result = self.cursor.fetchone()
        if not result:
            print(f"\nStock {ticker} not found in database")
            return

        name, exchange, sector, industry, listing_date = result

        print(f"\nCompany: {name}")
        print(f"Exchange: {exchange}")
        if sector:
            print(f"Sector: {sector}")
        if industry:
            print(f"Industry: {industry}")
        if listing_date:
            print(f"Listing Date: {listing_date}")

        # Check coverage
        self.cursor.execute("""
            SELECT has_financials, has_news, has_press_releases
            FROM coverage_report
            WHERE ticker = ?
        """, (ticker,))

        coverage = self.cursor.fetchone()
        if coverage:
            has_fin, has_news, has_pr = coverage
            print(f"\nData Coverage:")
            print(f" Financials: {'✅' if has_fin else '❌'}")
            print(f" News: {'✅' if has_news else '❌'}")
            print(f" Press Releases: {'✅' if has_pr else '❌'}")

        # Load financial data if available
        fin_file = f"{self.data_dir}/financials/{ticker}_yahoo.json"
        if os.path.exists(fin_file):
            print(f"\nFinancial Data: (see {fin_file})")
            data = self._load_json(fin_file)
            # "profile" may be absent or null in the stored JSON.
            profile = data.get('profile') if data is not None else None
            if isinstance(profile, dict) and profile.get('current_price'):
                print(f" Current Price: ${profile['current_price']}")

        # Load news if available
        news_file = f"{self.data_dir}/news/{ticker}_news_pr.json"
        if os.path.exists(news_file):
            data = self._load_json(news_file)
            if data is not None:
                articles = self._items(data, 'news_articles')
                news_count = len(articles)
                pr_count = len(self._items(data, 'press_releases'))
                print(f"\nNews & Press Releases:")
                print(f" News articles: {news_count}")
                print(f" Press releases: {pr_count}")

                if news_count > 0:
                    print(f"\n Recent news:")
                    # First three entries as stored; the file order is
                    # presumably newest-first -- confirm with the scraper.
                    for article in articles[:3]:
                        title = article.get('title', 'N/A') if isinstance(article, dict) else 'N/A'
                        print(f" - {title}")

        # Check if report exists
        report_file = f"{self.data_dir}/reports/{ticker}_report.txt"
        if os.path.exists(report_file):
            print(f"\nFull report available at: {report_file}")

    def export_to_csv(self, output_file="stock_list.csv"):
        """Export every stock with its coverage flags to a CSV file.

        Args:
            output_file: Destination path; an existing file is overwritten.
        """
        self._print_banner(f"EXPORTING TO CSV: {output_file}")

        self.cursor.execute("""
            SELECT s.symbol, s.company_name, s.exchange, s.sector, s.industry,
                   c.has_financials, c.has_news, c.has_press_releases
            FROM stocks_master s
            LEFT JOIN coverage_report c ON s.symbol = c.ticker
            ORDER BY s.symbol
        """)
        rows = self.cursor.fetchall()

        # Function-scope import: csv is needed only by this method.
        import csv
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Symbol', 'Company', 'Exchange', 'Sector', 'Industry',
                             'Has Financials', 'Has News', 'Has PR'])
            writer.writerows(rows)

        print(f"\n✅ Exported to {output_file}")

    def close(self):
        """Close the database connection."""
        self.conn.close()
|
||||
|
||||
|
||||
def main():
    """Run the example analyses against ``data/stocks.db``.

    Prints a hint and returns early when the database file does not exist.
    Otherwise prints the summary statistics and the ten most news-active
    stocks, then a short list of ideas for extending the script.
    """
    print("\n" + "=" * 70)
    print("STOCK DATA ANALYSIS - EXAMPLES")
    print("=" * 70)

    # Check if database exists
    if not os.path.exists("data/stocks.db"):
        print("\n❌ Database not found!")
        print(" Run 'python main.py' first to collect data")
        return

    analyzer = StockAnalyzer()
    try:
        # Example 1: Get summary statistics
        analyzer.get_summary_stats()

        # Example 2: Find most active stocks (by news)
        analyzer.find_recent_news_activity(limit=10)

        # Example 3: Find stocks in a sector
        # analyzer.find_stocks_by_sector("Technology")

        # Example 4: Get report for specific stock
        # analyzer.get_stock_report("ABC")

        # Example 5: Export to CSV
        # analyzer.export_to_csv("my_stocks.csv")
    finally:
        # Release the SQLite connection even if one of the reports raises.
        analyzer.close()

    print("\n" + "=" * 70)
    print("ANALYSIS COMPLETE")
    print("=" * 70)
    print("\nYou can modify this script to:")
    print(" - Filter stocks by criteria (P/E, market cap, etc.)")
    print(" - Find stocks with specific keywords in news")
    print(" - Compare stocks within sectors")
    print(" - Track changes over time")
    print(" - Generate custom reports")


# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user