feat: Implement stock listing extraction and database population
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
This commit is contained in:
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
Extract stock listings from TSX, TSXV, CSE, and CBOE.

Uses Playwright to handle JavaScript-rendered content.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from playwright.async_api import async_playwright
|
||||
import time
|
||||
|
||||
class StockListingExtractor:
    """Scrape exchange listing directories and persist them as JSON.

    Each ``extract_*`` coroutine opens a Chromium page through Playwright,
    reads the leading cells of every row it finds, writes the rendered HTML
    next to the results for debugging, and returns the parsed rows as a list
    of dicts.  A failed scrape is reported on stdout and yields whatever was
    collected so far (possibly an empty list) instead of raising, so one
    broken exchange does not abort ``extract_all``.
    """

    # Time limits (milliseconds) for navigation and for the first listing
    # element to appear.
    NAVIGATION_TIMEOUT_MS = 90000
    SELECTOR_TIMEOUT_MS = 45000
    # Extra pause (seconds) after load so late JavaScript can fill the table.
    SETTLE_SECONDS = 8

    def __init__(self, output_dir="data/listings", headless=False):
        """Remember the output location and make sure it exists.

        Args:
            output_dir: Directory that receives the JSON and HTML files.
            headless: Forwarded to ``chromium.launch``.  Defaults to
                ``False`` (a visible browser window), which is the value
                that used to be hard-coded.
        """
        self.output_dir = output_dir
        self.headless = headless
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _clean_symbol(text):
        """Strip a ticker cell and drop line breaks embedded in it."""
        return text.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')

    @staticmethod
    def _clean_text(text):
        """Strip a descriptive cell and turn embedded line breaks into spaces."""
        return text.strip().replace('\n', ' ').replace('\r', ' ')

    async def _parse_row(self, row, *, cell_selector, min_cells, extra_field,
                         exchange, country):
        """Turn one row element into a listing dict, or ``None`` to skip it.

        A row is skipped when it has fewer than ``min_cells`` cells or when
        its first cell is blank after cleaning (a listing without a symbol
        cannot be used downstream).
        """
        cells = await row.query_selector_all(cell_selector)
        if len(cells) < min_cells:
            return None

        symbol = self._clean_symbol(await cells[0].inner_text())
        if not symbol:
            return None

        record = {
            'symbol': symbol,
            'name': self._clean_text(await cells[1].inner_text()),
        }
        if extra_field is not None:
            # The third column is optional for exchanges with min_cells == 2.
            extra = await cells[2].inner_text() if len(cells) > 2 else ""
            record[extra_field] = self._clean_text(extra)
        record['exchange'] = exchange
        record['country'] = country
        record['extracted_at'] = datetime.now().isoformat()
        return record

    async def _extract_exchange(self, *, exchange, country, url, row_selector,
                                html_name, json_name, found_suffix,
                                wait_message="⏳ Waiting for listings to load...",
                                wait_selector=None, cell_selector='td',
                                min_cells=2, extra_field=None):
        """Scrape one listing page; shared by every public ``extract_*``.

        Args:
            exchange: Label stored in each record and used in messages.
            country: Country stored in each record.
            url: Listing page to open.
            row_selector: CSS selector for candidate rows.
            html_name: File name (inside ``output_dir``) for the page dump.
            json_name: File name (inside ``output_dir``) for the listings.
            found_suffix: Noun printed after the row count.
            wait_message: Progress line printed once navigation finished.
            wait_selector: Optional selector to await before reading rows.
            cell_selector: CSS selector for the cells inside a row.
            min_cells: Rows with fewer cells are ignored.
            extra_field: Key for the third column, or ``None`` to omit it.

        Returns:
            The list of parsed listing dicts.  Always a list: the list is
            created before any browser work so that a timeout or launch
            failure returns what was gathered instead of raising.
        """
        listings = []
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=self.headless)
                try:
                    page = await browser.new_page()
                    await page.goto(url, wait_until='networkidle',
                                    timeout=self.NAVIGATION_TIMEOUT_MS)

                    print(wait_message)
                    if wait_selector:
                        await page.wait_for_selector(
                            wait_selector, timeout=self.SELECTOR_TIMEOUT_MS)
                    await asyncio.sleep(self.SETTLE_SECONDS)

                    rows = await page.query_selector_all(row_selector)
                    if rows:
                        print(f"✅ Found {len(rows)} {found_suffix}")
                    for row in rows:
                        try:
                            record = await self._parse_row(
                                row,
                                cell_selector=cell_selector,
                                min_cells=min_cells,
                                extra_field=extra_field,
                                exchange=exchange,
                                country=country,
                            )
                        except Exception as e:
                            # One malformed row must not cost the whole page.
                            print(f"Error parsing row: {e}")
                            continue
                        if record is not None:
                            listings.append(record)

                    # Keep the rendered page so selectors can be fixed offline.
                    html_content = await page.content()
                    with open(f"{self.output_dir}/{html_name}", 'w', encoding='utf-8') as f:
                        f.write(html_content)

                    if listings:
                        output_file = f"{self.output_dir}/{json_name}"
                        with open(output_file, 'w', encoding='utf-8') as f:
                            json.dump(listings, f, indent=2)
                        print(f"✅ Saved {len(listings)} {exchange} listings to {output_file}")
                    else:
                        print(f"⚠️ No listings found. Check {html_name} for debugging")
                finally:
                    await browser.close()
        except Exception as e:
            # Top-level boundary for this exchange: report and carry on.
            print(f"❌ Error extracting {exchange}: {e}")

        return listings

    async def extract_tsx_tsxv_listings(self):
        """Extract TSX and TSXV listings (symbol, name, sector)."""
        print("🔍 Extracting TSX/TSXV listings...")
        return await self._extract_exchange(
            exchange='TSX/TSXV',
            country='Canada',
            url='https://www.tsx.com/listings/listing-with-us/listed-company-directory',
            wait_message="⏳ Waiting for table to load...",
            wait_selector='table, .company-list, [class*="listing"]',
            row_selector='tbody tr',
            found_suffix="rows in table",
            min_cells=3,
            extra_field='sector',
            html_name="tsx_page.html",
            json_name="tsx_tsxv_listings.json",
        )

    async def extract_cse_listings(self):
        """Extract CSE listings (symbol, name, industry when present)."""
        print("\n🔍 Extracting CSE listings...")
        return await self._extract_exchange(
            exchange='CSE',
            country='Canada',
            url='https://thecse.com/en/listings',
            wait_selector='table, [class*="listing"], [class*="company"]',
            row_selector='tbody tr, [role="row"]',
            cell_selector='td, [role="cell"]',
            found_suffix="rows",
            extra_field='industry',
            html_name="cse_page.html",
            json_name="cse_listings.json",
        )

    async def extract_cboe_listings(self):
        """Extract CBOE listings (symbol and name only)."""
        print("\n🔍 Extracting CBOE listings...")
        return await self._extract_exchange(
            exchange='CBOE',
            country='USA',
            url='https://www.cboe.com/us/equities/listings/',
            row_selector='tbody tr, [class*="listing"]',
            found_suffix="potential listings",
            html_name="cboe_page.html",
            json_name="cboe_listings.json",
        )

    async def extract_all(self):
        """Extract from all exchanges and write one combined JSON file.

        Returns:
            Every listing from TSX/TSXV, CSE and CBOE, in that order.  The
            combined file is only written when at least one listing exists.
        """
        print("=" * 60)
        print("STOCK LISTING EXTRACTION")
        print("=" * 60)

        all_listings = {}

        # Exchanges are scraped one after another, each in its own browser.
        all_listings['tsx_tsxv'] = await self.extract_tsx_tsxv_listings()
        all_listings['cse'] = await self.extract_cse_listings()
        all_listings['cboe'] = await self.extract_cboe_listings()

        combined = []
        for listings in all_listings.values():
            combined.extend(listings)

        if combined:
            output_file = f"{self.output_dir}/all_listings_combined.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(combined, f, indent=2)

            print("\n" + "=" * 60)
            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
            print(f"📁 Saved to: {output_file}")
            print("=" * 60)
        else:
            print("\n⚠️ No listings extracted. Check the HTML files for debugging.")

        return combined
|
||||
|
||||
|
||||
async def main():
    """Run the extraction for every exchange and print a short preview."""
    stocks = await StockListingExtractor().extract_all()
    if not stocks:
        return

    print("\n📊 Sample of extracted stocks:")
    # Only the first five entries are shown as a sanity check.
    for entry in stocks[:5]:
        symbol, name, exchange = entry['symbol'], entry['name'], entry['exchange']
        print(f" - {symbol}: {name} ({exchange})")


# Script entry point: run the coroutine on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user