80ee708348
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright. - Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation. - Developed `populate_database.py` to populate the database with existing JSON data. - Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks. - Added `setup.py` for initial setup and testing of the system. - Created `watchlist.txt` template for user-defined stock tracking. - Generated `final_test_output.txt` to log the results of the test run.
266 lines
12 KiB
Python
266 lines
12 KiB
Python
"""
|
|
Extract stock listings from TSX, TSXV, CSE, and CBOE
|
|
Uses Playwright to handle JavaScript-rendered content
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from datetime import datetime
|
|
from playwright.async_api import async_playwright
|
|
import time
|
|
|
|
class StockListingExtractor:
    """Scrape exchange listing pages with Playwright and store them as JSON.

    Each public ``extract_*`` coroutine launches its own Chromium browser,
    loads one listing page, reads the rendered rows, writes the page HTML
    (for debugging) and - when any rows were parsed - a JSON file into
    ``output_dir``, and returns the parsed listings as a list of dicts.
    """

    def __init__(self, output_dir="data/listings"):
        """Remember the output directory and create it if it is missing.

        Args:
            output_dir: Directory that receives the HTML dumps and JSON files.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _clean_symbol(raw):
        """Strip a ticker cell and drop embedded newlines; tabs become spaces."""
        return raw.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')

    @staticmethod
    def _clean_text(raw):
        """Strip a text cell and turn embedded line breaks into spaces."""
        return raw.strip().replace('\n', ' ').replace('\r', ' ')

    def _write_json(self, records, filename):
        """Write ``records`` as indented JSON into the output directory.

        Returns:
            The path of the file that was written.
        """
        output_file = f"{self.output_dir}/{filename}"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(records, f, indent=2)
        return output_file

    async def _parse_rows(self, rows, *, cell_selector, min_cells, exchange,
                          country, extra_key=None):
        """Convert row element handles into listing dicts.

        Args:
            rows: Row element handles returned by ``query_selector_all``.
            cell_selector: Selector used to find the cells inside a row.
            min_cells: Rows with fewer cells than this are skipped.
            exchange: Value stored under the ``'exchange'`` key.
            country: Value stored under the ``'country'`` key.
            extra_key: Optional key under which the third cell is stored
                (an empty string is stored when the row has no third cell).

        Returns:
            A list with one dict per successfully parsed row.
        """
        parsed = []
        for row in rows:
            try:
                cells = await row.query_selector_all(cell_selector)
                if len(cells) < min_cells:
                    continue

                record = {
                    'symbol': self._clean_symbol(await cells[0].inner_text()),
                    'name': self._clean_text(await cells[1].inner_text()),
                }
                if extra_key is not None:
                    extra = await cells[2].inner_text() if len(cells) > 2 else ""
                    record[extra_key] = self._clean_text(extra)
                record['exchange'] = exchange
                record['country'] = country
                record['extracted_at'] = datetime.now().isoformat()
                parsed.append(record)
            except Exception as e:
                # Best effort: one bad row must not abort the whole page, but
                # the failure is reported instead of being silently dropped.
                print(f"Error parsing row: {e}")
        return parsed

    async def _extract(self, *, label, url, intro, waiting, found_suffix,
                       row_selector, cell_selector, min_cells, country,
                       html_name, json_name, wait_selector=None,
                       extra_key=None):
        """Shared scraping flow behind every public ``extract_*`` method.

        Args:
            label: Exchange label used in messages and as the ``'exchange'``
                value of every listing.
            url: Listing page to load.
            intro: Message printed before the browser is started.
            waiting: Message printed once navigation has finished.
            found_suffix: Text appended to the "Found N" message.
            row_selector: Selector matching the rows to parse.
            cell_selector: Selector matching the cells inside a row.
            min_cells: Minimum number of cells a row needs to be parsed.
            country: Value stored under the ``'country'`` key.
            html_name: File name of the HTML dump inside ``output_dir``.
            json_name: File name of the JSON output inside ``output_dir``.
            wait_selector: Optional selector to wait for before parsing.
            extra_key: Optional key for the third column (see ``_parse_rows``).

        Returns:
            The parsed listings; an empty list when nothing was parsed or
            when loading the page failed.
        """
        print(intro)

        # Bound before the ``try`` so the final ``return`` is valid even when
        # navigation or the selector wait fails before any row is read.
        listings = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                # Inside the ``try`` so the browser is closed if this fails.
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle', timeout=90000)

                print(waiting)
                if wait_selector is not None:
                    await page.wait_for_selector(wait_selector, timeout=45000)
                await asyncio.sleep(8)  # Extra wait for dynamic content

                rows = await page.query_selector_all(row_selector)
                if rows:
                    print(f"✅ Found {len(rows)} {found_suffix}")
                    listings.extend(await self._parse_rows(
                        rows,
                        cell_selector=cell_selector,
                        min_cells=min_cells,
                        exchange=label,
                        country=country,
                        extra_key=extra_key,
                    ))

                # Save HTML for manual inspection if needed
                html_content = await page.content()
                with open(f"{self.output_dir}/{html_name}", 'w', encoding='utf-8') as f:
                    f.write(html_content)

                if listings:
                    output_file = self._write_json(listings, json_name)
                    print(f"✅ Saved {len(listings)} {label} listings to {output_file}")
                else:
                    print(f"⚠️ No listings found. Check {html_name} for debugging")
            except Exception as e:
                # Per-exchange boundary: report the failure and still return
                # whatever was parsed so the other exchanges can proceed.
                print(f"❌ Error extracting {label}: {e}")
            finally:
                await browser.close()

        return listings

    async def extract_tsx_tsxv_listings(self):
        """Extract TSX and TSXV listings.

        Returns:
            A list of dicts with the keys ``symbol``, ``name``, ``sector``,
            ``exchange``, ``country`` and ``extracted_at``.
        """
        return await self._extract(
            label='TSX/TSXV',
            url='https://www.tsx.com/listings/listing-with-us/listed-company-directory',
            intro="🔍 Extracting TSX/TSXV listings...",
            waiting="⏳ Waiting for table to load...",
            found_suffix="rows in table",
            wait_selector='table, .company-list, [class*="listing"]',
            row_selector='tbody tr',
            cell_selector='td',
            min_cells=3,
            extra_key='sector',
            country='Canada',
            html_name="tsx_page.html",
            json_name="tsx_tsxv_listings.json",
        )

    async def extract_cse_listings(self):
        """Extract CSE listings.

        Returns:
            A list of dicts with the keys ``symbol``, ``name``, ``industry``,
            ``exchange``, ``country`` and ``extracted_at``.
        """
        return await self._extract(
            label='CSE',
            url='https://thecse.com/en/listings',
            intro="\n🔍 Extracting CSE listings...",
            waiting="⏳ Waiting for listings to load...",
            found_suffix="rows",
            wait_selector='table, [class*="listing"], [class*="company"]',
            row_selector='tbody tr, [role="row"]',
            cell_selector='td, [role="cell"]',
            min_cells=2,
            extra_key='industry',
            country='Canada',
            html_name="cse_page.html",
            json_name="cse_listings.json",
        )

    async def extract_cboe_listings(self):
        """Extract CBOE listings.

        Returns:
            A list of dicts with the keys ``symbol``, ``name``, ``exchange``,
            ``country`` and ``extracted_at``.
        """
        return await self._extract(
            label='CBOE',
            url='https://www.cboe.com/us/equities/listings/',
            intro="\n🔍 Extracting CBOE listings...",
            waiting="⏳ Waiting for listings to load...",
            found_suffix="potential listings",
            row_selector='tbody tr, [class*="listing"]',
            cell_selector='td',
            min_cells=2,
            country='USA',
            html_name="cboe_page.html",
            json_name="cboe_listings.json",
        )

    async def extract_all(self):
        """Extract from all exchanges, one after another, and merge the results.

        Returns:
            The listings of all exchanges in one list (TSX/TSXV, then CSE,
            then CBOE). The list is also written to
            ``all_listings_combined.json`` when it is not empty.
        """
        print("=" * 60)
        print("STOCK LISTING EXTRACTION")
        print("=" * 60)

        # Extract from each exchange (sequentially, in this order)
        all_listings = {
            'tsx_tsxv': await self.extract_tsx_tsxv_listings(),
            'cse': await self.extract_cse_listings(),
            'cboe': await self.extract_cboe_listings(),
        }

        # Combine all listings
        combined = [
            listing
            for listings in all_listings.values()
            for listing in listings
        ]

        # Save combined file
        if combined:
            output_file = self._write_json(combined, "all_listings_combined.json")

            print("\n" + "=" * 60)
            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
            print(f"📁 Saved to: {output_file}")
            print("=" * 60)
        else:
            print("\n⚠️ No listings extracted. Check the HTML files for debugging.")

        return combined
|
|
|
|
|
|
async def main():
    """Run the extraction for all exchanges and print up to five results."""
    stocks = await StockListingExtractor().extract_all()

    # Nothing was extracted, so there is no sample to show.
    if not stocks:
        return

    print(f"\n📊 Sample of extracted stocks:")
    preview = stocks[:5]
    for entry in preview:
        symbol, name, exchange = entry['symbol'], entry['name'], entry['exchange']
        print(f" - {symbol}: {name} ({exchange})")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: create the top-level coroutine and run it to
    # completion with asyncio.run.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)
|