Files
microcap_scrapping/extract_listings.py
T
Aherobo Ovie Victor 80ee708348 feat: Implement stock listing extraction and database population
- Added `extract_listings.py` for extracting stock listings from TSX, TSXV, CSE, and CBOE using Playwright.
- Created `main.py` to orchestrate the entire stock intelligence system, including extraction, database import, financial scraping, news scraping, and report generation.
- Developed `populate_database.py` to populate the database with existing JSON data.
- Introduced `scrape_nasdaq_tsx_only.py` for focused scraping of NASDAQ and TSX stocks.
- Added `setup.py` for initial setup and testing of the system.
- Created `watchlist.txt` template for user-defined stock tracking.
- Generated `final_test_output.txt` to log the results of the test run.
2025-11-06 12:34:01 +01:00

266 lines
12 KiB
Python

"""
Extract stock listings from TSX, TSXV, CSE, and CBOE
Uses Playwright to handle JavaScript-rendered content
"""
import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
import time
class StockListingExtractor:
    """Scrape exchange listing directories with Playwright and save them as JSON.

    Each ``extract_*`` method opens its own Chromium instance, loads one
    exchange's directory page, reads the rows it can find, writes the raw
    page HTML next to the results (for debugging selector problems) and
    returns the parsed records as a list of dicts.

    Args:
        output_dir: Directory that receives the JSON and HTML files. It is
            created on construction if it does not exist.
        headless: Passed to ``chromium.launch``. Defaults to ``False`` (a
            visible browser window), which is what this scraper has always
            used; pass ``True`` to run on a machine without a display.
    """

    # Timeouts are in milliseconds because that is what Playwright expects.
    NAVIGATION_TIMEOUT_MS = 90000
    SELECTOR_TIMEOUT_MS = 45000
    # Extra pause (seconds) after load so late dynamic content can render.
    SETTLE_SECONDS = 8

    def __init__(self, output_dir="data/listings", headless=False):
        self.output_dir = output_dir
        self.headless = headless
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _clean_symbol(raw):
        """Return a ticker with surrounding whitespace and line breaks removed."""
        return raw.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')

    @staticmethod
    def _clean_text(raw):
        """Return free text stripped, with line breaks turned into spaces."""
        return raw.strip().replace('\n', ' ').replace('\r', ' ')

    @classmethod
    def _build_record(cls, texts, exchange, country, extra_field=None):
        """Turn the raw cell texts of one row into a listing record.

        Args:
            texts: Cell texts in column order; index 0 is the symbol, index 1
                the name and (optionally) index 2 the extra column.
            exchange: Value stored under ``'exchange'``.
            country: Value stored under ``'country'``.
            extra_field: Key for the third column (``'sector'`` or
                ``'industry'``), or ``None`` to omit it. A missing third
                cell is stored as an empty string.

        Returns:
            dict: The record, with keys in the order the JSON files have
            always used.
        """
        record = {
            'symbol': cls._clean_symbol(texts[0]),
            'name': cls._clean_text(texts[1]),
        }
        if extra_field is not None:
            extra_raw = texts[2] if len(texts) > 2 else ""
            record[extra_field] = cls._clean_text(extra_raw)
        record['exchange'] = exchange
        record['country'] = country
        record['extracted_at'] = datetime.now().isoformat()
        return record

    def _write_json(self, filename, payload):
        """Write ``payload`` as indented JSON into the output directory and return the path."""
        path = f"{self.output_dir}/{filename}"
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)
        return path

    async def _parse_rows(self, rows, cell_selector, min_cells, exchange, country, extra_field):
        """Convert row element handles into listing records.

        Rows with fewer than ``min_cells`` cells are skipped. A row that
        raises while being read is reported and skipped so one bad row does
        not lose the rest of the table.
        """
        wanted = 2 if extra_field is None else 3
        records = []
        for row in rows:
            try:
                cells = await row.query_selector_all(cell_selector)
                if len(cells) < min_cells:
                    continue
                texts = [await cell.inner_text() for cell in cells[:wanted]]
                records.append(self._build_record(texts, exchange, country, extra_field))
            except Exception as e:
                print(f"Error parsing row: {e}")
        return records

    async def _scrape_directory(self, *, exchange, country, url, wait_message,
                                wait_selector, row_selector, cell_selector,
                                min_cells, found_noun, extra_field,
                                html_file, listings_file):
        """Load one directory page, parse its rows and persist the results.

        Args:
            exchange: Exchange label used in records and in messages.
            country: Country stored in every record.
            url: Directory page to open.
            wait_message: Progress line printed once navigation has finished.
            wait_selector: CSS selector to wait for before parsing, or
                ``None`` to skip the wait.
            row_selector: CSS selector matching one element per listing.
            cell_selector: CSS selector matching the cells inside a row.
            min_cells: Minimum number of cells a row needs to be used.
            found_noun: Wording after the row count in the "Found" message.
            extra_field: Key for the third column, or ``None`` for none.
            html_file: File name for the raw page dump.
            listings_file: File name for the parsed listings.

        Returns:
            list: The parsed records. Empty when nothing was found or when
            the page could not be loaded; errors after the browser has
            started are printed rather than raised.
        """
        # Bound before any browser work so the final return is always valid,
        # even when navigation or the selector wait times out.
        listings = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle',
                                timeout=self.NAVIGATION_TIMEOUT_MS)
                print(wait_message)
                if wait_selector is not None:
                    await page.wait_for_selector(wait_selector,
                                                 timeout=self.SELECTOR_TIMEOUT_MS)
                await asyncio.sleep(self.SETTLE_SECONDS)  # Extra wait for dynamic content

                rows = await page.query_selector_all(row_selector)
                if rows:
                    print(f"✅ Found {len(rows)} {found_noun}")
                    listings.extend(await self._parse_rows(
                        rows, cell_selector, min_cells, exchange, country, extra_field))

                # Keep the rendered HTML so selectors can be checked by hand.
                html_content = await page.content()
                with open(f"{self.output_dir}/{html_file}", 'w', encoding='utf-8') as f:
                    f.write(html_content)

                if listings:
                    output_file = self._write_json(listings_file, listings)
                    print(f"✅ Saved {len(listings)} {exchange} listings to {output_file}")
                else:
                    print(f"⚠️ No listings found. Check {html_file} for debugging")
            except Exception as e:
                print(f"❌ Error extracting {exchange}: {e}")
            finally:
                await browser.close()
        return listings

    async def extract_tsx_tsxv_listings(self):
        """Extract TSX and TSXV listings"""
        print("🔍 Extracting TSX/TSXV listings...")
        return await self._scrape_directory(
            exchange='TSX/TSXV',
            country='Canada',
            url='https://www.tsx.com/listings/listing-with-us/listed-company-directory',
            wait_message="⏳ Waiting for table to load...",
            wait_selector='table, .company-list, [class*="listing"]',
            row_selector='tbody tr',
            cell_selector='td',
            min_cells=3,
            found_noun='rows in table',
            extra_field='sector',
            html_file='tsx_page.html',
            listings_file='tsx_tsxv_listings.json',
        )

    async def extract_cse_listings(self):
        """Extract CSE listings"""
        print("\n🔍 Extracting CSE listings...")
        return await self._scrape_directory(
            exchange='CSE',
            country='Canada',
            url='https://thecse.com/en/listings',
            wait_message="⏳ Waiting for listings to load...",
            wait_selector='table, [class*="listing"], [class*="company"]',
            row_selector='tbody tr, [role="row"]',
            cell_selector='td, [role="cell"]',
            min_cells=2,
            found_noun='rows',
            extra_field='industry',
            html_file='cse_page.html',
            listings_file='cse_listings.json',
        )

    async def extract_cboe_listings(self):
        """Extract CBOE listings"""
        print("\n🔍 Extracting CBOE listings...")
        return await self._scrape_directory(
            exchange='CBOE',
            country='USA',
            url='https://www.cboe.com/us/equities/listings/',
            wait_message="⏳ Waiting for listings to load...",
            wait_selector=None,  # this page is only given the settle pause
            row_selector='tbody tr, [class*="listing"]',
            cell_selector='td',
            min_cells=2,
            found_noun='potential listings',
            extra_field=None,
            html_file='cboe_page.html',
            listings_file='cboe_listings.json',
        )

    async def extract_all(self):
        """Extract from all exchanges

        Runs the three extractors one after another, concatenates their
        records in TSX/TSXV, CSE, CBOE order and, when anything was found,
        writes them to ``all_listings_combined.json``.

        Returns:
            list: Every record extracted in this run (possibly empty).
        """
        print("=" * 60)
        print("STOCK LISTING EXTRACTION")
        print("=" * 60)

        all_listings = {}
        all_listings['tsx_tsxv'] = await self.extract_tsx_tsxv_listings()
        all_listings['cse'] = await self.extract_cse_listings()
        all_listings['cboe'] = await self.extract_cboe_listings()

        combined = []
        for listings in all_listings.values():
            combined.extend(listings)

        if combined:
            output_file = self._write_json("all_listings_combined.json", combined)
            print("\n" + "=" * 60)
            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
            print(f"📁 Saved to: {output_file}")
            print("=" * 60)
        else:
            print("\n⚠️ No listings extracted. Check the HTML files for debugging.")
        return combined
async def main():
    """Run the extraction for every exchange and print a short preview.

    Prints up to five of the extracted records; prints nothing further
    when the extraction returned no records.
    """
    combined = await StockListingExtractor().extract_all()
    if not combined:
        return

    print(f"\n📊 Sample of extracted stocks:")
    preview = combined[:5]
    for entry in preview:
        print(f" - {entry['symbol']}: {entry['name']} ({entry['exchange']})")


# Only start the event loop when the file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())