# microcap_scrapping/extract_listings.py

"""
Extract stock listings from TSX, TSXV, CSE, and CBOE
Uses Playwright to handle JavaScript-rendered content
"""

import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
import time

class StockListingExtractor:
    """Scrape exchange listing pages with Playwright and save them as JSON.

    Each ``extract_*`` coroutine launches its own Chromium browser, loads one
    exchange's listing page, reads the symbol and name (plus a third
    sector/industry column where the method asks for one) from the rendered
    table rows, and writes both a snapshot of the page HTML and the parsed
    listings into ``output_dir``.

    Failures while loading or parsing a page are printed, not raised: the
    affected ``extract_*`` call returns whatever it collected so far (possibly
    an empty list), so one unreachable exchange does not abort the others.

    Args:
        output_dir: Directory for the ``*_page.html`` snapshots and the
            ``*_listings.json`` files. Created if missing.
        headless: Forwarded to ``chromium.launch``. Defaults to ``False``
            (visible browser window), matching the scraper's previous
            hard-coded behaviour.
    """

    # Timing knobs shared by every exchange scrape.
    NAVIGATION_TIMEOUT_MS = 90000  # budget for page.goto()
    SELECTOR_TIMEOUT_MS = 45000    # budget for page.wait_for_selector()
    SETTLE_SECONDS = 8             # extra pause for dynamically rendered content

    def __init__(self, output_dir="data/listings", headless=False):
        self.output_dir = output_dir
        self.headless = headless
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _clean_symbol(text):
        """Strip a ticker cell; drop line breaks and turn tabs into spaces."""
        return text.strip().replace('\n', '').replace('\r', '').replace('\t', ' ')

    @staticmethod
    def _clean_text(text):
        """Strip a free-text cell and replace line breaks with spaces."""
        return text.strip().replace('\n', ' ').replace('\r', ' ')

    @classmethod
    def _build_listing(cls, cell_texts, *, extra_field, exchange, country, extracted_at):
        """Turn the raw text of one table row into a listing record.

        Args:
            cell_texts: Cell strings in column order; needs at least two
                entries (symbol, name).
            extra_field: Key under which the third column is stored
                (``'sector'`` / ``'industry'``), or ``None`` to omit it. When
                set but the row has only two cells, the value is ``""``.
            exchange: Value stored under ``'exchange'``.
            country: Value stored under ``'country'``.
            extracted_at: Timestamp string stored under ``'extracted_at'``.

        Returns:
            dict with keys in the order symbol, name, [extra_field], exchange,
            country, extracted_at.
        """
        record = {
            'symbol': cls._clean_symbol(cell_texts[0]),
            'name': cls._clean_text(cell_texts[1]),
        }
        if extra_field:
            third = cell_texts[2] if len(cell_texts) > 2 else ""
            record[extra_field] = cls._clean_text(third)
        record['exchange'] = exchange
        record['country'] = country
        record['extracted_at'] = extracted_at
        return record

    def _path(self, filename):
        """Return the location of ``filename`` inside the output directory."""
        return f"{self.output_dir}/{filename}"

    async def _scrape(self, *, exchange, country, url, banner, waiting_message,
                      wait_selector, row_selector, cell_selector, min_cells,
                      extra_field, found_template, html_name, json_name):
        """Load one listing page, parse its rows and persist the results.

        Args:
            exchange: Label stored in each record and used in status messages.
            country: Value stored in each record.
            url: Listing page to open.
            banner: Message printed before the browser is started.
            waiting_message: Message printed once navigation has finished.
            wait_selector: CSS selector to wait for before parsing, or
                ``None`` to skip the wait.
            row_selector: CSS selector matching candidate rows.
            cell_selector: CSS selector matching the cells inside a row.
            min_cells: Rows with fewer matching cells are ignored.
            extra_field: Record key for the third column, or ``None``.
            found_template: ``str.format`` template taking ``count``; printed
                when at least one row matched.
            html_name: File name for the page HTML snapshot.
            json_name: File name for the parsed listings.

        Returns:
            list[dict]: Parsed records; empty if the page could not be loaded
            or nothing matched.
        """
        print(banner)

        # Bound before anything can fail, so the final ``return`` is valid even
        # when navigation itself raises (this used to be an UnboundLocalError).
        listings = []
        wanted_cells = 3 if extra_field else 2

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle',
                                timeout=self.NAVIGATION_TIMEOUT_MS)

                print(waiting_message)
                if wait_selector:
                    await page.wait_for_selector(wait_selector,
                                                 timeout=self.SELECTOR_TIMEOUT_MS)
                await asyncio.sleep(self.SETTLE_SECONDS)  # Extra wait for dynamic content

                rows = await page.query_selector_all(row_selector)
                if rows:
                    print(found_template.format(count=len(rows)))

                for row in rows:
                    try:
                        cells = await row.query_selector_all(cell_selector)
                        if len(cells) < min_cells:
                            continue
                        texts = [await cell.inner_text() for cell in cells[:wanted_cells]]
                    except Exception as e:
                        # One bad row must not discard the rest, but say so.
                        print(f"Error parsing row: {e}")
                        continue

                    listings.append(self._build_listing(
                        texts,
                        extra_field=extra_field,
                        exchange=exchange,
                        country=country,
                        extracted_at=datetime.now().isoformat(),
                    ))

                # Save HTML for manual inspection if needed
                html_content = await page.content()
                with open(self._path(html_name), 'w', encoding='utf-8') as f:
                    f.write(html_content)

                # Save listings
                if listings:
                    output_file = self._path(json_name)
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(listings, f, indent=2)
                    print(f"✅ Saved {len(listings)} {exchange} listings to {output_file}")
                else:
                    print(f"⚠️  No listings found. Check {html_name} for debugging")

            except Exception as e:
                print(f"❌ Error extracting {exchange}: {e}")
            finally:
                await browser.close()

        return listings

    async def extract_tsx_tsxv_listings(self):
        """Extract TSX and TSXV listings"""
        return await self._scrape(
            exchange='TSX/TSXV',
            country='Canada',
            url='https://www.tsx.com/listings/listing-with-us/listed-company-directory',
            banner="🔍 Extracting TSX/TSXV listings...",
            waiting_message="⏳ Waiting for table to load...",
            wait_selector='table, .company-list, [class*="listing"]',
            row_selector='tbody tr',
            cell_selector='td',
            min_cells=3,
            extra_field='sector',
            found_template="✅ Found {count} rows in table",
            html_name="tsx_page.html",
            json_name="tsx_tsxv_listings.json",
        )

    async def extract_cse_listings(self):
        """Extract CSE listings"""
        return await self._scrape(
            exchange='CSE',
            country='Canada',
            url='https://thecse.com/en/listings',
            banner="\n🔍 Extracting CSE listings...",
            waiting_message="⏳ Waiting for listings to load...",
            wait_selector='table, [class*="listing"], [class*="company"]',
            row_selector='tbody tr, [role="row"]',
            cell_selector='td, [role="cell"]',
            min_cells=2,
            extra_field='industry',
            found_template="✅ Found {count} rows",
            html_name="cse_page.html",
            json_name="cse_listings.json",
        )

    async def extract_cboe_listings(self):
        """Extract CBOE listings"""
        return await self._scrape(
            exchange='CBOE',
            country='USA',
            url='https://www.cboe.com/us/equities/listings/',
            banner="\n🔍 Extracting CBOE listings...",
            waiting_message="⏳ Waiting for listings to load...",
            wait_selector=None,  # this page is parsed after the settle pause only
            row_selector='tbody tr, [class*="listing"]',
            cell_selector='td',
            min_cells=2,
            extra_field=None,
            found_template="✅ Found {count} potential listings",
            html_name="cboe_page.html",
            json_name="cboe_listings.json",
        )

    async def extract_all(self):
        """Extract from all exchanges and write one combined JSON file.

        The exchanges are scraped one after another (TSX/TSXV, CSE, CBOE).

        Returns:
            list[dict]: All records concatenated in that order; empty when no
            exchange produced any listing (in which case no combined file is
            written).
        """
        print("=" * 60)
        print("STOCK LISTING EXTRACTION")
        print("=" * 60)

        all_listings = {}

        # Extract from each exchange
        all_listings['tsx_tsxv'] = await self.extract_tsx_tsxv_listings()
        all_listings['cse'] = await self.extract_cse_listings()
        all_listings['cboe'] = await self.extract_cboe_listings()

        # Combine all listings
        combined = []
        for listings in all_listings.values():
            combined.extend(listings)

        # Save combined file
        if combined:
            output_file = self._path("all_listings_combined.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(combined, f, indent=2)

            print("\n" + "=" * 60)
            print(f"✅ TOTAL EXTRACTED: {len(combined)} stocks")
            print(f"📁 Saved to: {output_file}")
            print("=" * 60)
        else:
            print("\n⚠️  No listings extracted. Check the HTML files for debugging.")

        return combined


async def main():
    """Run the full extraction and print up to five of the resulting stocks."""
    stocks = await StockListingExtractor().extract_all()

    # Nothing extracted: extract_all() has already reported it.
    if not stocks:
        return

    print("\n📊 Sample of extracted stocks:")
    for entry in stocks[:5]:
        symbol = entry['symbol']
        name = entry['name']
        exchange = entry['exchange']
        print(f"  - {symbol}: {name} ({exchange})")


# Script entry point: when executed directly (not imported), drive the async
# main() coroutine to completion with asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())