"""
Scrape SEDAR+ filings for Canadian companies.

Gets annual reports, AGM circulars, financial statements, tax disclosures.
"""

import asyncio
import json
import os
import re
import time
from datetime import datetime
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

from config import SEDAR_BASE_URL, SEDAR_SEARCH_URL, FILING_TYPES_SEDAR

# NOTE(review): several print() messages below start with character sequences
# that look like mis-encoded emoji. They are kept byte-for-byte here; confirm
# the intended characters and the file's encoding before changing them.

# Patterns are compiled once and matched case-insensitively against the
# original text so that captured values keep their original capitalisation.
_AGM_DATE_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'annual\s+general\s+meeting.*?(\d{1,2}\s+\w+\s+\d{4})',
        r'agm.*?(\d{1,2}\s+\w+\s+\d{4})',
        r'meeting\s+date.*?(\d{1,2}\s+\w+\s+\d{4})',
    )
]

_AGM_LOCATION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'meeting\s+location:?\s*([^\n]{10,100})',
        r'to\s+be\s+held\s+at\s+([^\n]{10,100})',
        r'location:?\s*([^\n]{10,100})',
    )
]

_TAX_KEYWORDS = [
    'income tax', 'tax expense', 'effective tax rate',
    'deferred tax', 'tax loss carryforward', 'tax jurisdiction',
]

# One (keyword, pattern) pair per tax keyword; the keyword is escaped so it is
# always matched literally.
_TAX_PATTERNS = [
    (
        keyword,
        re.compile(
            rf'{re.escape(keyword)}.*?(\d+(?:,\d{{3}})*(?:\.\d+)?)',
            re.IGNORECASE,
        ),
    )
    for keyword in _TAX_KEYWORDS
]

# The share count is always the LAST capture group of each pattern.
_OWNERSHIP_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(insider|director|officer|founder).*?(\d{1,3}(?:,\d{3})*)\s*shares',
        r'beneficially\s+own.*?(\d{1,3}(?:,\d{3})*)\s*shares',
        r'voting\s+shares.*?(\d{1,3}(?:,\d{3})*)',
    )
]

_MAX_SEARCH_LINKS = 50
_MAX_TAX_RESULTS = 20
_MAX_OWNERSHIP_RESULTS = 30
_MAX_CONTENT_CHARS = 100000
_MAX_PRIORITY_DOCS = 5


class SEDARPlusScraper:
    """Playwright-based scraper that collects filing links and text from SEDAR+."""

    def __init__(self, output_dir="data/sedar_filings", *, headless_search=False):
        """Create the scraper and make sure the output directory exists.

        Args:
            output_dir: Directory that receives debug HTML and JSON results.
            headless_search: Whether the browser used by ``search_company``
                runs headless. Defaults to ``False`` (a visible browser
                window), which is what the search step has always used.
        """
        self.output_dir = output_dir
        self.headless_search = headless_search
        os.makedirs(output_dir, exist_ok=True)

    async def search_company(self, company_name: str, ticker: str) -> List[Dict]:
        """Search for a company on SEDAR+ and collect candidate filing links.

        The rendered result page is also written to
        ``<output_dir>/<ticker>_sedar_search.html`` for debugging.

        Args:
            company_name: Company name, used only in progress messages.
            ticker: Ticker symbol typed into the site's search box.

        Returns:
            Up to 50 dicts with ``title``, ``url`` (absolute) and ``found_at``
            (ISO timestamp). An empty list is returned if anything fails.
        """
        print(f"\nšŸ” Searching SEDAR+ for {company_name} ({ticker})...")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless_search)

            try:
                page = await browser.new_page()

                # Navigate to SEDAR+ search
                await page.goto(SEDAR_BASE_URL, wait_until='networkidle', timeout=60000)
                await asyncio.sleep(3)

                # Try to find and use the search functionality
                # Note: SEDAR+ structure may vary, adjust selectors as needed
                search_input = await page.query_selector(
                    'input[type="search"], input[placeholder*="search"], input[name*="search"]'
                )

                if search_input:
                    await search_input.fill(ticker)
                    await search_input.press('Enter')
                    await asyncio.sleep(5)
                else:
                    print(" No search input found; parsing links on the landing page instead")

                # Save the rendered HTML for debugging
                content = await page.content()
                debug_file = os.path.join(self.output_dir, f"{ticker}_sedar_search.html")
                with open(debug_file, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f" Saved search results to {debug_file}")

                # Try to extract filing links
                filings = []
                links = await page.query_selector_all('a[href*="document"], a[href*="filing"]')

                for link in links[:_MAX_SEARCH_LINKS]:
                    try:
                        href = await link.get_attribute('href')
                        text = await link.inner_text()
                    except Exception:
                        # Best effort: an element may detach while we read it.
                        # `Exception` (not a bare except) lets task
                        # cancellation and Ctrl-C propagate.
                        continue

                    if not href:
                        continue

                    # Resolve relative links against the page they came from.
                    url = urljoin(page.url, href)
                    if urlparse(url).scheme not in ('http', 'https'):
                        # e.g. "javascript:" pseudo-links cannot be fetched later
                        continue

                    filings.append({
                        'title': text.strip(),
                        'url': url,
                        'found_at': datetime.now().isoformat()
                    })

                print(f"āœ… Found {len(filings)} potential filings")
                return filings

            except Exception as e:
                print(f"āŒ Error searching SEDAR+: {e}")
                return []

            finally:
                await browser.close()

    @staticmethod
    def _extract_filing_fields(text: str) -> Dict[str, Any]:
        """Pull AGM, tax and ownership details out of a document's plain text.

        Args:
            text: Visible text of the filing page.

        Returns:
            A dict containing any of ``agm_date``, ``agm_location``,
            ``tax_information`` and ``ownership_mentions``; keys whose
            patterns did not match are omitted.
        """
        fields: Dict[str, Any] = {}

        # AGM date: the first pattern that matches wins
        for pattern in _AGM_DATE_PATTERNS:
            match = pattern.search(text)
            if match:
                fields['agm_date'] = match.group(1)
                break

        # AGM location: the first pattern that matches wins
        for pattern in _AGM_LOCATION_PATTERNS:
            match = pattern.search(text)
            if match:
                fields['agm_location'] = match.group(1).strip()
                break

        # Tax information: every keyword followed by a number on the same line
        tax_sections = [
            {
                'keyword': keyword,
                'context': match.group(0),
                'amount': match.group(1)
            }
            for keyword, pattern in _TAX_PATTERNS
            for match in pattern.finditer(text)
        ]
        if tax_sections:
            fields['tax_information'] = tax_sections[:_MAX_TAX_RESULTS]

        # Share ownership mentions
        ownership_data = [
            {
                'context': match.group(0)[:200],
                'shares': match.group(pattern.groups)
            }
            for pattern in _OWNERSHIP_PATTERNS
            for match in pattern.finditer(text)
        ]
        if ownership_data:
            fields['ownership_mentions'] = ownership_data[:_MAX_OWNERSHIP_RESULTS]

        return fields

    async def get_filing_document(self, url: str) -> Dict[str, Any]:
        """Download and parse a SEDAR+ document.

        Args:
            url: Absolute URL of the filing page.

        Returns:
            A dict with ``url``, ``scraped_at``, ``text_content`` and
            ``html_content`` (each content field truncated to 100000
            characters) plus whatever ``_extract_filing_fields`` found.
            On failure the dict holds only ``url`` and ``error``.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)

            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                content = await page.content()
                text = await page.inner_text('body')

                filing_data = {
                    'url': url,
                    'scraped_at': datetime.now().isoformat(),
                    'text_content': text[:_MAX_CONTENT_CHARS],  # Limit size
                    'html_content': content[:_MAX_CONTENT_CHARS]
                }
                # Extraction runs on the full text, not the truncated copy.
                filing_data.update(self._extract_filing_fields(text))
                return filing_data

            except Exception as e:
                print(f"Error scraping document {url}: {e}")
                return {'url': url, 'error': str(e)}

            finally:
                await browser.close()

    async def get_complete_company_data(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Get complete SEDAR+ data for a company and save it as JSON.

        Searches for the company, scrapes up to five filings whose titles
        contain a priority keyword, aggregates what was extracted and writes
        the result to ``<output_dir>/<ticker>_sedar_data.json``.

        Args:
            ticker: Ticker symbol; used for the search and in file names.
            company_name: Company name, stored in the result and printed.

        Returns:
            The aggregated data dict that was written to disk.
        """
        print(f"\n{'='*70}")
        print(f"SCRAPING SEDAR+ FOR: {ticker} - {company_name}")
        print(f"{'='*70}")

        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'filings': [],
            'agm_info': {},
            'tax_disclosures': {},
            'ownership_data': []
        }

        # Search for company
        filings = await self.search_company(company_name, ticker)
        data['filings'] = filings

        # Get details from key documents
        priority_keywords = ['annual', 'circular', 'information', 'financial statement', 'md&a']
        priority_filings = [
            filing for filing in filings
            if any(keyword in filing['title'].lower() for keyword in priority_keywords)
        ]

        # Scrape top priority documents
        for filing in priority_filings[:_MAX_PRIORITY_DOCS]:
            print(f" Scraping: {filing['title'][:60]}...")
            doc_data = await self.get_filing_document(filing['url'])
            filing['detailed_data'] = doc_data
            await asyncio.sleep(3)  # Rate limiting

        # Aggregate what the scraped documents yielded
        agm_dates = []
        agm_locations = []

        for filing in data['filings']:
            details = filing.get('detailed_data')
            if not details:
                continue

            if 'agm_date' in details:
                agm_dates.append(details['agm_date'])
            if 'agm_location' in details:
                agm_locations.append(details['agm_location'])

            # Tax hits are grouped by keyword, each tagged with its source.
            for entry in details.get('tax_information', []):
                data['tax_disclosures'].setdefault(entry['keyword'], []).append({
                    'amount': entry['amount'],
                    'context': entry['context'],
                    'source_url': filing['url']
                })

            for mention in details.get('ownership_mentions', []):
                data['ownership_data'].append({**mention, 'source_url': filing['url']})

        if agm_dates:
            # First hit in search-result order; not necessarily the latest AGM
            data['agm_info']['date'] = agm_dates[0]
        if agm_locations:
            data['agm_info']['location'] = agm_locations[0]

        # Save to file
        output_file = os.path.join(self.output_dir, f"{ticker}_sedar_data.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"āœ… Saved SEDAR+ data to {output_file}")
        return data

    async def scrape_multiple_companies(self, stock_list: List[Dict]):
        """Scrape SEDAR+ data for multiple companies, one after another.

        Args:
            stock_list: Dicts with a ``symbol`` key and an optional ``name``
                key. Entries without a symbol are skipped.

        Returns:
            A list with one aggregated data dict per scraped company.
        """
        print("=" * 70)
        print("SEDAR+ SCRAPER")
        print("=" * 70)

        all_data = []

        for stock in stock_list:
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                # Without a symbol there is nothing to search for and no
                # sensible output file name.
                print(f"Skipping entry without a symbol: {stock}")
                continue

            data = await self.get_complete_company_data(ticker, company_name)
            all_data.append(data)

            await asyncio.sleep(5)  # Respectful rate limiting

        print(f"\nāœ… Completed scraping {len(all_data)} companies")
        return all_data


async def main():
    """Test the SEDAR+ scraper"""
    scraper = SEDARPlusScraper()

    # Test with a sample Canadian company
    test_stocks = [
        {'symbol': 'SHOP', 'name': 'Shopify Inc.'},
    ]

    await scraper.scrape_multiple_companies(test_stocks)


if __name__ == "__main__":
    asyncio.run(main())