"""
Scrape SEDAR+ filings for Canadian companies.

Gets annual reports, AGM circulars, financial statements, and tax disclosures.
"""
|
||
|
|
|
||
|
|
import asyncio
import json
import os
import re
import time
from datetime import datetime
from typing import Dict, List, Any
from urllib.parse import urljoin

from playwright.async_api import async_playwright

from config import SEDAR_BASE_URL, SEDAR_SEARCH_URL, FILING_TYPES_SEDAR
|
||
|
|
|
||
|
|
|
||
|
|
class SEDARPlusScraper:
    """Playwright-based scraper for SEDAR+ company filings.

    The scraper searches SEDAR+ for a ticker, collects candidate filing
    links, opens the most relevant documents and pulls AGM, tax and share
    ownership mentions out of their text with regular expressions. Results
    are written as JSON (plus a debug HTML dump of the search page) under
    ``output_dir``.
    """

    # All patterns below are applied to LOWER-CASED page text, so they are
    # written in lower case and the extracted values are lower case too.
    _AGM_DATE_PATTERNS = (
        re.compile(r'annual\s+general\s+meeting.*?(\d{1,2}\s+\w+\s+\d{4})'),
        re.compile(r'agm.*?(\d{1,2}\s+\w+\s+\d{4})'),
        re.compile(r'meeting\s+date.*?(\d{1,2}\s+\w+\s+\d{4})'),
    )
    _AGM_LOCATION_PATTERNS = (
        re.compile(r'meeting\s+location:?\s*([^\n]{10,100})'),
        re.compile(r'to\s+be\s+held\s+at\s+([^\n]{10,100})'),
        re.compile(r'location:?\s*([^\n]{10,100})'),
    )
    _TAX_KEYWORDS = (
        'income tax', 'tax expense', 'effective tax rate', 'deferred tax',
        'tax loss carryforward', 'tax jurisdiction',
    )
    _OWNERSHIP_PATTERNS = (
        re.compile(r'(insider|director|officer|founder).*?(\d{1,3}(?:,\d{3})*)\s*shares'),
        re.compile(r'beneficially\s+own.*?(\d{1,3}(?:,\d{3})*)\s*shares'),
        re.compile(r'voting\s+shares.*?(\d{1,3}(?:,\d{3})*)'),
    )
    # A filing whose title contains any of these is scraped in detail.
    _PRIORITY_KEYWORDS = ('annual', 'circular', 'information', 'financial statement', 'md&a')

    def __init__(self, output_dir="data/sedar_filings", search_headless=False):
        """Create the scraper and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the debug HTML and JSON files.
            search_headless: Whether the browser used by ``search_company``
                runs headless. Defaults to ``False`` (visible browser), which
                is what the search step has always used for debugging; pass
                ``True`` on machines without a display.
        """
        self.output_dir = output_dir
        self.search_headless = search_headless
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _resolve_link(base_url, href):
        """Return ``href`` as an absolute URL, or ``None`` if it is missing/empty.

        ``urljoin`` leaves absolute URLs untouched and resolves relative ones
        against ``base_url`` without producing missing or doubled slashes.
        """
        if not href:
            return None
        return urljoin(base_url, href.strip())

    @classmethod
    def _extract_document_fields(cls, text: str) -> Dict[str, Any]:
        """Pull AGM, tax and ownership mentions out of a document's text.

        Matching is done on the lower-cased text, so every extracted string
        is lower case. Only keys for which something was found are returned:
        ``agm_date``, ``agm_location``, ``tax_information`` (at most 20
        entries) and ``ownership_mentions`` (at most 30 entries).
        """
        lowered = text.lower()
        fields: Dict[str, Any] = {}

        # First pattern that matches wins for the single-valued fields.
        for pattern in cls._AGM_DATE_PATTERNS:
            match = pattern.search(lowered)
            if match:
                fields['agm_date'] = match.group(1)
                break

        for pattern in cls._AGM_LOCATION_PATTERNS:
            match = pattern.search(lowered)
            if match:
                fields['agm_location'] = match.group(1).strip()
                break

        tax_sections = []
        for keyword in cls._TAX_KEYWORDS:
            # Keyword followed (on the same line) by the nearest number.
            pattern = re.compile(rf'{re.escape(keyword)}.*?(\d+(?:,\d{{3}})*(?:\.\d+)?)')
            for match in pattern.finditer(lowered):
                tax_sections.append({
                    'keyword': keyword,
                    'context': match.group(0),
                    'amount': match.group(1),
                })
        if tax_sections:
            fields['tax_information'] = tax_sections[:20]  # Limit results

        ownership_data = []
        for pattern in cls._OWNERSHIP_PATTERNS:
            for match in pattern.finditer(lowered):
                ownership_data.append({
                    'context': match.group(0)[:200],
                    # The share count is always the last capture group.
                    'shares': match.group(match.lastindex),
                })
        if ownership_data:
            fields['ownership_mentions'] = ownership_data[:30]

        return fields

    async def search_company(self, company_name: str, ticker: str) -> List[Dict]:
        """Search SEDAR+ for ``ticker`` and return candidate filing links.

        The rendered page is also saved to
        ``<output_dir>/<ticker>_sedar_search.html`` for debugging.

        Args:
            company_name: Company name; used only in progress output.
            ticker: Text typed into the SEDAR+ search box.

        Returns:
            Up to 50 dicts with ``title``, ``url`` and ``found_at`` keys, or
            an empty list if anything goes wrong while driving the browser.
        """
        print(f"\n🔍 Searching SEDAR+ for {company_name} ({ticker})...")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.search_headless)
            try:
                page = await browser.new_page()

                # Navigate to SEDAR+ search
                await page.goto(SEDAR_BASE_URL, wait_until='networkidle', timeout=60000)
                await asyncio.sleep(3)

                # Note: SEDAR+ structure may vary, adjust selectors as needed
                search_input = await page.query_selector(
                    'input[type="search"], input[placeholder*="search"], input[name*="search"]'
                )
                if search_input:
                    await search_input.fill(ticker)
                    await search_input.press('Enter')
                    await asyncio.sleep(5)
                else:
                    # Without a search box the links below come from the
                    # unfiltered landing page; make that visible in the log.
                    print("⚠️ No search input found; parsing the page as loaded")

                # Save HTML for debugging
                content = await page.content()
                debug_file = f"{self.output_dir}/{ticker}_sedar_search.html"
                with open(debug_file, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f" Saved search results to {debug_file}")

                # Try to extract filing links
                filings = []
                links = await page.query_selector_all('a[href*="document"], a[href*="filing"]')
                for link in links[:50]:  # Get first 50 results
                    try:
                        href = await link.get_attribute('href')
                        text = await link.inner_text()
                    except Exception:
                        # Best effort: a single unreadable element (e.g. one
                        # that left the DOM) must not abort the whole search.
                        continue

                    # Resolve against the page we are actually on so that
                    # relative links work regardless of slashes in the base.
                    url = self._resolve_link(page.url, href)
                    if url is None:
                        continue

                    filings.append({
                        'title': text.strip(),
                        'url': url,
                        'found_at': datetime.now().isoformat(),
                    })

                print(f"✅ Found {len(filings)} potential filings")
                return filings

            except Exception as e:
                print(f"❌ Error searching SEDAR+: {e}")
                return []
            finally:
                await browser.close()

    async def get_filing_document(self, url: str) -> Dict[str, Any]:
        """Open a SEDAR+ document page and extract structured data from it.

        Args:
            url: Absolute URL of the document page.

        Returns:
            A dict with ``url``, ``scraped_at``, ``text_content`` and
            ``html_content`` (each content field capped at 100000
            characters) plus whichever of ``agm_date``, ``agm_location``,
            ``tax_information`` and ``ownership_mentions`` were found.
            On failure, ``{'url': url, 'error': <message>}``.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                content = await page.content()
                text = await page.inner_text('body')

                filing_data = {
                    'url': url,
                    'scraped_at': datetime.now().isoformat(),
                    'text_content': text[:100000],  # Limit size
                    'html_content': content[:100000],
                }
                # Extraction runs on the full text, not the truncated copy.
                filing_data.update(self._extract_document_fields(text))
                return filing_data

            except Exception as e:
                print(f"Error scraping document {url}: {e}")
                return {'url': url, 'error': str(e)}
            finally:
                await browser.close()

    async def get_complete_company_data(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Collect, aggregate and save all SEDAR+ data for one company.

        Searches for the company, scrapes up to five filings whose titles
        contain a priority keyword, aggregates what was extracted and writes
        the result to ``<output_dir>/<ticker>_sedar_data.json``.

        Returns:
            A dict with ``ticker``, ``company_name``, ``scraped_at``,
            ``filings``, ``agm_info``, ``tax_disclosures`` and
            ``ownership_data``.
        """
        print(f"\n{'='*70}")
        print(f"SCRAPING SEDAR+ FOR: {ticker} - {company_name}")
        print(f"{'='*70}")

        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'filings': [],
            'agm_info': {},
            'tax_disclosures': {},
            'ownership_data': [],
        }

        # Search for company
        filings = await self.search_company(company_name, ticker)
        data['filings'] = filings

        # Get details from key documents
        priority_filings = [
            filing for filing in filings
            if any(keyword in filing['title'].lower() for keyword in self._PRIORITY_KEYWORDS)
        ]

        # Scrape top priority documents
        for filing in priority_filings[:5]:  # Limit to top 5
            print(f" Scraping: {filing['title'][:60]}...")
            filing['detailed_data'] = await self.get_filing_document(filing['url'])
            await asyncio.sleep(3)  # Rate limiting

        # Aggregate what the detailed scrapes found.
        for filing in filings:
            detail = filing.get('detailed_data')
            if not detail:
                continue

            # First match in search-result order wins; the results are not
            # sorted here, so this is not guaranteed to be the latest AGM.
            if 'agm_date' in detail:
                data['agm_info'].setdefault('date', detail['agm_date'])
            if 'agm_location' in detail:
                data['agm_info'].setdefault('location', detail['agm_location'])

            # NOTE(review): these two keys were declared but never filled.
            # Keying tax disclosures by filing URL is an assumption about the
            # intended shape - confirm against consumers of the JSON.
            if 'tax_information' in detail:
                data['tax_disclosures'][filing['url']] = detail['tax_information']
            data['ownership_data'].extend(detail.get('ownership_mentions', []))

        # Save to file
        output_file = f"{self.output_dir}/{ticker}_sedar_data.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"✅ Saved SEDAR+ data to {output_file}")

        return data

    async def scrape_multiple_companies(self, stock_list: List[Dict]):
        """Scrape SEDAR+ data for several companies, one after another.

        Args:
            stock_list: Dicts with a ``symbol`` and (optionally) a ``name``
                key. Entries without a symbol are skipped, because the
                symbol is both the search term and the output file name.

        Returns:
            A list with one ``get_complete_company_data`` result per
            scraped company.
        """
        print("=" * 70)
        print("SEDAR+ SCRAPER")
        print("=" * 70)

        all_data = []
        last_index = len(stock_list) - 1

        for index, stock in enumerate(stock_list):
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                print(f"⚠️ Skipping entry without a symbol: {stock!r}")
                continue

            data = await self.get_complete_company_data(ticker, company_name)
            all_data.append(data)

            # Respectful rate limiting; nothing to wait for after the last one.
            if index < last_index:
                await asyncio.sleep(5)

        print(f"\n✅ Completed scraping {len(all_data)} companies")
        return all_data
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Run the SEDAR+ scraper against a single sample company.

    Serves as a manual smoke test: it builds a scraper with the default
    output directory and scrapes one hard-coded Canadian ticker.
    """
    scraper = SEDARPlusScraper()

    # Test with a sample Canadian company
    test_stocks = [
        {'symbol': 'SHOP', 'name': 'Shopify Inc.'},
    ]

    await scraper.scrape_multiple_companies(test_stocks)
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: run the sample scrape only when executed directly,
# never on import.
if __name__ == "__main__":
    asyncio.run(main())
|