# microcap_scrapping/main.py

"""
Main orchestrator script for the Stock Intelligence System
Runs all steps in sequence
"""

import asyncio
import os
import json
from datetime import datetime
import sys

# Import our modules
from extract_listings import StockListingExtractor
from database import StockDatabase
from scrape_yahoo_finance import YahooFinanceScraper
from scrape_news_pr import NewsPressScraper


class StockIntelligenceOrchestrator:
    """Run the stock-intelligence pipeline: extract, import, scrape, report.

    The five ``stepN_*`` methods can be called individually, or in order via
    ``run_full_pipeline``. Counters and error messages collected along the way
    are kept in ``self.stats`` and printed by ``print_final_stats``.
    """

    # Locations read and written by the pipeline, relative to the working
    # directory of the process.
    LISTINGS_FILE = "data/listings/all_listings_combined.json"
    FINANCIALS_DIR = "data/financials"
    NEWS_DIR = "data/news"
    REPORTS_DIR = "data/reports"

    # Stock caps applied by run_full_pipeline() when test_mode is on.
    TEST_MAX_FINANCIALS = 5
    TEST_MAX_NEWS = 3

    # Maximum number of news articles / press releases listed in one report.
    MAX_ITEMS_PER_SECTION = 10

    # Positions of the fields read from each row of db.get_all_stocks().
    _SYMBOL_COL = 1
    _NAME_COL = 2
    _EXCHANGE_COL = 3

    # Width of the "=" / "-" rules used in console banners and reports.
    _RULE_WIDTH = 70

    def __init__(self, db=None):
        """Create the orchestrator and start the run timer.

        Args:
            db: Optional database object to use instead of a fresh
                ``StockDatabase()``. It must provide the methods this class
                calls: ``import_listings_from_json``, ``get_all_stocks``,
                ``update_coverage``, ``get_coverage_report`` and ``close``.
        """
        self.db = db if db is not None else StockDatabase()
        self.stats = {
            'start_time': datetime.now(),
            'stocks_extracted': 0,
            'financials_scraped': 0,
            'news_scraped': 0,
            'errors': []
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @classmethod
    def _print_banner(cls, title):
        """Print a step title framed by two full-width rules."""
        rule = "=" * cls._RULE_WIDTH
        print("\n" + rule)
        print(title)
        print(rule)

    @classmethod
    def _rows_to_stock_list(cls, rows):
        """Convert database stock rows into the dicts the scrapers take."""
        return [
            {
                'symbol': row[cls._SYMBOL_COL],
                'name': row[cls._NAME_COL],
                'exchange': row[cls._EXCHANGE_COL],
            }
            for row in rows
        ]

    def _record_financial_results(self, results):
        """Store the outcome of each financial scrape; return the success count.

        Successful results are flagged in the database coverage table. Failed
        results (those with a truthy ``'error'`` entry) are appended to
        ``self.stats['errors']`` so they show up in the final statistics
        instead of being dropped silently.
        """
        succeeded = 0
        for result in results:
            error = result.get('error')
            if error:
                # A failed result is not guaranteed to carry a ticker.
                label = result.get('ticker', 'unknown')
                self.stats['errors'].append(f"{label}: {error}")
                continue

            succeeded += 1
            self.db.update_coverage(
                result['ticker'],
                has_financials=True,
                has_ttm=True
            )
        return succeeded

    def _financial_lines(self, ticker):
        """Return the report lines for the financial section, if data exists."""
        path = f"{self.FINANCIALS_DIR}/{ticker}_yahoo.json"
        if not os.path.exists(path):
            return []

        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        lines = ["[FINANCIAL DATA]", "-" * self._RULE_WIDTH]
        for key, heading in (('profile', "\nProfile:"),
                             ('statistics', "\nKey Statistics:")):
            section = data.get(key)
            if section:
                lines.append(heading)
                lines.extend(f"  {name}: {value}" for name, value in section.items())
        lines.append("")
        return lines

    @staticmethod
    def _item_lines(item):
        """Return the title/source/date/URL lines shared by news and PR items."""
        return [
            f"\nTitle: {item.get('title', 'N/A')}",
            f"Source: {item.get('source', 'N/A')}",
            f"Date: {item.get('date', 'N/A')}",
            f"URL: {item.get('url', 'N/A')}",
        ]

    def _news_lines(self, ticker):
        """Return the report lines for news and press releases, if data exists."""
        path = f"{self.NEWS_DIR}/{ticker}_news_pr.json"
        if not os.path.exists(path):
            return []

        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        lines = []
        articles = data.get('news_articles')
        if articles:
            lines.append("[NEWS ARTICLES - Last 12 Months]")
            lines.append("-" * self._RULE_WIDTH)
            for article in articles[:self.MAX_ITEMS_PER_SECTION]:
                lines.extend(self._item_lines(article))
                if article.get('snippet'):
                    lines.append(f"Snippet: {article['snippet']}")
            lines.append("")

        releases = data.get('press_releases')
        if releases:
            lines.append("[PRESS RELEASES]")
            lines.append("-" * self._RULE_WIDTH)
            for release in releases[:self.MAX_ITEMS_PER_SECTION]:
                lines.extend(self._item_lines(release))
            lines.append("")
        return lines

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    async def step1_extract_listings(self):
        """Step 1: Extract stock listings from exchanges.

        Returns:
            Whatever ``StockListingExtractor.extract_all()`` returns; its
            length is stored in ``self.stats['stocks_extracted']``.
        """
        self._print_banner("STEP 1: EXTRACTING STOCK LISTINGS FROM EXCHANGES")

        listings = await StockListingExtractor().extract_all()
        self.stats['stocks_extracted'] = len(listings)
        return listings

    def step2_import_to_database(self):
        """Step 2: Import the combined listings JSON file into the database.

        Returns:
            The value returned by the database import, or 0 when the listings
            file does not exist.
        """
        self._print_banner("STEP 2: IMPORTING LISTINGS TO DATABASE")

        listings_file = self.LISTINGS_FILE
        if not os.path.exists(listings_file):
            print(f"❌ Listings file not found: {listings_file}")
            return 0

        imported = self.db.import_listings_from_json(listings_file)
        print(f"✅ Imported {imported} stocks to database")
        return imported

    async def step3_scrape_financials(self, max_stocks=None):
        """Step 3: Scrape financial data from Yahoo Finance.

        Args:
            max_stocks: When truthy, only the first ``max_stocks`` database
                rows are scraped; the value is also forwarded to the scraper.

        Returns:
            The list of per-stock result dicts produced by the scraper.
        """
        self._print_banner("STEP 3: SCRAPING FINANCIAL DATA")

        stocks = self.db.get_all_stocks()
        print(f"📊 Found {len(stocks)} stocks in database")

        if max_stocks:
            stocks = stocks[:max_stocks]
            print(f"⚠️  Limiting to {max_stocks} stocks for testing")

        scraper = YahooFinanceScraper()
        results = await scraper.scrape_multiple_stocks(
            self._rows_to_stock_list(stocks), max_stocks=max_stocks
        )

        # Updates coverage for successes and records failures in the stats.
        self.stats['financials_scraped'] = self._record_financial_results(results)
        return results

    async def step4_scrape_news_pr(self, max_stocks=None):
        """Step 4: Scrape news and press releases.

        Args:
            max_stocks: When truthy, only the first ``max_stocks`` database
                rows are scraped; the value is also forwarded to the scraper.

        Returns:
            The list of per-stock result dicts produced by the scraper.
        """
        self._print_banner("STEP 4: SCRAPING NEWS & PRESS RELEASES")

        stocks = self.db.get_all_stocks()

        if max_stocks:
            stocks = stocks[:max_stocks]
            print(f"⚠️  Limiting to {max_stocks} stocks for testing")

        scraper = NewsPressScraper()
        results = await scraper.scrape_multiple_stocks(
            self._rows_to_stock_list(stocks), max_stocks=max_stocks
        )

        self.stats['news_scraped'] = len(results)

        # A stock is marked as covered only for the item kinds actually found.
        for result in results:
            self.db.update_coverage(
                result['ticker'],
                has_news=len(result.get('news_articles', [])) > 0,
                has_press_releases=len(result.get('press_releases', [])) > 0
            )

        return results

    def step5_generate_reports(self):
        """Step 5: Generate a text report file for every stock in the database.

        A failure for one stock is printed, recorded in
        ``self.stats['errors']`` and does not stop the remaining reports.

        Returns:
            The number of report files written.
        """
        self._print_banner("STEP 5: GENERATING STOCK REPORTS")

        reports_dir = self.REPORTS_DIR
        os.makedirs(reports_dir, exist_ok=True)

        reports_generated = 0
        for stock in self.db.get_all_stocks():
            ticker = stock[self._SYMBOL_COL]
            company_name = stock[self._NAME_COL]
            exchange = stock[self._EXCHANGE_COL]

            try:
                report = self.generate_stock_report(ticker, company_name, exchange)

                # A ticker containing a path separator (e.g. "BRK/A") would
                # otherwise point into a non-existent subdirectory.
                safe_ticker = str(ticker).replace("/", "_").replace("\\", "_")
                report_file = f"{reports_dir}/{safe_ticker}_report.txt"
                with open(report_file, 'w', encoding='utf-8') as f:
                    f.write(report)

                reports_generated += 1

            except Exception as e:
                print(f"❌ Error generating report for {ticker}: {e}")
                self.stats['errors'].append(f"{ticker}: {e}")

        print(f"✅ Generated {reports_generated} stock reports")
        print(f"📁 Reports saved to: {reports_dir}/")

        return reports_generated

    def generate_stock_report(self, ticker, company_name, exchange):
        """Generate a comprehensive text report for a stock.

        The report always has a header and footer. A financial section is
        added when ``<FINANCIALS_DIR>/<ticker>_yahoo.json`` exists, and news /
        press-release sections when ``<NEWS_DIR>/<ticker>_news_pr.json``
        exists and holds entries.

        Args:
            ticker: Stock symbol, used in the header and to locate data files.
            company_name: Company name shown in the header.
            exchange: Exchange name shown in the header.

        Returns:
            The report as a single newline-joined string.

        Raises:
            Whatever ``open`` / ``json.load`` raise for an unreadable or
            malformed data file.
        """
        rule = "=" * self._RULE_WIDTH

        report = [
            rule,
            f"STOCK INTELLIGENCE REPORT: {ticker}",
            rule,
            f"Company: {company_name}",
            f"Exchange: {exchange}",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            rule,
            "",
        ]
        report.extend(self._financial_lines(ticker))
        report.extend(self._news_lines(ticker))
        report.extend([rule, "END OF REPORT", rule])

        return "\n".join(report)

    def print_final_stats(self):
        """Print run duration, counters, up to ten errors and DB coverage."""
        duration = datetime.now() - self.stats['start_time']
        errors = self.stats['errors']

        self._print_banner("FINAL STATISTICS")
        print(f"Duration: {duration}")
        print(f"Stocks extracted: {self.stats['stocks_extracted']}")
        print(f"Financials scraped: {self.stats['financials_scraped']}")
        print(f"News scraped: {self.stats['news_scraped']}")
        print(f"Errors: {len(errors)}")

        if errors:
            print("\nError summary:")
            for error in errors[:10]:
                print(f"  - {error}")

        coverage = self.db.get_coverage_report()
        print("\nCoverage Report:")
        print(f"  Total stocks tracked: {len(coverage)}")

        # A row counts as fully covered when columns 2-5 are all truthy.
        # NOTE(review): presumably these are the four coverage flags —
        # confirm against StockDatabase.get_coverage_report().
        complete = sum(1 for row in coverage if all(row[i] for i in range(2, 6)))
        print(f"  Fully covered stocks: {complete}")

        print("=" * self._RULE_WIDTH)

    async def run_full_pipeline(self, test_mode=True):
        """Run steps 1-5 in order, then print the final statistics.

        The run stops early when step 1 yields no listings. Any exception
        raised by a step is caught, printed with its traceback and not
        re-raised. The database is closed in every case.

        Args:
            test_mode: When true, steps 3 and 4 are capped at
                ``TEST_MAX_FINANCIALS`` and ``TEST_MAX_NEWS`` stocks.
        """
        rule = "=" * self._RULE_WIDTH
        print("\n" + rule)
        print("STOCK INTELLIGENCE AUTOMATION SYSTEM")
        print("Starting full pipeline...")
        if test_mode:
            print("⚠️  RUNNING IN TEST MODE (limited stocks)")
        print(rule)

        # None means "no cap", which is also each step's own default.
        financials_cap = self.TEST_MAX_FINANCIALS if test_mode else None
        news_cap = self.TEST_MAX_NEWS if test_mode else None

        try:
            listings = await self.step1_extract_listings()

            if not listings:
                print("\n❌ No listings extracted. Check if websites are accessible.")
                return

            self.step2_import_to_database()
            await self.step3_scrape_financials(max_stocks=financials_cap)
            await self.step4_scrape_news_pr(max_stocks=news_cap)
            self.step5_generate_reports()
            self.print_final_stats()

            print("\n✅ Pipeline completed successfully!")

        except Exception as e:
            # Top-level boundary: report the failure rather than crash.
            print(f"\n❌ Pipeline failed with error: {e}")
            import traceback
            traceback.print_exc()

        finally:
            self.db.close()


async def main():
    """Parse the command line and run the pipeline once.

    The pipeline runs in test mode (limited stocks) unless ``--full`` appears
    among the command-line arguments.
    """
    full_run_requested = "--full" in sys.argv
    test_mode = not full_run_requested

    if test_mode:
        for notice in (
            "\n⚠️  Running in TEST MODE (limited stocks)",
            "   To run full pipeline, use: python main.py --full",
        ):
            print(notice)

    await StockIntelligenceOrchestrator().run_full_pipeline(test_mode=test_mode)


# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())