// apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts

// crawler.test.ts
import { WebCrawler } from '../crawler';
import axios from 'axios';
import robotsParser from 'robots-parser';

jest.mock('axios');
jest.mock('robots-parser');

// Unit tests for WebCrawler. `axios` and `robots-parser` are module-mocked by
// the jest.mock() calls above this block; the casts below only expose the Jest
// mock API on those imports with proper typing.
describe('WebCrawler', () => {
  let crawler: WebCrawler;
  const mockAxios = axios as jest.Mocked<typeof axios>;
  const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;

  beforeEach(() => {
    // Default HTTP stub: the canned payload is selected from the requested URL.
    mockAxios.get.mockImplementation((url) => {
      if (url.includes('robots.txt')) {
        return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
      } else if (url.includes('sitemap.xml')) {
        return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs
      }
      return Promise.resolve({ data: '<html></html>' });
    });

    // Default robots stub: every URL is reported as allowed, with no crawl
    // delay and no sitemaps advertised.
    mockRobotsParser.mockReturnValue({
      isAllowed: jest.fn().mockReturnValue(true),
      isDisallowed: jest.fn().mockReturnValue(false),
      getMatchingLineNumber: jest.fn().mockReturnValue(0),
      getCrawlDelay: jest.fn().mockReturnValue(0),
      getSitemaps: jest.fn().mockReturnValue([]),
      getPreferredHost: jest.fn().mockReturnValue('example.com')
    });
  });

  // The callback is synchronous: nothing in the body is awaited.
  it('should respect the limit parameter by not returning more links than specified', () => {
    const initialUrl = 'http://example.com';
    const limit = 2;  // Maximum number of links the crawler may return
    const maxDepth = 10;  // Shared by the constructor options and the filterLinks call

    // Four candidates, i.e. more than `limit`, so truncation is observable.
    const candidateLinks = [
      initialUrl,
      initialUrl + '/page1',
      initialUrl + '/page2',
      initialUrl + '/page3'
    ];

    crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
      limit: limit,  // Apply the limit
      maxCrawledDepth: maxDepth
    });

    // Mock sitemap fetching function to return more links than the limit.
    // A copy is handed over so the stub never shares an array with filterLinks.
    // NOTE(review): filterLinks is invoked directly below, so this stub may not
    // be exercised by the assertions — confirm against the crawler implementation.
    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([...candidateLinks]);

    const filteredLinks = crawler['filterLinks'](candidateLinks, limit, maxDepth);

    expect(filteredLinks.length).toBe(limit);  // Check if the number of results respects the limit
    expect(filteredLinks).toEqual([
      initialUrl,
      initialUrl + '/page1'
    ]);
  });
});
Add unit tests to replace e2e 2024-06-15 16:43:37 -04:00			`// crawler.test.ts`
			`import { WebCrawler } from '../crawler';`
			`import axios from 'axios';`
			`import robotsParser from 'robots-parser';`

			`jest.mock('axios');`
			`jest.mock('robots-parser');`

transcribed from e2e to unit tests for many cases 2024-06-17 17:09:44 -03:00			`describe('WebCrawler', () => {`
Add unit tests to replace e2e 2024-06-15 16:43:37 -04:00			`let crawler: WebCrawler;`
			`const mockAxios = axios as jest.Mocked<typeof axios>;`
			`const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;`

			`let maxCrawledDepth: number;`

			`beforeEach(() => {`
			`// Setup default mocks`
			`mockAxios.get.mockImplementation((url) => {`
			`if (url.includes('robots.txt')) {`
			`return Promise.resolve({ data: 'User-agent: *\nAllow: /' });`
			`} else if (url.includes('sitemap.xml')) {`
			`return Promise.resolve({ data: 'sitemap content' }); // You would normally parse this to URLs`
			`}`
			`return Promise.resolve({ data: '<html></html>' });`
			`});`

			`mockRobotsParser.mockReturnValue({`
			`isAllowed: jest.fn().mockReturnValue(true),`
			`isDisallowed: jest.fn().mockReturnValue(false),`
			`getMatchingLineNumber: jest.fn().mockReturnValue(0),`
			`getCrawlDelay: jest.fn().mockReturnValue(0),`
			`getSitemaps: jest.fn().mockReturnValue([]),`
			`getPreferredHost: jest.fn().mockReturnValue('example.com')`
			`});`
			`});`

Added crawl limit unit test 2024-06-26 09:54:25 -03:00			`it('should respect the limit parameter by not returning more links than specified', async () => {`
			`const initialUrl = 'http://example.com';`
			`const limit = 2; // Set a limit for the number of links`

			`crawler = new WebCrawler({`
feat: scrape event logging to DB 2024-07-24 14:31:25 +02:00			`jobId: "TEST",`
Added crawl limit unit test 2024-06-26 09:54:25 -03:00			`initialUrl: initialUrl,`
			`includes: [],`
			`excludes: [],`
			`limit: limit, // Apply the limit`
			`maxCrawledDepth: 10`
			`});`

			`// Mock sitemap fetching function to return more links than the limit`
			`crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([`
			`initialUrl,`
			`initialUrl + '/page1',`
			`initialUrl + '/page2',`
			`initialUrl + '/page3'`
			`]);`

			`const filteredLinks = crawler['filterLinks'](`
			`[initialUrl, initialUrl + '/page1', initialUrl + '/page2', initialUrl + '/page3'],`
			`limit,`
			`10`
			`);`

			`expect(filteredLinks.length).toBe(limit); // Check if the number of results respects the limit`
			`expect(filteredLinks).toEqual([`
			`initialUrl,`
			`initialUrl + '/page1'`
			`]);`
			`});`
Add unit tests to replace e2e 2024-06-15 16:43:37 -04:00			`});`