2024-06-15 16:43:37 -04:00
|
|
|
// crawler.test.ts
|
|
|
|
|
import { WebCrawler } from '../crawler';
|
|
|
|
|
import axios from 'axios';
|
|
|
|
|
import robotsParser from 'robots-parser';
|
|
|
|
|
|
|
|
|
|
// Replace the axios module with a Jest mock; the suite supplies the `get` behaviour itself in `beforeEach`.
jest.mock('axios');
|
|
|
|
|
// Replace robots-parser with a Jest mock so the suite can return a stubbed parser object from it.
jest.mock('robots-parser');
|
|
|
|
|
|
2024-06-17 17:09:44 -03:00
|
|
|
/**
 * Unit tests for `WebCrawler`.
 *
 * `axios` and `robots-parser` are module-mocked at the top of this file; the
 * `beforeEach` hook below installs permissive defaults for both so that each
 * test only has to describe the behaviour it actually cares about.
 */
describe('WebCrawler', () => {
  // Typed views over the module mocks so `.mockImplementation` / `.mockReturnValue` type-check.
  const mockAxios = axios as jest.Mocked<typeof axios>;
  const mockRobotsParser = robotsParser as jest.MockedFunction<typeof robotsParser>;

  beforeEach(() => {
    // Reset recorded calls/results from earlier tests; the implementations are
    // re-installed just below, so only stale call history is discarded.
    jest.clearAllMocks();

    // Default HTTP mock: the canned response is chosen from the requested URL.
    mockAxios.get.mockImplementation((url) => {
      if (url.includes('robots.txt')) {
        // robots.txt that allows everything.
        return Promise.resolve({ data: 'User-agent: *\nAllow: /' });
      } else if (url.includes('sitemap.xml')) {
        // Placeholder body; a real sitemap would be parsed into URLs.
        return Promise.resolve({ data: 'sitemap content' });
      }
      // Any other URL resolves to an empty HTML document.
      return Promise.resolve({ data: '<html></html>' });
    });

    // Default robots parser stub: every URL is allowed, no crawl delay, no sitemaps.
    mockRobotsParser.mockReturnValue({
      isAllowed: jest.fn().mockReturnValue(true),
      isDisallowed: jest.fn().mockReturnValue(false),
      getMatchingLineNumber: jest.fn().mockReturnValue(0),
      getCrawlDelay: jest.fn().mockReturnValue(0),
      getSitemaps: jest.fn().mockReturnValue([]),
      getPreferredHost: jest.fn().mockReturnValue('example.com')
    });
  });

  it('should respect the limit parameter by not returning more links than specified', () => {
    const initialUrl = 'http://example.com';
    const limit = 2; // Maximum number of links the crawler may return
    const maxDepth = 10; // Used for both the crawler option and the filterLinks call

    // Four candidate links, i.e. more than `limit`, defined once and reused below.
    const candidateLinks = [
      initialUrl,
      initialUrl + '/page1',
      initialUrl + '/page2',
      initialUrl + '/page3'
    ];

    const crawler = new WebCrawler({
      jobId: "TEST",
      initialUrl: initialUrl,
      includes: [],
      excludes: [],
      limit: limit, // Apply the limit
      maxCrawledDepth: maxDepth
    });

    // Stub sitemap fetching so it would yield more links than the limit.
    // NOTE(review): `filterLinks` is invoked directly below, so this stub does not
    // appear to be exercised by this test — confirm whether it is still needed.
    crawler['tryFetchSitemapLinks'] = jest.fn().mockResolvedValue([...candidateLinks]);

    const filteredLinks = crawler['filterLinks'](candidateLinks, limit, maxDepth);

    // The result must be capped at `limit` ...
    expect(filteredLinks).toHaveLength(limit);
    // ... and must keep the first `limit` links in their original order.
    expect(filteredLinks).toEqual([
      initialUrl,
      initialUrl + '/page1'
    ]);
  });
});
|
|
|
|
|
|