Merge pull request #436 from mendableai/mog/fix-infinite-regex
fix(WebScraper): catastrophic regex backtracking leading to fly.io instance hangs
This commit is contained in:
@@ -6,13 +6,13 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
|||||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||||
const paths =
|
const paths =
|
||||||
document.content.match(
|
document.content.match(
|
||||||
/(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g
|
/!?\[.*?\]\(.*?\)|href=".+?"/g
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
paths.forEach((path: string) => {
|
paths.forEach((path: string) => {
|
||||||
try {
|
try {
|
||||||
const isImage = path.startsWith("!");
|
const isImage = path.startsWith("!");
|
||||||
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/);
|
||||||
let url = matchedUrl[1];
|
let url = matchedUrl[1];
|
||||||
|
|
||||||
if (!url.startsWith("data:") && !url.startsWith("http")) {
|
if (!url.startsWith("data:") && !url.startsWith("http")) {
|
||||||
@@ -50,11 +50,11 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
|||||||
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
const baseUrl = new URL(document.metadata.sourceURL).origin;
|
||||||
const images =
|
const images =
|
||||||
document.content.match(
|
document.content.match(
|
||||||
/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
|
/!\[.*?\]\(.*?\)/g
|
||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
images.forEach((image: string) => {
|
images.forEach((image: string) => {
|
||||||
let imageUrl = image.match(/\(([^)]+)\)/)[1];
|
let imageUrl = image.match(/\((.*?)\)/)[1];
|
||||||
let altText = image.match(/\[(.*?)\]/)[1];
|
let altText = image.match(/\[(.*?)\]/)[1];
|
||||||
|
|
||||||
if (!imageUrl.startsWith("data:image")) {
|
if (!imageUrl.startsWith("data:image")) {
|
||||||
|
|||||||
Reference in New Issue
Block a user