Spaces:
Runtime error
Runtime error
Fix Playwright leak and use multiple contexts (#1187)
Browse files
* fix: playwright leak and use multiple contexts
* lint
---------
Co-authored-by: Nathan Sarrazin <sarrazin.nathan@gmail.com>
src/lib/server/websearch/scrape/playwright.ts
CHANGED
|
@@ -1,23 +1,35 @@
|
|
| 1 |
import {
|
| 2 |
-
type BrowserContext,
|
| 3 |
chromium,
|
| 4 |
devices,
|
| 5 |
type Page,
|
| 6 |
type BrowserContextOptions,
|
| 7 |
type Response,
|
|
|
|
| 8 |
} from "playwright";
|
| 9 |
import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
|
| 10 |
import { env } from "$env/dynamic/private";
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
|
|
|
|
|
|
|
| 18 |
const browser = await chromium.launch({ headless: true });
|
| 19 |
-
|
| 20 |
process.on("SIGINT", () => browser.close());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
const device = devices["Desktop Chrome"];
|
| 23 |
const options: BrowserContextOptions = {
|
|
@@ -36,31 +48,26 @@ async function initPlaywrightService() {
|
|
| 36 |
timezoneId: "America/New_York",
|
| 37 |
locale: "en-US",
|
| 38 |
};
|
| 39 |
-
|
| 40 |
-
const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
|
| 41 |
-
const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
|
| 42 |
-
if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
|
| 43 |
-
return mostBlocked;
|
| 44 |
-
});
|
| 45 |
-
|
| 46 |
-
// Clear the singleton when the context closes
|
| 47 |
-
ctx.on("close", () => {
|
| 48 |
-
playwrightService = undefined;
|
| 49 |
-
});
|
| 50 |
-
|
| 51 |
-
return Object.freeze({ ctx, blocker });
|
| 52 |
}
|
| 53 |
|
| 54 |
-
export async function
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
|
|
|
| 1 |
import {
|
|
|
|
| 2 |
chromium,
|
| 3 |
devices,
|
| 4 |
type Page,
|
| 5 |
type BrowserContextOptions,
|
| 6 |
type Response,
|
| 7 |
+
type Browser,
|
| 8 |
} from "playwright";
|
| 9 |
import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
|
| 10 |
import { env } from "$env/dynamic/private";
|
| 11 |
+
import { logger } from "$lib/server/logger";
|
| 12 |
|
| 13 |
+
// Shared ad/tracker blocker, built once at module load from the prebuilt
// ads-and-tracking lists. Fonts, media, frames and images are always blocked;
// scripts are additionally blocked when WEBSEARCH_JAVASCRIPT is "false".
const blocker = await (async () => {
	const prebuilt = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch);
	const restricted = prebuilt.blockFonts().blockMedias().blockFrames().blockImages();
	return env.WEBSEARCH_JAVASCRIPT === "false" ? restricted.blockScripts() : restricted;
})();
|
| 18 |
|
| 19 |
+
let browserSingleton: Promise<Browser> | undefined;
|
| 20 |
+
/**
 * Launches a headless Chromium instance and wires up its lifecycle handlers.
 *
 * - If the launch itself fails, the cached `browserSingleton` promise is
 *   cleared so the next caller retries instead of re-awaiting a rejection.
 * - A SIGINT listener closes the browser; it is removed again once the browser
 *   disconnects so repeated relaunches do not accumulate listeners.
 * - When the browser disconnects, `browserSingleton` is cleared so the next
 *   request launches a fresh browser.
 *
 * @returns the launched browser
 * @throws whatever `chromium.launch` rejects with
 */
async function getBrowser(): Promise<Browser> {
	let browser: Browser;
	try {
		browser = await chromium.launch({ headless: true });
	} catch (err) {
		// Runs after the caller has stored this promise in `browserSingleton`;
		// drop it so a later call can attempt a new launch.
		browserSingleton = undefined;
		throw err;
	}

	const closeOnSigint = () => {
		browser.close().catch((err: unknown) => {
			logger.warn(`Failed to close browser on SIGINT: ${String(err)}`);
		});
	};
	process.on("SIGINT", closeOnSigint);

	browser.on("disconnected", () => {
		logger.warn("Browser closed");
		// This browser is gone: stop listening for SIGINT on its behalf.
		process.off("SIGINT", closeOnSigint);
		browserSingleton = undefined;
	});
	return browser;
}
|
| 29 |
+
|
| 30 |
+
async function getPlaywrightCtx() {
|
| 31 |
+
if (!browserSingleton) browserSingleton = getBrowser();
|
| 32 |
+
const browser = await browserSingleton;
|
| 33 |
|
| 34 |
const device = devices["Desktop Chrome"];
|
| 35 |
const options: BrowserContextOptions = {
|
|
|
|
| 48 |
timezoneId: "America/New_York",
|
| 49 |
locale: "en-US",
|
| 50 |
};
|
| 51 |
+
return browser.newContext(options);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
}
|
| 53 |
|
| 54 |
+
/**
 * Opens `url` in a page of a fresh browser context, hands the page to
 * `callback`, and always closes the context afterwards.
 *
 * Navigation failures (including the load timeout) are logged and not thrown:
 * the callback still runs, with `response` left undefined.
 *
 * @param url - address to navigate to
 * @param callback - receives the page and, if navigation succeeded, its response
 * @returns whatever `callback` resolves to
 * @throws whatever `callback`, context creation or page creation rejects with
 */
export async function withPage<T>(
	url: string,
	callback: (page: Page, response?: Response) => Promise<T>
): Promise<T> {
	const ctx = await getPlaywrightCtx();

	try {
		const page = await ctx.newPage();
		await blocker.enableBlockingInPage(page);

		const timeoutMs = 3500;
		const res = await page.goto(url, { waitUntil: "load", timeout: timeoutMs }).catch(() => {
			// Keep the message in sync with the actual timeout.
			logger.warn(`Failed to load page within ${timeoutMs / 1000}s: ${url}`);
		});

		// await needed here so that we don't close the context before the callback is done
		return await callback(page, res ?? undefined);
	} finally {
		// Await the close so the context is really released, but never let a
		// close failure mask the callback's result or error.
		await ctx.close().catch((err: unknown) => {
			logger.warn(`Failed to close browser context: ${String(err)}`);
		});
	}
}
|
src/lib/server/websearch/scrape/scrape.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
|
| 2 |
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
|
| 3 |
-
import {
|
| 4 |
|
| 5 |
import { spatialParser } from "./parser";
|
| 6 |
import { htmlToMarkdownTree } from "../markdown/tree";
|
|
@@ -30,9 +30,7 @@ export const scrape = (maxCharsPerElem: number) =>
|
|
| 30 |
};
|
| 31 |
|
| 32 |
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
try {
|
| 36 |
if (!res) throw Error("Failed to load page");
|
| 37 |
|
| 38 |
// Check if it's a non-html content type that we can handle directly
|
|
@@ -66,7 +64,5 @@ export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
|
| 66 |
throw Error("Parsing failed", { cause });
|
| 67 |
});
|
| 68 |
return scrapedOutput;
|
| 69 |
-
}
|
| 70 |
-
page.close();
|
| 71 |
-
}
|
| 72 |
}
|
|
|
|
| 1 |
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
|
| 2 |
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
|
| 3 |
+
import { withPage } from "./playwright";
|
| 4 |
|
| 5 |
import { spatialParser } from "./parser";
|
| 6 |
import { htmlToMarkdownTree } from "../markdown/tree";
|
|
|
|
| 30 |
};
|
| 31 |
|
| 32 |
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
| 33 |
+
return withPage(url, async (page, res) => {
|
|
|
|
|
|
|
| 34 |
if (!res) throw Error("Failed to load page");
|
| 35 |
|
| 36 |
// Check if it's a non-html content type that we can handle directly
|
|
|
|
| 64 |
throw Error("Parsing failed", { cause });
|
| 65 |
});
|
| 66 |
return scrapedOutput;
|
| 67 |
+
});
|
|
|
|
|
|
|
| 68 |
}
|